#!/usr/bin/env python3
"""Rank words by how "category-like" they are, using raw embedding norms.

Each word is embedded with a sentence-transformers pipeline assembled
without a Normalize module, the L2 norm of the un-normalized vector is
taken, and the inverse of that norm is reported as a category score
(higher score = more category-like).
"""

import functools

import numpy as np

# Checkpoint used to build the embedding model.
# Alternative checkpoint: 'sentence-transformers/all-mpnet-base-v2'
model_name = 'BAAI/bge-large-en-v1.5'

# Example words, from general to specific.
words = ['animal', 'cat', 'mammal', 'siamese', 'thing', 'eiffel tower']


@functools.lru_cache(maxsize=1)
def _load_model():
    """Build the transformer + pooling pipeline once and reuse it.

    The sentence-transformers imports live here so that importing this
    module (e.g. to reuse ``category_score``) does not require the package
    or trigger a model load.

    Returns:
        A ``SentenceTransformer`` made of exactly two modules, the
        transformer and a pooling layer; no Normalize module is appended.
    """
    from sentence_transformers import SentenceTransformer
    from sentence_transformers.models import Pooling, Transformer

    transformer = Transformer(model_name)
    # Read the embedding width from the loaded transformer instead of
    # hard-coding it, so switching ``model_name`` needs no other change.
    dim = transformer.get_word_embedding_dimension()
    # Pooling is constructed with its default settings, which this script
    # relies on to be mean pooling.
    # NOTE(review): confirm the default against the installed
    # sentence-transformers version.
    pooling = Pooling(dim)
    return SentenceTransformer(modules=[transformer, pooling])


def get_word_norm(word: str):
    """Return the L2 norm of the un-normalized embedding of *word*.

    The word is encoded bare; wrapping it in a carrier phrase such as
    ``f"The {word}"`` is an alternative that is left disabled.

    Args:
        word: Text to embed (a single word or a short phrase).

    Returns:
        The value of ``np.linalg.norm`` applied to the embedding.
    """
    # normalize_embeddings=False keeps the raw magnitude, which is the
    # quantity being measured.
    embedding = _load_model().encode(word, normalize_embeddings=False)
    return np.linalg.norm(embedding)


def category_score(norm: float, scale_factor: float = 1.0) -> float:
    """Score category-likeness as the inverse of an embedding norm.

    Args:
        norm: L2 norm of an embedding.
        scale_factor: Numerator of the inverse; defaults to 1.0.

    Returns:
        ``scale_factor / norm``. A zero norm yields ``inf`` rather than
        raising ``ZeroDivisionError`` (plain floats) or emitting a NumPy
        divide warning (NumPy scalars).
    """
    if norm == 0:
        return float("inf")
    return scale_factor / norm


def main() -> None:
    """Score the example words and print them, highest score first."""
    results = {}
    for word in words:
        norm = get_word_norm(word)
        results[word] = {'norm': norm, 'score': category_score(norm)}

    sorted_results = sorted(
        results.items(),
        key=lambda item: item[1]['score'],
        reverse=True,
    )
    for word, data in sorted_results:
        print(f"Word: {word}\tNorm: {data['norm']:.4f}\tCategory Score: {data['score']:.4f}")


if __name__ == "__main__":
    main()