#!/usr/bin/env python3

from sentence_transformers import SentenceTransformer
from sentence_transformers.models import Transformer, Pooling
import numpy as np

# Model name
#model_name = 'sentence-transformers/all-mpnet-base-v2'
model_name = 'BAAI/bge-large-en-v1.5'

# Load the transformer component
transformer = Transformer(model_name)

# Get the embedding dimension (1024 for bge-large-en-v1.5)
dim = transformer.get_word_embedding_dimension()

# Create the pooling layer. Pooling defaults to mean pooling, the common
# choice for sentence-transformers models (note: the released BGE checkpoints
# are configured for CLS pooling, so mean pooling is a simplification here)
pooling = Pooling(dim)

# Assemble the model without the Normalize module
model = SentenceTransformer(modules=[transformer, pooling])
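
# Note: omitting the Normalize module is essential here; with it, every
# embedding would have unit length and all L2 norms below would be ~1.0.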

# Return the L2 norm of a word's (unnormalized) embedding
def get_word_norm(word):
    # Optionally embed the word inside a short carrier phrase instead:
    # word = f"The {word}"
    embedding = model.encode(word, normalize_embeddings=False)  # keep raw magnitudes
    l2_norm = np.linalg.norm(embedding)
    return l2_norm
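
# A batched variant (a sketch, not in the original script): model.encode also
# accepts a list of sentences and returns a 2D array, which is usually faster
# than encoding words one at a time.
def get_word_norms(word_list):
    embeddings = model.encode(word_list, normalize_embeddings=False)
    return np.linalg.norm(embeddings, axis=1)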

# Score category-likeness: inverse of the norm, so smaller-norm (hypothesized
# to be more general) words score higher
def category_score(norm, scale_factor=1.0):
    return scale_factor / norm

# Example words, mixing general categories and specific instances
words = ['animal', 'cat', 'mammal', 'siamese', 'thing', 'eiffel tower']

# Compute norms and scores
results = {}
for word in words:
    norm = get_word_norm(word)
    score = category_score(norm)
    results[word] = {'norm': norm, 'score': score}

# Print results (sorted by score descending)
sorted_results = sorted(results.items(), key=lambda x: x[1]['score'], reverse=True)
for word, data in sorted_results:
    print(f"Word: {word}\tNorm: {data['norm']:.4f}\tCategory Score: {data['score']:.4f}")