#!/usr/bin/env python3
import gensim.downloader as api
import numpy as np

# Load the pre-trained Word2Vec model (downloads ~1.6 GB on first run)
model = api.load('word2vec-google-news-300')
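
# The Google News vocabulary is case-sensitive and stores many multi-word
# phrases joined with underscores (e.g. 'Eiffel_Tower'); a quick membership
# check shows whether a phrase exists before falling back to averaging:
print('Eiffel_Tower' in model)  # membership check via KeyedVectors.__contains__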

# Compute the L2 norm of the embedding for a word or phrase
def get_word_norm(word):
    # Handle multi-word phrases by splitting on spaces and averaging vectors
    subwords = word.split()
    vectors = []
    for sub in subwords:
        if sub in model:
            vectors.append(model[sub])
        else:
            print(f"Warning: '{sub}' not in vocabulary.")
    if not vectors:
        raise ValueError(f"No vectors found for '{word}' in model.")
    # Average the vectors for multi-word phrases
    embedding = np.mean(vectors, axis=0)
    l2_norm = np.linalg.norm(embedding)  # L2 (Euclidean) norm
    return l2_norm
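
# Hypothetical alternative (a sketch, not part of the original script): since
# the Google News model stores phrases like 'New_York' with underscores, try
# the underscore-joined form first and only fall back to subword averaging.
def get_phrase_norm(phrase):
    joined = phrase.replace(' ', '_')
    if joined in model:
        return np.linalg.norm(model[joined])
    return get_word_norm(phrase)  # fall back to averaging the subword vectors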

# Score category-likeness (higher score = more category-like)
def category_score(norm, scale_factor=1.0):
    return scale_factor / norm  # Simple inverse; adjust scale_factor if needed
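
# The score is only used for ranking, so any monotonically decreasing
# transform of the norm works equally well; a log-scale variant (an
# alternative sketch, not part of the original) avoids picking a scale_factor:
def category_score_log(norm):
    return -np.log(norm)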

# Example words/phrases, mixing general category words with specific terms.
# Note: the vocabulary is case-sensitive, so lowercase forms such as
# 'siamese' or 'eiffel' may trigger the out-of-vocabulary warning above.
words = ['animal', 'cat', 'mammal', 'siamese', 'thing', 'eiffel tower']

# Compute norms and scores
results = {}
for word in words:
    try:
        norm = get_word_norm(word)
        score = category_score(norm)
        results[word] = {'norm': norm, 'score': score}
    except ValueError as e:
        print(e)
        continue

# Print results, sorted by score descending (most category-like first)
sorted_results = sorted(results.items(), key=lambda x: x[1]['score'], reverse=True)
for word, data in sorted_results:
    print(f"Word: {word}\tNorm: {data['norm']:.4f}\tCategory Score: {data['score']:.4f}")