#!/usr/bin/env python3
import gensim.downloader as api
import numpy as np

# Load the pre-trained Word2Vec model (downloads ~1.6 GB on first run)
model = api.load('word2vec-google-news-300')

# Return the L2 norm of a word's embedding (multi-word phrases are averaged)
def get_word_norm(word):
    # Handle multi-word phrases by splitting and averaging the per-word vectors
    subwords = word.split()  # Split on spaces
    vectors = []
    for sub in subwords:
        if sub in model:
            vectors.append(model[sub])
        else:
            print(f"Warning: '{sub}' not in vocabulary.")
    if not vectors:
        raise ValueError(f"No vectors found for '{word}' in model.")
    # Average the vectors for multi-word phrases
    embedding = np.mean(vectors, axis=0)
    l2_norm = np.linalg.norm(embedding)  # L2 (Euclidean) norm
    return l2_norm

# Score category-likeness (higher = more category-like)
def category_score(norm, scale_factor=1.0):
    return scale_factor / norm  # Simple inverse; adjust scale_factor if needed

# Example words/phrases: a mix of general categories and specific instances
words = ['animal', 'cat', 'mammal', 'siamese', 'thing', 'eiffel tower']

# Compute norms and scores
results = {}
for word in words:
    try:
        norm = get_word_norm(word)
        score = category_score(norm)
        results[word] = {'norm': norm, 'score': score}
    except ValueError as e:
        print(e)
        continue

# Print results, sorted by category score (descending)
sorted_results = sorted(results.items(), key=lambda x: x[1]['score'], reverse=True)
for word, data in sorted_results:
    print(f"Word: {word}\tNorm: {data['norm']:.4f}\tCategory Score: {data['score']:.4f}")
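
# Optional extension (a sketch, not part of the pipeline above): the Google News
# vectors store many multi-word entries joined with underscores (e.g. 'Eiffel_Tower'),
# and lookups are case-sensitive. When such a key exists, its dedicated phrase vector
# is typically preferable to an average of per-word vectors. This helper assumes the
# `model` and `get_word_norm` defined above and falls back to averaging whenever the
# underscored form is not in the vocabulary.
def get_phrase_norm(phrase):
    joined = phrase.replace(' ', '_')
    if joined in model:
        return np.linalg.norm(model[joined])  # Use the dedicated phrase vector
    return get_word_norm(phrase)  # Fall back to averaging per-word vectors

# Example: get_phrase_norm('Eiffel Tower') tries the key 'Eiffel_Tower' first,
# then falls back to averaging the vectors for 'Eiffel' and 'Tower'.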