#!/usr/bin/env python3
import gensim.downloader as api
import numpy as np

# Load the pre-trained Word2Vec model (downloads ~1.6 GB on first run)
model = api.load('word2vec-google-news-300')

# Return the L2 norm of a word's embedding (multi-word phrases are averaged)
def get_word_norm(word):
    # Handle multi-word phrases by splitting and averaging the per-word vectors
    subwords = word.split()  # Split on spaces
    vectors = []
    for sub in subwords:
        if sub in model:
            vectors.append(model[sub])
        else:
            print(f"Warning: '{sub}' not in vocabulary.")
    if not vectors:
        raise ValueError(f"No vectors found for '{word}' in model.")
    # Average the vectors for multi-word phrases
    embedding = np.mean(vectors, axis=0)
    l2_norm = np.linalg.norm(embedding)  # L2 (Euclidean) norm
    return l2_norm

# Score category-likeness (higher = more category-like)
def category_score(norm, scale_factor=1.0):
    return scale_factor / norm  # Simple inverse; adjust scale_factor if needed

# Example words/phrases: a mix of general categories and specific instances
words = ['animal', 'cat', 'mammal', 'siamese', 'thing', 'eiffel tower']

# Compute norms and scores
results = {}
for word in words:
    try:
        norm = get_word_norm(word)
        score = category_score(norm)
        results[word] = {'norm': norm, 'score': score}
    except ValueError as e:
        print(e)
        continue

# Print results, sorted by category score (descending)
sorted_results = sorted(results.items(), key=lambda x: x[1]['score'], reverse=True)
for word, data in sorted_results:
    print(f"Word: {word}\tNorm: {data['norm']:.4f}\tCategory Score: {data['score']:.4f}")
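
# Optional extension (a sketch, not part of the pipeline above): the Google News
# vectors store many multi-word entries joined with underscores (e.g. 'Eiffel_Tower'),
# and lookups are case-sensitive. When such a key exists, its dedicated phrase vector
# is typically preferable to an average of per-word vectors. This helper assumes the
# `model` and `get_word_norm` defined above and falls back to averaging whenever the
# underscored form is not in the vocabulary.
def get_phrase_norm(phrase):
    joined = phrase.replace(' ', '_')
    if joined in model:
        return np.linalg.norm(model[joined])  # Use the dedicated phrase vector
    return get_word_norm(phrase)  # Fall back to averaging per-word vectors

# Example: get_phrase_norm('Eiffel Tower') tries the key 'Eiffel_Tower' first,
# then falls back to averaging the vectors for 'Eiffel' and 'Tower'.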