#!/usr/bin/env python3
import gensim.downloader as api
import numpy as np
# Load the pre-trained Word2Vec model (downloads ~1.6 GB on first run)
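# gensim.downloader caches the model under ~/gensim-data, so later runs load the local copy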
model = api.load('word2vec-google-news-300')
# Compute the L2 norm of the (averaged) embedding for a word or phrase
def get_word_norm(word):
    # Handle multi-word phrases by splitting on spaces and averaging the vectors
    subwords = word.split()
    vectors = []
    for sub in subwords:
        if sub in model:
            vectors.append(model[sub])
        else:
            print(f"Warning: '{sub}' not in vocabulary.")
    if not vectors:
        raise ValueError(f"No vectors found for '{word}' in model.")
    # Average the subword vectors; a single word is just its own vector
    embedding = np.mean(vectors, axis=0)
    l2_norm = np.linalg.norm(embedding)  # L2 (Euclidean) norm
    return l2_norm
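# Optional refinement (a sketch, not part of the original approach): the Google News
# model also stores underscore-joined phrases such as 'New_York', so checking the
# joined form before falling back to averaging may handle inputs like 'eiffel tower'
# better. The helper name get_phrase_norm is hypothetical.
def get_phrase_norm(word):
    joined = word.replace(' ', '_')
    if joined in model:
        # The model has a dedicated vector for this exact phrase
        return np.linalg.norm(model[joined])
    # Otherwise fall back to the averaging strategy above
    return get_word_norm(word)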
# Function to score category-likeness (higher = more category-like)
def category_score(norm, scale_factor=1.0):
    return scale_factor / norm  # Simple inverse; adjust scale_factor if needed
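# Possible tweak (a sketch, not the original scoring): word2vec norms stay well above
# zero in practice, but if other inputs could yield tiny norms, a smoothed inverse
# avoids blow-ups. The name category_score_smoothed is hypothetical.
def category_score_smoothed(norm, scale_factor=1.0, eps=1e-8):
    return scale_factor / (norm + eps)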
# Example words/phrases
words = ['animal', 'cat', 'mammal', 'siamese', 'thing', 'eiffel tower'] # From general to specific
# Compute norms and scores
results = {}
for word in words:
    try:
        norm = get_word_norm(word)
        score = category_score(norm)
        results[word] = {'norm': norm, 'score': score}
    except ValueError as e:
        print(e)
        continue
# Print results (sorted by score descending)
sorted_results = sorted(results.items(), key=lambda x: x[1]['score'], reverse=True)
for word, data in sorted_results:
print(f"Word: {word}\tNorm: {data['norm']:.4f}\tCategory Score: {data['score']:.4f}")