|
|
|
""" |
|
Test Geometric Mean Method for Multi-Topic Word Finding |
|
|
|
The geometric mean approach: score = (sim1 × sim2 × ... × simN)^(1/N) |
|
This method penalizes low scores more heavily than arithmetic mean, |
|
potentially finding better intersection words. |
|
""" |
|
|
|
import os |
|
import sys |
|
import numpy as np |
|
from typing import List, Tuple, Dict |
|
import warnings |
|
|
|
|
|
# Suppress every warning for the whole process; this runs at import time,
# before any of the heavy libraries below are loaded.
warnings.filterwarnings("ignore")


def setup_environment():
    """Redirect the model caches to a local directory and import the ML stack.

    The cache directory is ``<directory of this file>/../cache-dir`` (made
    absolute) and is exported through the ``HF_HOME``, ``TRANSFORMERS_CACHE``
    and ``SENTENCE_TRANSFORMERS_HOME`` environment variables.

    Returns:
        A ``(SentenceTransformer, torch)`` tuple: the class imported from
        ``sentence_transformers`` and the ``torch`` module.

    Exits the process with status 1 (after printing an install hint) when
    either package cannot be imported.
    """
    script_dir = os.path.dirname(__file__)
    cache_dir = os.path.abspath(os.path.join(script_dir, '..', 'cache-dir'))
    for env_name in ('HF_HOME', 'TRANSFORMERS_CACHE', 'SENTENCE_TRANSFORMERS_HOME'):
        os.environ[env_name] = cache_dir

    # Imported lazily so a missing dependency produces a friendly message
    # instead of a traceback at module import time.
    try:
        from sentence_transformers import SentenceTransformer
        import torch
    except ImportError as e:
        print(f"❌ Missing dependencies: {e}")
        print("Install with: pip install sentence-transformers torch")
        sys.exit(1)
    else:
        return SentenceTransformer, torch
|
|
|
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)`` as a plain Python float. When either
        vector has zero norm the similarity is undefined; 0.0 is returned
        instead of NaN so that downstream sorting and ``max`` comparisons
        stay well-defined.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # A zero vector has no direction; treat it as unrelated to everything.
        return 0.0
    return float(np.dot(a, b) / norm_product)
|
|
|
def geometric_mean_method(topic_vectors: List[np.ndarray], word_vectors: Dict[str, np.ndarray]) -> List[Tuple[str, float]]:
    """Rank words by the geometric mean of their per-topic cosine similarities.

    Score = (sim_to_topic1 × sim_to_topic2 × ... × sim_to_topicN)^(1/N)

    Each similarity is floored at 0.001 before multiplying, so a zero or
    negative similarity drags the score down without zeroing it or flipping
    its sign.

    Args:
        topic_vectors: One embedding per topic.
        word_vectors: Mapping of candidate word to its embedding.

    Returns:
        ``(word, score)`` pairs ordered from highest to lowest score.
    """
    def _score(word_vec):
        floored = [max(cosine_similarity(word_vec, topic_vec), 0.001) for topic_vec in topic_vectors]
        return np.prod(floored) ** (1 / len(floored))

    ranked = [(word, _score(word_vec)) for word, word_vec in word_vectors.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
|
|
|
def harmonic_mean_method(topic_vectors: List[np.ndarray], word_vectors: Dict[str, np.ndarray]) -> List[Tuple[str, float]]:
    """Rank words by the harmonic mean of their per-topic cosine similarities.

    Score = N / (1/sim1 + 1/sim2 + ... + 1/simN)

    Each similarity is floored at 0.001 first, which keeps every reciprocal
    finite and positive.

    Args:
        topic_vectors: One embedding per topic.
        word_vectors: Mapping of candidate word to its embedding.

    Returns:
        ``(word, score)`` pairs ordered from highest to lowest score.
    """
    ranked = []
    for word, word_vec in word_vectors.items():
        floored = [max(cosine_similarity(word_vec, topic_vec), 0.001) for topic_vec in topic_vectors]
        reciprocal_total = sum(1 / s for s in floored)
        ranked.append((word, len(floored) / reciprocal_total))

    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
|
|
|
def soft_min_method(topic_vectors: List[np.ndarray], word_vectors: Dict[str, np.ndarray], beta: float = 10.0) -> List[Tuple[str, float]]:
    """Rank words by a smooth approximation of their minimum topic similarity.

    Score = -log(sum(exp(-beta * sim_i))) / beta

    The log-sum-exp is evaluated with ``np.logaddexp.reduce`` rather than a
    literal ``log(sum(exp(...)))``. The literal form underflows every
    ``exp`` term to 0 once ``beta * sim`` gets large, giving ``log(0)`` and
    an infinite score for every word; the pairwise form stays finite.

    Args:
        topic_vectors: One embedding per topic.
        word_vectors: Mapping of candidate word to its embedding.
        beta: Sharpness of the approximation; larger values track the true
            minimum more closely.

    Returns:
        ``(word, score)`` pairs ordered from highest to lowest score.
    """
    ranked = []
    for word, word_vec in word_vectors.items():
        exponents = np.array(
            [-beta * cosine_similarity(word_vec, topic_vec) for topic_vec in topic_vectors],
            dtype=float,
        )
        # logaddexp's identity is -inf, so an empty topic list still yields
        # +inf, matching what the literal formula produces.
        score = -np.logaddexp.reduce(exponents) / beta
        ranked.append((word, score))

    return sorted(ranked, key=lambda pair: pair[1], reverse=True)
|
|
|
def simple_averaging(topic_vectors: List[np.ndarray], word_vectors: Dict[str, np.ndarray]) -> List[Tuple[str, float]]:
    """Rank words by similarity to the mean topic vector (current approach).

    Args:
        topic_vectors: One embedding per topic; they are averaged
            element-wise into a single centroid.
        word_vectors: Mapping of candidate word to its embedding.

    Returns:
        ``(word, cosine similarity to the centroid)`` pairs ordered from
        highest to lowest similarity.
    """
    centroid = np.mean(topic_vectors, axis=0)
    ranked = [(word, cosine_similarity(centroid, word_vec)) for word, word_vec in word_vectors.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
|
|
|
def load_sample_words() -> List[str]:
    """Load up to 100 sample words from the art-and-books sample file.

    The file is expected at ``../samples/art-and-books-sample-words.txt``
    relative to this script. Only the section that follows a line starting
    with ``['art', 'books']`` is read; a line starting with
    ``['art and books']`` or ``['words related to art and books']`` ends
    that section. Blank lines and any other line starting with ``[`` are
    skipped without changing the current section.

    Returns:
        The collected words in file order (at most 100), or an empty list
        when the sample file does not exist.
    """
    sample_file = os.path.join(os.path.dirname(__file__), '..', 'samples', 'art-and-books-sample-words.txt')

    words: List[str] = []
    if not os.path.exists(sample_file):
        return words

    in_separated_section = False
    # Explicit encoding: without it the file is decoded with the platform
    # locale, which is not UTF-8 on every system.
    with open(sample_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith("['art', 'books']"):
                in_separated_section = True
            elif line.startswith(("['art and books']", "['words related to art and books']")):
                in_separated_section = False
            elif in_separated_section and line and not line.startswith('['):
                words.append(line)
                if len(words) >= 100:
                    break

    return words
|
|
|
def _print_rank_lines(words, word_rankings, missing_rank):
    """Print, for each scored word, its 1-based rank under every method.

    Args:
        words: Words to report on.
        word_rankings: ``{method name: {word: 0-based rank}}``; must contain
            the ``"Simple Averaging"`` entry, which decides whether a word
            was scored at all.
        missing_rank: 0-based rank to use when a method has no entry for
            the word.
    """
    method_names = list(word_rankings)
    for word in words:
        # Words absent from the sample set are silently skipped.
        if word not in word_rankings["Simple Averaging"]:
            continue
        ranks = [f"{word_rankings[name].get(word, missing_rank) + 1:3d}" for name in method_names]
        print(f" {word:15s}: " + " | ".join(f"{name[:10]:>10s}: {rank}" for name, rank in zip(method_names, ranks)))


def test_multiple_methods(model):
    """Compare all intersection methods on the sample words and print a report.

    Prints the top 15 words per method, the words (among the first 50
    samples) whose rank differs by at least 10 positions between methods,
    and the per-method ranks of a few known problematic and known good words.

    Args:
        model: Object with an ``encode(list_of_str)`` method that returns one
            embedding per input string (``main`` passes a SentenceTransformer).

    Returns:
        None. Returns early, after printing a message, when fewer than 10
        sample words could be loaded.
    """
    print("🔍 Comparing Multiple Intersection Methods")
    print("=" * 70)

    sample_words = load_sample_words()
    print(f"Loaded {len(sample_words)} sample words")

    if len(sample_words) < 10:
        print("❌ Not enough sample words loaded")
        return

    # Topic embeddings, split into a list of per-topic vectors.
    topics = ["Art", "Books"]
    topic_vectors = list(model.encode(topics))

    print("Encoding word embeddings...")
    word_embeddings = model.encode(sample_words)
    word_vectors = dict(zip(sample_words, word_embeddings))

    methods = [
        ("Simple Averaging", simple_averaging),
        ("Geometric Mean", geometric_mean_method),
        ("Harmonic Mean", harmonic_mean_method),
        ("Soft Minimum", lambda tv, wv: soft_min_method(tv, wv, beta=10.0)),
    ]

    all_results = {}
    for method_name, method_func in methods:
        print(f"\n📊 {method_name} - Top 15:")
        results = method_func(topic_vectors, word_vectors)
        all_results[method_name] = results

        for i, (word, score) in enumerate(results[:15], 1):
            print(f" {i:2d}. {word:20s}: {score:.4f}")

    print("\n🔄 Method Comparison Analysis:")

    # 0-based position of every word in each method's sorted output.
    word_rankings = {
        method_name: {word: rank for rank, (word, _) in enumerate(results)}
        for method_name, results in all_results.items()
    }

    # Rank assigned to a word a method did not score: one past the last slot.
    missing_rank = len(sample_words)

    significant_differences = []
    for word in sample_words[:50]:
        rankings = [ranks.get(word, missing_rank) for ranks in word_rankings.values()]
        if max(rankings) - min(rankings) >= 10:
            significant_differences.append((word, rankings))

    if significant_differences:
        print(" Words with significant ranking differences:")
        method_names = list(all_results.keys())
        header = f" {'Word':<20s} " + " ".join(f"{name[:8]:>8s}" for name in method_names)
        print(header)
        print(" " + "-" * len(header))

        for word, rankings in significant_differences[:10]:
            rank_str = " ".join(f"{rank+1:8d}" for rank in rankings)
            print(f" {word:<20s} {rank_str}")
    else:
        print(" No significant ranking differences found")

    problematic_words = ["ethology", "guns", "porn", "calibre"]
    good_words = ["illustration", "literature", "painting", "library", "poetry"]

    print("\n🎯 Analysis of Known Problematic Words:")
    _print_rank_lines(problematic_words, word_rankings, missing_rank)

    print("\n✅ Analysis of Good Intersection Words:")
    _print_rank_lines(good_words, word_rankings, missing_rank)
|
|
|
def test_individual_similarities(model):
    """Print per-topic similarities and the derived scores for a few key words.

    For every test word one table row is printed: its cosine similarity to
    "Art" and to "Books", followed by the geometric mean, the harmonic mean
    (both with similarities floored at 0.001) and the soft minimum
    (beta = 10.0) of those two similarities.

    Args:
        model: Object with an ``encode(list_of_str)`` method that returns one
            embedding per input string (``main`` passes a SentenceTransformer).
    """
    print("\n\n🔬 Individual Topic Similarity Analysis")
    print("=" * 70)

    test_words = ["ethology", "illustration", "literature", "guns", "art", "books", "poetry"]
    topics = ["Art", "Books"]

    topic_embeddings = model.encode(topics)
    word_embeddings = model.encode(test_words)

    print("Individual similarities to each topic:")
    print(f"{'Word':<15s} {'Art':<8s} {'Books':<8s} {'Geo Mean':<10s} {'Harm Mean':<10s} {'Soft Min':<10s}")
    print("-" * 70)

    for word, word_emb in zip(test_words, word_embeddings):
        art_sim = cosine_similarity(word_emb, topic_embeddings[0])
        books_sim = cosine_similarity(word_emb, topic_embeddings[1])

        sims = [art_sim, books_sim]
        count = len(sims)
        # The same floor the ranking functions apply before combining scores.
        floored = [max(s, 0.001) for s in sims]

        geo_mean = np.prod(floored) ** (1 / count)
        harm_mean = count / sum(1 / s for s in floored)
        soft_min = -np.log(sum(np.exp(-10.0 * s) for s in sims)) / 10.0

        print(f"{word:<15s} {art_sim:8.4f} {books_sim:8.4f} {geo_mean:10.4f} {harm_mean:10.4f} {soft_min:10.4f}")
|
|
|
def main():
    """Run the full comparison: load the model, run both reports, print insights."""
    rule = "=" * 70
    model_name = "sentence-transformers/all-mpnet-base-v2"

    print("🧪 Geometric Mean and Multiple Methods Test")
    print("Using production model: sentence-transformers/all-mpnet-base-v2")
    print(rule)

    # Configures the cache directories and imports the heavy dependencies;
    # the torch module is returned as well but is not needed here.
    SentenceTransformer, _torch = setup_environment()

    print(f"Loading model: {model_name}")
    model = SentenceTransformer(model_name)
    print("✅ Model loaded successfully")

    test_multiple_methods(model)
    test_individual_similarities(model)

    print("\n" + rule)
    summary_lines = (
        "🎯 KEY INSIGHTS:",
        "1. Geometric mean penalizes words with low similarity to any topic",
        "2. Harmonic mean is even more aggressive at finding intersections",
        "3. Soft minimum provides smooth approximation to true intersection",
        "4. All methods may show similar results if topics are semantically close",
    )
    for text in summary_lines:
        print(text)
    print(rule)


# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()