#!/usr/bin/env python3
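"""Generate topic-related word lists for crossword puzzles.

The script embeds a common-English vocabulary with SentenceTransformers,
indexes it in FAISS, expands a topic through Wikipedia subcategories (or
k-means cluster centers as a fallback), and ranks candidate words with
PageRank over a topic-word graph.
"""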
import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import requests
from sklearn.cluster import KMeans
import networkx as nx


def get_vocab():
    # Dynamically fetch a list of common English words from a public GitHub repository
    url = "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt"
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        return [word.strip().lower() for word in response.text.splitlines() if word.strip()]
    else:
        raise RuntimeError(f"Failed to fetch vocabulary list (HTTP {response.status_code})")


class CrosswordGenerator:
    def __init__(self):
        self.vocab = get_vocab()
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        embeddings = self.model.encode(self.vocab, convert_to_numpy=True)
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        faiss.normalize_L2(embeddings)
        self.dimension = embeddings.shape[1]
        # Use IndexFlatIP for cosine similarity (inner product of L2-normalized vectors)
        self.faiss_index = faiss.IndexFlatIP(self.dimension)
        self.faiss_index.add(embeddings)
        self.max_results = 50  # Adjustable

    def is_subcategory(self, topic, word):
        # Check via the Wikipedia API whether the word has a category whose title mentions the topic
        url = f"https://en.wikipedia.org/w/api.php?action=query&prop=categories&format=json&titles={word.capitalize()}"
        try:
            response = requests.get(url, timeout=10).json()
            pages = response.get('query', {}).get('pages', {})
            if pages:
                cats = list(pages.values())[0].get('categories', [])
                return any(topic.lower() in cat['title'].lower() for cat in cats)
            return False
        except Exception:
            return False

    def generate_words(self, topic, num_words=20):
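        """Return up to ``num_words`` vocabulary words related to ``topic``.

        Steps (matching the numbered comments below):
          1. Embed the topic and retrieve nearby vocabulary words from FAISS.
          2. Identify subcategories via Wikipedia, or fall back to the k-means
             cluster centers of the initial results.
          3. Re-search the index for each subcategory, decaying scores by level.
          4. Aggregate scores in a topic-word graph and rank with PageRank.
        """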
        variations = [topic.lower()]
        # if topic.endswith('s'):
        #     variations.append(topic[:-1])
        # else:
        #     variations.append(topic + 's')
        all_results = {}
        for variation in variations:
            # Step 1: Get topic embedding
            topic_embedding = self.model.encode([variation], convert_to_numpy=True)
            # Add search randomness
            noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))
            if noise_factor > 0:
                noise = np.random.normal(0, noise_factor, topic_embedding.shape)
                topic_embedding += noise
            topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
            faiss.normalize_L2(topic_embedding)
            search_size = min(self.max_results * 3, len(self.vocab))
            scores, indices = self.faiss_index.search(topic_embedding, search_size)
            # Filter initial results with a similarity threshold (e.g., > 0.3 cosine)
            initial_results = []
            for i in range(len(indices[0])):
                idx = indices[0][i]
                score = scores[0][i]
                if score > 0.3:  # Adjustable threshold
                    initial_results.append(self.vocab[idx])
            # Step 2: Identify subcategories from the initial results
            subcats = [w for w in initial_results[:30] if self.is_subcategory(topic, w)]  # Check top 30
            print(f"subcats {subcats}")
            # Fallback: use clustering to discover potential subcategories
            if not subcats and len(initial_results) >= 3:
                result_embeddings = self.model.encode(initial_results, convert_to_numpy=True)
                result_embeddings = np.ascontiguousarray(result_embeddings, dtype=np.float32)
                faiss.normalize_L2(result_embeddings)
                kmeans = KMeans(n_clusters=min(3, len(initial_results)), random_state=42).fit(result_embeddings)
                cluster_centers = kmeans.cluster_centers_.astype(np.float32)
                faiss.normalize_L2(cluster_centers)
                _, subcat_indices = self.faiss_index.search(cluster_centers, 1)
                subcats = [self.vocab[subcat_indices[j][0]] for j in range(len(subcat_indices))]
            # Step 3: Search subcategories for more specific words
            for level, subs in enumerate([subcats], start=1):
                for sub in subs:
                    sub_embedding = self.model.encode([sub], convert_to_numpy=True)
                    sub_embedding = np.ascontiguousarray(sub_embedding, dtype=np.float32)
                    faiss.normalize_L2(sub_embedding)
                    sub_scores, sub_indices = self.faiss_index.search(sub_embedding, search_size)
                    for i in range(len(sub_indices[0])):
                        idx = sub_indices[0][i]
                        score = sub_scores[0][i]
                        if score > 0.3:
                            w = self.vocab[idx]
                            # Weight by level (decay)
                            weighted_score = score * (0.8 ** level)
                            all_results[w] = all_results.get(w, 0) + weighted_score
            # Add the initial (direct topic) results
            for i in range(len(indices[0])):
                idx = indices[0][i]
                score = scores[0][i]
                if score > 0.3:
                    w = self.vocab[idx]
                    all_results[w] = all_results.get(w, 0) + score
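        # Note: with only topic-word edges, the graph below is a star centered on
        # the topic, so the PageRank ordering tracks each word's accumulated score.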
        # Step 4: Combine and weight all results using graph-based aggregation
        G = nx.Graph()
        G.add_node(topic)
        for w, score in all_results.items():
            G.add_edge(topic, w, weight=score)
        pr = nx.pagerank(G, weight='weight')
        # Sort and return the top results (excluding the topic itself)
        sorted_results = sorted(pr.items(), key=lambda x: x[1], reverse=True)
        final_words = [w for w, _ in sorted_results if w != topic][:num_words]
        return final_words


if __name__ == "__main__":
    generator = CrosswordGenerator()
    topics = ["animal", "animal", "science", "technology", "food", "indian food", "chinese food"]  # Example topics
    for topic in topics:
        print(f"------------- {topic} ------------")
        generated_words = generator.generate_words(topic)
        sorted_generated_words = sorted(generated_words)
        print(f"Generated words for topic '{topic}':")
        print(sorted_generated_words)
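
# Usage sketch (the filename is illustrative, not fixed by this script):
#   python generate_crossword_words.py
#   SEARCH_RANDOMNESS=0.05 python generate_crossword_words.py  # noisier, more varied retrieval
#   SEARCH_RANDOMNESS=0 python generate_crossword_words.py     # deterministic query embeddings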