#!/usr/bin/env python3
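"""Generate topic-related word lists for crossword construction.

Pipeline: embed a 10k-word English vocabulary with SentenceTransformers,
index it in FAISS for cosine-similarity search, expand the topic via
Wikipedia-category lookups (with a k-means fallback) to find subcategories,
then rank candidate words with PageRank over a topic-word graph.
"""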
import os

import faiss
import networkx as nx
import numpy as np
import requests
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

def get_vocab():
    """Fetch a list of common English words from a public GitHub repository."""
    url = "https://raw.githubusercontent.com/first20hours/google-10000-english/master/google-10000-english-no-swears.txt"
    response = requests.get(url, timeout=10)
    if response.status_code == 200:
        return [word.strip().lower() for word in response.text.splitlines() if word.strip()]
    raise RuntimeError("Failed to fetch vocabulary list")

class CrosswordGenerator:
    def __init__(self):
        self.vocab = get_vocab()
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        embeddings = self.model.encode(self.vocab, convert_to_numpy=True)
        embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
        faiss.normalize_L2(embeddings)
        self.dimension = embeddings.shape[1]
        # Inner product over L2-normalized vectors equals cosine similarity,
        # so IndexFlatIP gives exact cosine search.
        self.faiss_index = faiss.IndexFlatIP(self.dimension)
        self.faiss_index.add(embeddings)
        self.max_results = 50  # Adjustable

    def is_subcategory(self, topic, word):
        """Check via the Wikipedia API whether `word` sits in a category mentioning `topic`."""
        url = f"https://en.wikipedia.org/w/api.php?action=query&prop=categories&format=json&titles={word.capitalize()}"
        try:
            response = requests.get(url, timeout=10).json()
            pages = response.get('query', {}).get('pages', {})
            if pages:
                cats = list(pages.values())[0].get('categories', [])
                return any(topic.lower() in cat['title'].lower() for cat in cats)
            return False
        except Exception:
            return False

    def generate_words(self, topic, num_words=20):
        """Return up to `num_words` vocabulary words related to `topic`."""
        variations = [topic.lower()]
        # if topic.endswith('s'):
        #     variations.append(topic[:-1])
        # else:
        #     variations.append(topic + 's')
        all_results = {}
        for variation in variations:
            # Step 1: get the topic embedding
            topic_embedding = self.model.encode([variation], convert_to_numpy=True)
            # Add search randomness so repeated runs can surface different words
            noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))
            if noise_factor > 0:
                noise = np.random.normal(0, noise_factor, topic_embedding.shape)
                topic_embedding += noise
            topic_embedding = np.ascontiguousarray(topic_embedding, dtype=np.float32)
            faiss.normalize_L2(topic_embedding)
            search_size = min(self.max_results * 3, len(self.vocab))
            scores, indices = self.faiss_index.search(topic_embedding, search_size)
            # Filter initial results with a cosine-similarity threshold
            initial_results = []
            for i in range(len(indices[0])):
                idx = indices[0][i]
                score = scores[0][i]
                if score > 0.3:  # Adjustable threshold
                    initial_results.append(self.vocab[idx])
            # Step 2: identify subcategories among the top initial results
            subcats = [w for w in initial_results[:30] if self.is_subcategory(topic, w)]  # Check top 30
            print(f"subcats {subcats}")
            # Fallback: use k-means clustering to discover potential subcategories
            if not subcats and len(initial_results) >= 3:
                result_embeddings = self.model.encode(initial_results, convert_to_numpy=True)
                result_embeddings = np.ascontiguousarray(result_embeddings, dtype=np.float32)
                faiss.normalize_L2(result_embeddings)
                kmeans = KMeans(n_clusters=min(3, len(initial_results)), n_init=10, random_state=42).fit(result_embeddings)
                cluster_centers = np.ascontiguousarray(kmeans.cluster_centers_, dtype=np.float32)
                faiss.normalize_L2(cluster_centers)
                # Treat the vocabulary word nearest each cluster centre as a subcategory label
                _, subcat_indices = self.faiss_index.search(cluster_centers, 1)
                subcats = [self.vocab[subcat_indices[j][0]] for j in range(len(subcat_indices))]
            # Step 3: search each subcategory for more specific words
            for level, subs in enumerate([subcats], start=1):
                for sub in subs:
                    sub_embedding = self.model.encode([sub], convert_to_numpy=True)
                    sub_embedding = np.ascontiguousarray(sub_embedding, dtype=np.float32)
                    faiss.normalize_L2(sub_embedding)
                    sub_scores, sub_indices = self.faiss_index.search(sub_embedding, search_size)
                    for i in range(len(sub_indices[0])):
                        idx = sub_indices[0][i]
                        score = sub_scores[0][i]
                        if score > 0.3:
                            w = self.vocab[idx]
                            # Decay scores by subcategory depth (level)
                            weighted_score = score * (0.8 ** level)
                            all_results[w] = all_results.get(w, 0) + weighted_score
            # Add the direct topic matches as well
            for i in range(len(indices[0])):
                idx = indices[0][i]
                score = scores[0][i]
                if score > 0.3:
                    w = self.vocab[idx]
                    all_results[w] = all_results.get(w, 0) + score
        # Step 4: combine and weight all results using graph-based aggregation.
        # The graph is a star centred on the topic, so PageRank ranks words
        # monotonically by their accumulated edge weight.
        G = nx.Graph()
        G.add_node(topic)
        for w, score in all_results.items():
            G.add_edge(topic, w, weight=score)
        pr = nx.pagerank(G, weight='weight')
        # Sort and return the top results (excluding the topic itself)
        sorted_results = sorted(pr.items(), key=lambda x: x[1], reverse=True)
        final_words = [w for w, _ in sorted_results if w != topic][:num_words]
        return final_words

if __name__ == "__main__":
    generator = CrosswordGenerator()
    # Example topics ("animal" is listed twice, which exercises the randomized search)
    topics = ["animal", "animal", "science", "technology", "food", "indian food", "chinese food"]
    for topic in topics:
        print(f"------------- {topic} ------------")
        generated_words = generator.generate_words(topic)
        sorted_generated_words = sorted(generated_words)
        print(f"Generated words for topic '{topic}':")
        print(sorted_generated_words)
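
    # Usage note: the query noise is controlled by the SEARCH_RANDOMNESS
    # environment variable read in generate_words, e.g.
    #
    #   SEARCH_RANDOMNESS=0.05 python cw-gen.py   # more exploration
    #   SEARCH_RANDOMNESS=0 python cw-gen.py      # disable the noise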