# Source: Hugging Face Space vimalk78/abc123 (status: Running) - file size 10,285 bytes, commit 486eff6

#!/usr/bin/env python3

import os
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import requests
from sklearn.cluster import KMeans
import networkx as nx
import csv

def get_vocab(url="https://raw.githubusercontent.com/dwyl/english-words/master/words.txt", timeout=10):
    """Download a plain-text English word list and return the usable words.

    Args:
        url: Location of a newline-separated word list. Defaults to the
            dwyl/english-words list on GitHub.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        Lower-cased, whitespace-stripped words longer than two characters.

    Raises:
        RuntimeError: If the server answers with anything other than HTTP 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError("Failed to fetch vocabulary list")

    # Measure the *stripped* word: padding must not let a short word through.
    cleaned = (line.strip().lower() for line in response.text.splitlines())
    return [word for word in cleaned if len(word) > 2]

class CrosswordGenerator2:
    """Suggest topic-related words for a crossword.

    Candidates come from two sources: a FAISS inner-product index built over
    sentence-transformer embeddings of the local dictionary, and Wikipedia
    category listings fetched through the MediaWiki API.
    """

    # MediaWiki endpoint used for every Wikipedia lookup.
    WIKI_API_URL = "https://en.wikipedia.org/w/api.php"
    # Seconds to wait for Wikipedia; without a timeout a stalled request never returns.
    REQUEST_TIMEOUT = 10
    # Maximum number of entries requested from the categorymembers/categories lists.
    WIKI_LIMIT = 50
    # Similarity floor (inner product of unit vectors) for a dictionary word to count.
    MIN_SIMILARITY = 0.3
    # Score credited to a page title listed directly under a Wikipedia subcategory.
    WIKI_PAGE_SCORE = 0.8
    # Multiplier for matches found through a subcategory rather than the topic itself.
    SUBCAT_WEIGHT = 0.8
    # How many top direct matches are probed on Wikipedia when no subcategory was found.
    SUBCAT_PROBE_LIMIT = 30
    # Number of clusters used by the KMeans fallback.
    FALLBACK_CLUSTERS = 3

    def __init__(self, cache_dir='./model_cache'):
        """Load the dictionary, embed every word and build the search index.

        Args:
            cache_dir: Folder handed to SentenceTransformer as ``cache_folder``.

        Raises:
            ValueError: If the dictionary yields no usable words.
            Exception: Propagated from ``get_dict_vocab`` when the CSV is
                missing or unreadable.
        """
        self.vocab = self.get_dict_vocab()
        if not self.vocab:
            # An empty vocabulary cannot be embedded or indexed; fail with a clear message.
            raise ValueError("Dictionary contains no usable words")
        self.model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', cache_folder=cache_dir)
        embeddings = self._embed(self.vocab)
        self.dimension = embeddings.shape[1]
        self.faiss_index = faiss.IndexFlatIP(self.dimension)
        self.faiss_index.add(embeddings)
        self.max_results = 50  # Adjustable

    def get_dict_vocab(self):
        """Read the dictionary CSV file and return list of words.

        The file is read from ``dict-words/dict.csv`` next to this module and
        must provide a ``word`` column. Words are stripped and lower-cased;
        entries of two characters or fewer are dropped, as are rows too short
        to contain a ``word`` value.

        Raises:
            Exception: If the file is missing or cannot be parsed.
        """
        dict_path = os.path.join(os.path.dirname(__file__), 'dict-words', 'dict.csv')
        words = []
        try:
            # newline='' lets the csv module do its own line-ending handling.
            with open(dict_path, 'r', encoding='utf-8', newline='') as csvfile:
                for row in csv.DictReader(csvfile):
                    # DictReader fills missing trailing fields with None.
                    word = (row['word'] or '').strip().lower()
                    if len(word) > 2:  # Filter short words
                        words.append(word)
        except FileNotFoundError as e:
            raise Exception(f"Dictionary file not found: {dict_path}") from e
        except Exception as e:
            raise Exception(f"Error reading dictionary file: {e}") from e
        return words

    def _embed(self, texts, noise_scale=0.0):
        """Encode texts into unit-length, contiguous float32 rows for the FAISS index."""
        vectors = self.model.encode(texts, convert_to_numpy=True)
        if noise_scale > 0:
            # Gaussian jitter is added before normalisation, so results vary between runs.
            vectors = vectors + np.random.normal(0, noise_scale, vectors.shape)
        vectors = np.ascontiguousarray(vectors, dtype=np.float32)
        faiss.normalize_L2(vectors)
        return vectors

    def _search(self, vector, k):
        """Return (word, score) pairs above MIN_SIMILARITY for a single query row."""
        scores, indices = self.faiss_index.search(vector, k)
        return [
            (self.vocab[idx], float(score))
            for idx, score in zip(indices[0], scores[0])
            # FAISS labels missing neighbours -1; never let that index the vocab from the end.
            if idx >= 0 and score > self.MIN_SIMILARITY
        ]

    def _wiki_query(self, **params):
        """Run one MediaWiki ``action=query`` call and return its ``query`` object."""
        params.update(action='query', format='json')
        # params= makes requests URL-encode the values (topics may contain '&', '#', ...).
        response = requests.get(self.WIKI_API_URL, params=params, timeout=self.REQUEST_TIMEOUT)
        return response.json().get('query', {})

    def _category_members(self, category, member_type):
        """List the members of ``Category:<category>`` of the given type ('subcat' or 'page')."""
        title = category.capitalize().replace(' ', '_')
        query = self._wiki_query(
            list='categorymembers',
            cmtitle=f"Category:{title}",
            cmtype=member_type,
            cmlimit=self.WIKI_LIMIT,
        )
        return query.get('categorymembers', [])

    def _page_category_titles(self, title, limit=None):
        """Return the raw category titles of the first page returned for ``title``."""
        params = {'prop': 'categories', 'titles': title}
        if limit is not None:
            params['cllimit'] = limit
        pages = self._wiki_query(**params).get('pages', {})
        if not pages:
            return []
        first_page = next(iter(pages.values()))
        return [cat['title'] for cat in first_page.get('categories', [])]

    def _related_category(self, topic):
        """Find a category of the top search hit for ``topic`` that mentions a topic term."""
        hits = self._wiki_query(list='search', srsearch=topic).get('search', [])
        if not hits:
            return None
        terms = topic.lower().split()
        for raw_title in self._page_category_titles(hits[0]['title'], limit=self.WIKI_LIMIT):
            name = raw_title.replace('Category:', '').lower()
            if any(term in name for term in terms):
                return name
        return None

    def get_wikipedia_subcats(self, topic):
        """Return lower-cased names of Wikipedia subcategories for ``topic``.

        ``Category:<Topic>`` is tried first. If it has no subcategories, the
        top search hit for the topic is looked up and the first of its
        categories containing one of the topic's words is used instead.

        Returns:
            A list of subcategory names, or an empty list when nothing is
            found or any step of the lookup fails.
        """
        try:
            members = self._category_members(topic, 'subcat')
            if not members:
                fallback = self._related_category(topic)
                if fallback is None:
                    return []
                members = self._category_members(fallback, 'subcat')
            return [m['title'].replace('Category:', '').lower() for m in members]
        except Exception:
            # Deliberate best effort: Wikipedia only enriches the results.
            return []

    def get_category_pages(self, category):
        """Return lower-cased single-word page titles (longer than 3 chars) in a category.

        Returns an empty list when the lookup fails.
        """
        try:
            members = self._category_members(category, 'page')
            # Filter to single words, lower case
            return [
                m['title'].lower()
                for m in members
                if ' ' not in m['title'] and len(m['title']) > 3
            ]
        except Exception:
            # Deliberate best effort, same as the subcategory lookup.
            return []

    def is_subcategory(self, topic, word):
        """Tell whether the Wikipedia page for ``word`` has a category mentioning ``topic``.

        Returns False when the page has no categories or the lookup fails.
        """
        try:
            needle = topic.lower()
            titles = self._page_category_titles(word.capitalize())
            return any(needle in title.lower() for title in titles)
        except Exception:
            return False

    def _cluster_subcats(self, words):
        """Cluster ``words`` with KMeans and return the vocab word nearest each centroid."""
        vectors = self._embed(words)
        n_clusters = min(self.FALLBACK_CLUSTERS, len(words))
        kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(vectors)
        centers = np.ascontiguousarray(kmeans.cluster_centers_, dtype=np.float32)
        faiss.normalize_L2(centers)
        _, nearest = self.faiss_index.search(centers, 1)
        return [self.vocab[row[0]] for row in nearest]

    def generate_words(self, topic, num_words=20):
        """Return up to ``num_words`` words related to ``topic``, best first.

        Scores are accumulated per word from three sources: page titles found
        under the topic's Wikipedia subcategories, dictionary words similar to
        each subcategory (down-weighted by ``SUBCAT_WEIGHT``) and dictionary
        words similar to the topic itself or its singular/plural variation.
        The topic embedding is perturbed with Gaussian noise whose scale comes
        from the ``SEARCH_RANDOMNESS`` environment variable (default 0.02), so
        repeated calls may differ.

        Args:
            topic: Free-text topic, e.g. ``"indian food"``.
            num_words: Maximum number of words to return.

        Returns:
            A list of words; the topic itself (compared case-insensitively)
            is never included.
        """
        topic_key = topic.lower()
        variations = [topic_key, topic[:-1] if topic.endswith('s') else topic + 's']

        all_results = {}

        subcats = self.get_wikipedia_subcats(topic)
        print('wiki subcats', subcats)

        # Add specific words from subcategory pages
        for sub in subcats:
            for page in self.get_category_pages(sub):
                all_results[page] = all_results.get(page, 0) + self.WIKI_PAGE_SCORE

        # Loop-invariant settings, read once instead of once per variation.
        noise_factor = float(os.getenv("SEARCH_RANDOMNESS", "0.02"))
        search_size = min(self.max_results * 3, len(self.vocab))
        # Subcategory searches carry no noise, so their hits can be reused across variations.
        subcat_hits = {}

        for variation in variations:
            direct_hits = self._search(self._embed([variation], noise_factor), search_size)
            initial_results = [word for word, _ in direct_hits]

            # Identify additional subcats from initial results if wiki didn't provide
            if not subcats:
                subcats.extend(
                    word
                    for word in initial_results[:self.SUBCAT_PROBE_LIMIT]
                    if self.is_subcategory(topic, word)
                )

            # Fallback clustering if still no subcats
            if not subcats and len(initial_results) >= 3:
                subcats = self._cluster_subcats(initial_results)

            # Search subcategories (credited once per variation, as before)
            for sub in subcats:
                if sub not in subcat_hits:
                    subcat_hits[sub] = self._search(self._embed([sub]), search_size)
                for word, score in subcat_hits[sub]:
                    all_results[word] = all_results.get(word, 0) + score * self.SUBCAT_WEIGHT

            # Add initial results
            for word, score in direct_hits:
                all_results[word] = all_results.get(word, 0) + score

        # Drop the topic itself regardless of case, so "Animal" never returns "animal"
        # and no self-loop is added to the graph below.
        candidates = {w: s for w, s in all_results.items() if w.lower() != topic_key}
        if not candidates:
            return []

        # Combine with graph-based weighting: a star graph centred on the topic,
        # with each candidate attached by an edge weighted with its accumulated score.
        G = nx.Graph()
        G.add_node(topic)
        for word, score in candidates.items():
            G.add_edge(topic, word, weight=score)
        pr = nx.pagerank(G, weight='weight')

        # Sort and return top, exclude topic
        ranked = sorted(pr.items(), key=lambda item: item[1], reverse=True)
        return [word for word, _ in ranked if word != topic][:num_words]

if __name__ == "__main__":
    # Manual smoke run: print the generated word list for a handful of topics.
    script_dir = os.path.dirname(__file__)
    cache_dir = os.path.join(script_dir, 'model_cache')
    # Make sure the model cache folder exists beside this script.
    os.makedirs(cache_dir, exist_ok=True)

    generator = CrosswordGenerator2(cache_dir=cache_dir)

    # "animal" is listed twice on purpose-or-not; the list is kept exactly as it was.
    sample_topics = (
        "animal",
        "animal",
        "science",
        "technology",
        "food",
        "indian food",
        "chinese food",
    )
    for topic in sample_topics:
        print(f"------------- {topic} ------------")
        alphabetical = sorted(generator.generate_words(topic))
        print(f"Generated words for topic '{topic}':")
        print(alphabetical)