|
|
|
""" |
|
WordNet-Based Clue Generator for Crossword Puzzles |
|
|
|
Uses NLTK WordNet to generate crossword clues by analyzing word definitions, |
|
synonyms, hypernyms, and semantic relationships. Integrated with the thematic |
|
word generator for complete crossword creation without API dependencies. |
|
|
|
Features: |
|
- WordNet-based clue generation using definitions and relationships |
|
- Integration with UnifiedThematicWordGenerator for word discovery |
|
- Interactive mode with topic-based generation |
|
- Multiple clue styles (definition, synonym, category, descriptive) |
|
- Difficulty-based clue complexity |
|
- Caching for improved performance |
|
""" |
|
|
|
import os |
|
import sys |
|
import re |
|
import time |
|
import logging |
|
from typing import List, Dict, Optional, Tuple, Set, Any |
|
from pathlib import Path |
|
from dataclasses import dataclass |
|
from collections import defaultdict |
|
import random |
|
|
|
|
|
# NLTK is treated as an optional dependency: record whether it imported so the
# rest of the module can report the problem instead of failing at import time.
try:
    import nltk
    from nltk.corpus import wordnet as wn
    from nltk.stem import WordNetLemmatizer
    NLTK_AVAILABLE = True
except ImportError:
    print("❌ NLTK not available. Install with: pip install nltk")
    NLTK_AVAILABLE = False

# Put this file's directory first on the import path so the sibling
# thematic_word_generator module imported below can be found.
sys.path.insert(0, str(Path(__file__).parent))

# The thematic generator is optional as well; its availability is recorded in
# THEMATIC_AVAILABLE.
try:
    from thematic_word_generator import UnifiedThematicWordGenerator
    THEMATIC_AVAILABLE = True
except ImportError as e:
    print(f"❌ Thematic generator import error: {e}")
    THEMATIC_AVAILABLE = False

# Configure root logging at import time and create this module's logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|
|
|
|
@dataclass
class WordNetClueEntry:
    """One finished crossword entry: the answer, its clue and its metadata.

    The first seven fields are required and positional order matters to
    callers; the last two are optional and default to ``None``.
    """

    # The answer and the clue shown to the solver.
    word: str
    clue: str

    # Topic the entry was generated for and the score it was ranked by.
    topic: str
    similarity_score: float

    # Frequency tier identifier and its human-readable description.
    frequency_tier: str
    tier_description: str

    # Clue style label recorded for the entry.
    clue_type: str

    # Optional provenance details; not populated unless the caller sets them.
    synset_info: Optional[str] = None
    definition_source: Optional[str] = None
|
|
|
|
|
def ensure_nltk_data(nltk_data_dir: Optional[str] = None):
    """Ensure the required NLTK resources are installed, downloading if needed.

    Each required resource is looked up on NLTK's search path; a resource
    that cannot be found is downloaded.

    Args:
        nltk_data_dir: Custom directory for NLTK data. When given, it is
            created if missing, placed at the front of ``nltk.data.path`` and
            used as the download target. If None, NLTK's defaults are used.

    Returns:
        True when every required resource is present (already installed or
        downloaded successfully). False when NLTK is not importable or when
        any download fails.
    """
    if not NLTK_AVAILABLE:
        return False

    if nltk_data_dir:
        nltk_data_path = Path(nltk_data_dir)
        nltk_data_path.mkdir(parents=True, exist_ok=True)

        # Search the custom directory before NLTK's default locations.
        if str(nltk_data_path) not in nltk.data.path:
            nltk.data.path.insert(0, str(nltk_data_path))
            logger.info(f"📂 Added NLTK data path: {nltk_data_path}")

    # Download package id -> resource path used to look it up.
    corpus_paths = {
        'wordnet': 'corpora/wordnet',
        'omw-1.4': 'corpora/omw-1.4',
        'punkt': 'tokenizers/punkt',
        'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger'
    }

    required_corpora = ['wordnet', 'punkt', 'averaged_perceptron_tagger', 'omw-1.4']

    for corpus in required_corpora:
        corpus_path = corpus_paths[corpus]

        try:
            found_corpus = nltk.data.find(corpus_path)
        except LookupError:
            pass
        else:
            logger.info(f"✅ Found {corpus} at: {found_corpus}")
            continue

        # The NLTK lookup failed; accept a copy that exists on disk in the
        # custom directory before resorting to a download.
        if nltk_data_dir:
            local_corpus_path = Path(nltk_data_dir) / corpus_path
            if local_corpus_path.exists():
                logger.info(f"✅ Found {corpus} locally at: {local_corpus_path}")
                continue

        logger.warning(f"❌ {corpus} not found, attempting download...")
        try:
            if nltk_data_dir:
                logger.info(f"📥 Downloading {corpus} to: {nltk_data_dir}")
                downloaded = nltk.download(corpus, download_dir=nltk_data_dir, quiet=False)
                destination = f"to: {nltk_data_dir}"
            else:
                logger.info(f"📥 Downloading {corpus} to default location")
                downloaded = nltk.download(corpus, quiet=False)
                destination = "to default location"
        except Exception as e:
            logger.warning(f"⚠️ Failed to download {corpus}: {e}")
            return False

        # nltk.download() signals most failures (network errors, unknown
        # package ids) by returning False rather than raising, so the result
        # must be checked before reporting success.
        if downloaded is False:
            logger.warning(f"⚠️ Failed to download {corpus}: downloader reported an error")
            return False

        logger.info(f"✅ Downloaded {corpus} {destination}")

    return True
|
|
|
|
|
class WordNetClueGenerator:
    """
    WordNet-based clue generator that creates crossword clues using semantic
    relationships and definitions from the WordNet lexical database.
    """

    # Suffixes appended to the answer to catch simple inflections of it
    # inside a candidate definition ('' is the answer itself).
    _VARIANT_SUFFIXES = ('', 's', 'ing', 'ed', 'er', 'ly')

    # Maximum number of word senses combined into a single clue.
    _MAX_SENSES = 3

    def __init__(self, cache_dir: Optional[str] = None):
        """Initialize WordNet clue generator.

        No NLTK data is touched here; call :meth:`initialize` (or let
        :meth:`generate_clue` do so lazily) before generating clues.

        Args:
            cache_dir: Directory for caching (used for both model cache and
                NLTK data). Defaults to ``model_cache`` next to this file.
        """
        self.cache_dir = cache_dir or str(Path(__file__).parent / 'model_cache')
        self.nltk_data_dir = str(Path(self.cache_dir) / 'nltk_data')
        self.lemmatizer = None
        self.clue_cache = {}
        self.is_initialized = False

        # Generic words kept available for clue filtering; nothing in this
        # class consults the set yet.
        self.avoid_words = {
            'thing', 'stuff', 'item', 'object', 'entity', 'something', 'anything',
            'person', 'people', 'someone', 'anyone', 'somebody', 'anybody',
            'place', 'location', 'somewhere', 'anywhere', 'area', 'spot',
            'time', 'moment', 'period', 'while', 'when', 'then',
            'way', 'manner', 'method', 'means', 'how', 'what', 'which'
        }

    def initialize(self):
        """Initialize the WordNet clue generator.

        Ensures the NLTK data is installed under ``self.nltk_data_dir`` and
        creates the lemmatizer. Safe to call repeatedly.

        Returns:
            True when the generator is ready, False when NLTK is missing,
            the data could not be obtained, or the lemmatizer failed to load.
        """
        if self.is_initialized:
            return True

        if not NLTK_AVAILABLE:
            logger.error("❌ NLTK not available - cannot initialize WordNet generator")
            return False

        logger.info("🚀 Initializing WordNet Clue Generator...")
        logger.info(f"📂 Using cache directory: {self.cache_dir}")
        logger.info(f"📂 Using NLTK data directory: {self.nltk_data_dir}")
        start_time = time.time()

        if not ensure_nltk_data(self.nltk_data_dir):
            logger.error("❌ Failed to download required NLTK data")
            return False

        try:
            self.lemmatizer = WordNetLemmatizer()
            logger.info("✅ WordNet lemmatizer initialized")
        except Exception as e:
            logger.error(f"❌ Failed to initialize lemmatizer: {e}")
            return False

        self.is_initialized = True
        init_time = time.time() - start_time
        logger.info(f"✅ WordNet clue generator ready in {init_time:.2f}s")

        return True

    def generate_clue(self, word: str, topic: str = "", clue_style: str = "auto",
                      difficulty: str = "medium") -> str:
        """Generate a crossword clue using WordNet definitions.

        Up to three senses of the word are used. When the word has more than
        three senses, three are chosen at random, so repeated calls may
        return different clues. Definitions that contain the answer (or a
        simple inflection of it) are dropped; the rest are joined with "; ".

        Args:
            word: Target word for clue generation
            topic: Topic context (for fallback only)
            clue_style: Ignored - kept for compatibility
            difficulty: Ignored - kept for compatibility

        Returns:
            Generated crossword clue, or a generic fallback clue when the
            generator cannot initialize, WordNet has no entry for the word,
            or every definition gives the answer away.
        """
        if not self.is_initialized and not self.initialize():
            return self._generate_fallback_clue(word, topic)

        word_clean = word.lower().strip()

        synsets = self._get_synsets(word_clean)
        if not synsets:
            return self._generate_fallback_clue(word, topic)

        if len(synsets) > self._MAX_SENSES:
            synsets = random.sample(synsets, self._MAX_SENSES)

        definitions = []
        for syn in synsets:
            definition = syn.definition()
            if not self._definition_reveals_word(definition, word_clean):
                definitions.append(definition)

        if not definitions:
            return self._generate_fallback_clue(word, topic)

        return "; ".join(definitions)

    def _get_synsets(self, word: str) -> List[Any]:
        """Return the WordNet synsets for an already-normalized word."""
        if not word:
            return []
        return wn.synsets(word)

    @classmethod
    def _definition_reveals_word(cls, definition: str, word: str) -> bool:
        """Tell whether a definition contains the answer as a standalone word.

        The answer and its simple suffixed forms are matched case
        insensitively. A match must not be adjacent to a letter or digit, so
        "cat," and "(cats)" count while "concatenate" does not.
        """
        if not word:
            return False
        variants = "|".join(re.escape(word + suffix) for suffix in cls._VARIANT_SUFFIXES)
        pattern = rf"(?<![a-z0-9])(?:{variants})(?![a-z0-9])"
        return re.search(pattern, definition.lower()) is not None

    def _generate_fallback_clue(self, word: str, topic: str) -> str:
        """Generate fallback clue when WordNet fails."""
        if topic:
            return f"Related to {topic}"
        return "Crossword answer"

    def get_clue_info(self, word: str) -> Dict[str, Any]:
        """Get detailed information about WordNet data for a word.

        Returns:
            ``{"error": ...}`` when the generator is not initialized.
            Otherwise a dict with the word, the total synset count and
            details (name, part of speech, definition, up to two examples,
            up to two hypernyms, up to three synonyms) for the first three
            synsets.
        """
        if not self.is_initialized:
            return {"error": "Generator not initialized"}

        word_clean = word.lower().strip()
        synsets = self._get_synsets(word_clean)

        info = {
            "word": word,
            "synsets_count": len(synsets),
            "synsets": []
        }

        for synset in synsets[:3]:
            synset_info = {
                "name": synset.name(),
                "pos": synset.pos(),
                "definition": synset.definition(),
                "examples": synset.examples()[:2],
                "hypernyms": [h.name() for h in synset.hypernyms()[:2]],
                "synonyms": [l.name().replace('_', ' ') for l in synset.lemmas()[:3]]
            }
            info["synsets"].append(synset_info)

        return info
|
|
|
|
|
class IntegratedWordNetCrosswordGenerator:
    """
    Complete crossword generation system using WordNet clues and thematic word discovery.
    """

    def __init__(self, vocab_size_limit: Optional[int] = None, cache_dir: Optional[str] = None):
        """Initialize the integrated WordNet crossword generator.

        Only stores configuration; the generators are built by
        :meth:`initialize`.

        Args:
            vocab_size_limit: Maximum vocabulary size for thematic generator
                (defaults to 50000)
            cache_dir: Cache directory for models and data (defaults to
                ``model_cache`` next to this file)
        """
        self.cache_dir = cache_dir or str(Path(__file__).parent / 'model_cache')
        self.vocab_size_limit = vocab_size_limit or 50000

        # Both generators are created lazily in initialize().
        self.thematic_generator = None
        self.clue_generator = None
        self.is_initialized = False

        # Running totals across all generate_crossword_entries() calls.
        self.stats = {
            'words_discovered': 0,
            'clues_generated': 0,
            'cache_hits': 0,
            'total_time': 0.0
        }

    def initialize(self):
        """Initialize both generators.

        The WordNet clue generator is required. The thematic generator is
        only built when its module imported successfully; if it is available
        but fails to initialize, the whole initialization counts as failed.

        Returns:
            True when initialization succeeded, False otherwise.
        """
        if self.is_initialized:
            return True

        start_time = time.time()
        logger.info("🚀 Initializing Integrated WordNet Crossword Generator...")

        success = True

        logger.info("🔄 Initializing WordNet clue generator...")
        self.clue_generator = WordNetClueGenerator(self.cache_dir)
        if not self.clue_generator.initialize():
            logger.error("❌ Failed to initialize WordNet clue generator")
            success = False
        else:
            logger.info("✅ WordNet clue generator ready")
            logger.info(f"📂 NLTK data stored in: {self.clue_generator.nltk_data_dir}")

        if THEMATIC_AVAILABLE:
            logger.info("🔄 Initializing thematic word generator...")
            try:
                self.thematic_generator = UnifiedThematicWordGenerator(
                    cache_dir=self.cache_dir,
                    vocab_size_limit=self.vocab_size_limit
                )
                self.thematic_generator.initialize()
                logger.info(f"✅ Thematic generator ready ({self.thematic_generator.get_vocabulary_size():,} words)")
            except Exception as e:
                logger.error(f"❌ Failed to initialize thematic generator: {e}")
                success = False
        else:
            logger.warning("⚠️ Thematic generator not available - limited word discovery")

        self.is_initialized = success
        init_time = time.time() - start_time
        logger.info(f"{'✅' if success else '❌'} Initialization {'completed' if success else 'failed'} in {init_time:.2f}s")

        return success

    def generate_crossword_entries(self, topic: str, num_words: int = 15,
                                   difficulty: str = "medium", clue_style: str = "auto") -> List[WordNetClueEntry]:
        """Generate complete crossword entries for a topic.

        Twice as many candidate words as requested are asked of the thematic
        generator so that candidates whose clue generation fails can be
        replaced by the next candidate. Without a thematic generator the
        topic itself is the only candidate.

        Args:
            topic: Topic for word generation
            num_words: Number of entries to generate
            difficulty: Difficulty level ('easy', 'medium', 'hard')
            clue_style: Clue generation style

        Returns:
            List of at most ``num_words`` WordNetClueEntry objects, sorted by
            descending similarity score. Empty when initialization fails,
            ``num_words`` is not positive, or no words are found.
        """
        if not self.is_initialized and not self.initialize():
            return []

        # Nothing to do for a non-positive request; a negative count would
        # otherwise turn into a surprising negative slice bound.
        if num_words <= 0:
            return []

        start_time = time.time()
        logger.info(f"🎯 Generating {num_words} crossword entries for '{topic}' (difficulty: {difficulty})")

        if self.thematic_generator:
            try:
                word_results = self.thematic_generator.generate_thematic_words(
                    inputs=topic,
                    num_words=num_words * 2,
                    min_similarity=0.2
                )
                self.stats['words_discovered'] += len(word_results)
            except Exception as e:
                logger.error(f"❌ Word generation failed: {e}")
                word_results = []
        else:
            word_results = [(topic.upper(), 0.9, "tier_5_common")]

        if not word_results:
            logger.warning(f"⚠️ No words found for topic '{topic}'")
            return []

        # Walk the candidates in the order given and stop once enough entries
        # exist, so spare candidates fill in for any word that fails.
        entries = []
        for word, similarity, tier in word_results:
            if len(entries) >= num_words:
                break
            try:
                clue = self.clue_generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=clue_style,
                    difficulty=difficulty
                )

                if clue:
                    tier_desc = self._get_tier_description(tier)
                    entry = WordNetClueEntry(
                        word=word.upper(),
                        clue=clue,
                        topic=topic,
                        similarity_score=similarity,
                        frequency_tier=tier,
                        tier_description=tier_desc,
                        clue_type=clue_style
                    )
                    entries.append(entry)
                    self.stats['clues_generated'] += 1

            except Exception as e:
                logger.error(f"❌ Failed to generate clue for '{word}': {e}")

        entries.sort(key=lambda x: x.similarity_score, reverse=True)
        final_entries = entries[:num_words]

        total_time = time.time() - start_time
        self.stats['total_time'] += total_time

        logger.info(f"✅ Generated {len(final_entries)} entries in {total_time:.2f}s")
        return final_entries

    def _get_tier_description(self, tier: str) -> str:
        """Get tier description from thematic generator or provide default.

        The default is the tier id with underscores turned into spaces and
        title-cased.
        """
        if self.thematic_generator and hasattr(self.thematic_generator, 'tier_descriptions'):
            return self.thematic_generator.tier_descriptions.get(tier, tier)
        return tier.replace('_', ' ').title()

    def get_stats(self) -> Dict[str, Any]:
        """Get generation statistics.

        Returns:
            The running counters plus availability flags for both generators
            and the thematic vocabulary size (0 without a thematic generator).
        """
        return {
            **self.stats,
            'thematic_available': self.thematic_generator is not None,
            'wordnet_available': self.clue_generator is not None and self.clue_generator.is_initialized,
            'vocab_size': self.thematic_generator.get_vocabulary_size() if self.thematic_generator else 0
        }
|
|
|
|
|
# Clue style names recognised on the interactive command line.
_CLUE_STYLES = ('definition', 'synonym', 'hypernym', 'category', 'descriptive')


def _parse_generation_request(parts):
    """Turn a tokenized '<topic> [options...]' command into generation settings.

    Returns a ``(topic, num_words, difficulty, clue_style)`` tuple. Options
    may appear in any order after the topic; unrecognised tokens are ignored.
    """
    topic = parts[0]
    num_words = 8
    difficulty = 'medium'
    clue_style = 'auto'

    idx = 1
    while idx < len(parts):
        raw = parts[idx]
        token = raw.lower()
        # isdecimal() only accepts characters int() can parse; isdigit()
        # also accepts e.g. superscript digits, which make int() raise.
        if raw.isdecimal():
            num_words = int(raw)
        elif token in ('easy', 'medium', 'hard'):
            difficulty = token
        elif token == 'style' and idx + 1 < len(parts):
            clue_style = parts[idx + 1].lower()
            idx += 1  # the style name has been consumed as well
        elif token in _CLUE_STYLES:
            clue_style = token
        idx += 1

    return topic, num_words, difficulty, clue_style


def main():
    """Interactive WordNet crossword generator.

    Initializes the integrated generator, then reads commands from standard
    input until the user quits, interrupts, or input ends, and finally prints
    a session summary.
    """
    if not NLTK_AVAILABLE:
        print("❌ NLTK not available. Please install with: pip install nltk")
        return

    print("🚀 WordNet Crossword Generator")
    print("=" * 60)
    print("Using NLTK WordNet for clue generation + thematic word discovery")

    cache_dir = str(Path(__file__).parent / 'model_cache')
    generator = IntegratedWordNetCrosswordGenerator(
        vocab_size_limit=50000,
        cache_dir=cache_dir
    )

    print("\n🔄 Initializing system...")
    if not generator.initialize():
        print("❌ Failed to initialize system")
        return

    stats = generator.get_stats()
    print("\n📊 System Status:")
    print(f" WordNet clues: {'✅' if stats['wordnet_available'] else '❌'}")
    print(f" Thematic words: {'✅' if stats['thematic_available'] else '❌'}")
    if stats['vocab_size'] > 0:
        print(f" Vocabulary: {stats['vocab_size']:,} words")

    print("\n🎮 INTERACTIVE MODE")
    print("=" * 60)
    print("Commands:")
    print(" <topic> - Generate words and clues for topic")
    print(" <topic> <num_words> - Generate specific number of entries")
    print(" <topic> <num_words> <diff> - Set difficulty (easy/medium/hard)")
    print(" <topic> style <style> - Set clue style (definition/synonym/hypernym/category)")
    print(" info <word> - Show WordNet information for word")
    print(" test <word> <topic> - Test clue generation for specific word")
    print(" stats - Show generation statistics")
    print(" help - Show this help")
    print(" quit - Exit")
    print()
    print("Examples:")
    print(" animals - Generate animal-related crossword entries")
    print(" technology 10 hard - 10 hard technology entries")
    print(" music style synonym - Music entries with synonym-style clues")
    print(" info elephant - WordNet info for 'elephant'")

    while True:
        try:
            user_input = input("\n🎯 Enter command: ").strip()

            if user_input.lower() in ['quit', 'exit', 'q']:
                break

            if not user_input:
                continue

            parts = user_input.split()

            if user_input.lower() == 'help':
                print("\nCommands:")
                print(" <topic> [num_words] [difficulty] - Generate crossword entries")
                print(" <topic> style <clue_style> - Generate with specific clue style")
                print(" info <word> - Show WordNet info for word")
                print(" test <word> <topic> - Test clue generation")
                print(" stats - Show statistics")
                print(" quit - Exit")
                continue

            elif user_input.lower() == 'stats':
                stats = generator.get_stats()
                print("\n📊 Generation Statistics:")
                print(f" Words discovered: {stats['words_discovered']}")
                print(f" Clues generated: {stats['clues_generated']}")
                print(f" Total time: {stats['total_time']:.2f}s")
                if stats['clues_generated'] > 0:
                    avg_time = stats['total_time'] / stats['clues_generated']
                    print(f" Avg time per clue: {avg_time:.2f}s")
                continue

            elif parts[0].lower() == 'info' and len(parts) > 1:
                word = parts[1]
                print(f"\n📝 WordNet Information: '{word}'")
                info = generator.clue_generator.get_clue_info(word)

                if 'error' in info:
                    print(f" ❌ {info['error']}")
                else:
                    print(f" Synsets found: {info['synsets_count']}")
                    for number, synset in enumerate(info['synsets'], 1):
                        print(f"\n {number}. {synset['name']} ({synset['pos']})")
                        print(f" Definition: {synset['definition']}")
                        if synset['examples']:
                            print(f" Examples: {', '.join(synset['examples'])}")
                        if synset['synonyms']:
                            print(f" Synonyms: {', '.join(synset['synonyms'])}")
                        if synset['hypernyms']:
                            print(f" Categories: {', '.join(synset['hypernyms'])}")
                continue

            elif parts[0].lower() == 'test' and len(parts) >= 3:
                word = parts[1]
                topic = parts[2]
                print(f"\n🧪 Testing clue generation: '{word}' + '{topic}'")

                for style in _CLUE_STYLES:
                    clue = generator.clue_generator.generate_clue(word, topic, style, 'medium')
                    print(f" {style:12}: {clue if clue else '(no clue generated)'}")
                continue

            # Anything else is a generation request: '<topic> [options...]'.
            topic, num_words, difficulty, clue_style = _parse_generation_request(parts)

            print(f"\n🎯 Generating {num_words} {difficulty} entries for '{topic}'" +
                  (f" (style: {clue_style})" if clue_style != 'auto' else ""))
            print("-" * 60)

            try:
                start_time = time.time()
                entries = generator.generate_crossword_entries(
                    topic=topic,
                    num_words=num_words,
                    difficulty=difficulty,
                    clue_style=clue_style
                )
                generation_time = time.time() - start_time

                if entries:
                    print(f"✅ Generated {len(entries)} entries in {generation_time:.2f}s:")
                    print()

                    for number, entry in enumerate(entries, 1):
                        tier_short = entry.frequency_tier.split('_')[1] if '_' in entry.frequency_tier else 'unk'
                        print(f" {number:2}. {entry.word:<12} | {entry.clue}")
                        print(f" Similarity: {entry.similarity_score:.3f} | Tier: {tier_short} | Type: {entry.clue_type}")
                        print()
                else:
                    print("❌ No entries generated. Try a different topic.")

            except Exception as e:
                print(f"❌ Error: {e}")

        except (KeyboardInterrupt, EOFError):
            # EOFError means stdin was closed (piped input exhausted or
            # Ctrl-D). Every later input() would raise it again, so it must
            # end the loop instead of being reported and retried forever.
            print("\n\n👋 Exiting WordNet crossword generator")
            break
        except Exception as e:
            print(f"❌ Error: {e}")

    final_stats = generator.get_stats()
    if final_stats['clues_generated'] > 0:
        print("\n📊 Session Summary:")
        print(f" Entries generated: {final_stats['clues_generated']}")
        print(f" Total time: {final_stats['total_time']:.2f}s")
        print(f" Average per entry: {final_stats['total_time']/final_stats['clues_generated']:.2f}s")

    print("\n✅ Thanks for using WordNet Crossword Generator!")
|
|
|
|
|
# Start the interactive session only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|