#!/usr/bin/env python3
"""
Integrated Crossword Generator

Combines thematic word discovery with API-based clue generation for complete
crossword creation.

This system integrates:
- UnifiedThematicWordGenerator: Smart word discovery using semantic embeddings
- APIClueGenerator: High-quality clue generation using multiple AI models

Creates a complete crossword generation pipeline with both intelligent word
selection and professional-quality clues.
"""

import sys
import os
import time
import logging
import asyncio
import threading
from typing import List, Dict, Optional, Tuple, Any
from pathlib import Path
from dataclasses import dataclass

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from thematic_word_generator import UnifiedThematicWordGenerator
    THEMATIC_AVAILABLE = True
except ImportError as e:
    print(f"āŒ Thematic generator import error: {e}")
    THEMATIC_AVAILABLE = False

try:
    from api_clue_generator import APIClueGenerator
    API_AVAILABLE = True
except ImportError as e:
    print(f"āŒ API generator import error: {e}")
    API_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


@dataclass
class CrosswordEntry:
    """Complete crossword entry with word, clue, and metadata."""

    word: str  # answer word; the generator stores it upper-cased
    clue: str  # clue text (API-generated or template fallback)
    topic: str  # topic the word was discovered for
    similarity_score: float  # similarity after the difficulty tier adjustment
    frequency_tier: str  # raw tier key reported by the thematic generator
    tier_description: str  # readable tier label (falls back to the tier key)
    clue_quality: str  # quality label, e.g. "EXCELLENT", "GOOD", "BASIC"
    clue_model: str  # key of the model that produced the clue, or "template"


class IntegratedCrosswordGenerator:
    """
    Integrated crossword generator combining thematic word discovery with
    API clue generation.

    This class provides a complete pipeline from topic input to finished
    crossword entries with both intelligent word selection and high-quality
    clue generation.
    """

    def __init__(self, cache_dir: Optional[str] = None,
                 vocab_size_limit: Optional[int] = None,
                 hf_token: Optional[str] = None):
        """Initialize the integrated crossword generator.

        Only constructs the components; call ``initialize()`` (or let
        ``generate_crossword_entries`` do it lazily) before generating.

        Args:
            cache_dir: Directory for caching models and embeddings
            vocab_size_limit: Maximum vocabulary size for thematic generator
            hf_token: Hugging Face API token for clue generation
        """
        self.cache_dir = cache_dir or os.path.join(os.path.dirname(__file__), 'model_cache')
        self.vocab_size_limit = vocab_size_limit

        # Core components
        self.thematic_generator: Optional[UnifiedThematicWordGenerator] = None
        self.api_clue_generator: Optional[APIClueGenerator] = None

        # Initialization status
        self.is_initialized = False
        self.thematic_ready = False
        self.api_ready = False

        # Serializes initialize() so that it stays idempotent when it is run
        # from a worker thread by initialize_async().
        self._init_lock = threading.Lock()

        # Performance tracking
        self.stats = {
            'words_discovered': 0,
            'clues_generated': 0,
            'api_calls': 0,
            'cache_hits': 0,
            'total_time': 0.0
        }

        # Check availability
        if not THEMATIC_AVAILABLE:
            logger.error("āŒ UnifiedThematicWordGenerator not available - word discovery disabled")
        if not API_AVAILABLE:
            logger.error("āŒ APIClueGenerator not available - API clue generation disabled")

        # Initialize components if available
        if THEMATIC_AVAILABLE:
            # NOTE(review): the raw ``cache_dir`` argument (possibly None) is
            # forwarded here, not the defaulted ``self.cache_dir`` - confirm
            # which one the thematic generator is expected to receive.
            self.thematic_generator = UnifiedThematicWordGenerator(
                cache_dir=cache_dir,
                vocab_size_limit=vocab_size_limit
            )

        if API_AVAILABLE:
            self.api_clue_generator = APIClueGenerator(hf_token=hf_token)

    def initialize(self):
        """Initialize both generators.

        Idempotent: returns immediately once initialization has completed.
        A failure to initialize the thematic generator is logged, not raised;
        ``thematic_ready`` then stays False.
        """
        with self._init_lock:
            if self.is_initialized:
                return

            start_time = time.time()
            logger.info("šŸš€ Initializing Integrated Crossword Generator...")

            # Initialize thematic word generator
            if self.thematic_generator:
                logger.info("šŸ”„ Initializing thematic word generator...")
                try:
                    self.thematic_generator.initialize()
                    self.thematic_ready = True
                    logger.info("āœ… Thematic word generator ready")
                except Exception as e:
                    logger.error(f"āŒ Failed to initialize thematic generator: {e}")

            # API clue generator is ready immediately (no initialization needed)
            if self.api_clue_generator:
                if self.api_clue_generator.hf_token:
                    self.api_ready = True
                    logger.info("āœ… API clue generator ready")
                else:
                    logger.warning("āš ļø API clue generator has no token - clue generation may fail")

            self.is_initialized = True
            init_time = time.time() - start_time
            logger.info(f"šŸŽ‰ Integrated generator initialized in {init_time:.2f}s")

            # Log capabilities
            capabilities = []
            if self.thematic_ready:
                vocab_size = self.thematic_generator.get_vocabulary_size()
                capabilities.append(f"Word Discovery ({vocab_size:,} words)")
            if self.api_ready:
                model_count = len(self.api_clue_generator.models)
                capabilities.append(f"API Clues ({model_count} models)")

            logger.info(f"šŸ’” Capabilities: {', '.join(capabilities) if capabilities else 'Limited (check dependencies)'}")

    async def initialize_async(self):
        """Async initialization for backend compatibility.

        Runs the blocking ``initialize()`` in the event loop's default
        executor so the loop is not stalled while components load.
        """
        if self.is_initialized:
            return None
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.initialize)

    def generate_crossword_entries(self, topic: str, num_words: int = 15,
                                   difficulty: str = "medium",
                                   min_similarity: float = 0.3) -> List[CrosswordEntry]:
        """Generate complete crossword entries for a topic.

        Args:
            topic: Topic or theme for word generation
            num_words: Number of words to generate
            difficulty: Difficulty level (easy/medium/hard)
            min_similarity: Minimum similarity threshold for word discovery

        Returns:
            List of complete CrosswordEntry objects with words, clues, and
            metadata. Empty when ``num_words`` is not positive or when no
            words could be discovered.
        """
        # A non-positive request has nothing to produce; a negative value
        # would otherwise make the final slice drop entries from the end.
        if num_words <= 0:
            return []

        if not self.is_initialized:
            self.initialize()

        start_time = time.time()
        logger.info(f"šŸŽÆ Generating {num_words} crossword entries for topic: '{topic}' (difficulty: {difficulty})")

        # Step 1: Discover thematic words
        words_with_metadata = self._discover_words(topic, num_words, difficulty, min_similarity)

        if not words_with_metadata:
            logger.warning(f"āš ļø No words discovered for topic '{topic}'")
            return []

        logger.info(f"āœ… Discovered {len(words_with_metadata)} words")

        # Step 2: Generate clues for discovered words
        crossword_entries = self._generate_clues_for_words(words_with_metadata, topic)

        # Step 3: Sort by quality and similarity
        # (EXCELLENT clues first, then by descending similarity)
        crossword_entries.sort(key=lambda x: (x.clue_quality == "EXCELLENT", x.similarity_score), reverse=True)

        total_time = time.time() - start_time
        self.stats['total_time'] += total_time

        logger.info(f"šŸŽ‰ Generated {len(crossword_entries)} complete crossword entries in {total_time:.2f}s")

        return crossword_entries[:num_words]  # Return requested number

    def _discover_words(self, topic: str, num_words: int, difficulty: str,
                        min_similarity: float) -> List[Tuple[str, float, str]]:
        """Discover thematic words using the thematic generator.

        Returns ``(word, adjusted_similarity, tier)`` tuples, or an empty list
        when the thematic generator is not ready or discovery raises.
        """
        if not self.thematic_ready:
            logger.error("āŒ Thematic word generator not ready - cannot discover words")
            return []

        try:
            # Map difficulty to word count multiplier (get extra words for better selection)
            word_multipliers = {"easy": 2, "medium": 2.5, "hard": 3}
            multiplier = word_multipliers.get(difficulty, 2.5)
            discover_count = int(num_words * multiplier)

            logger.info(f"šŸ” Discovering {discover_count} candidate words for '{topic}'...")

            # Use thematic generator with difficulty mapping
            results = self.thematic_generator.generate_thematic_words(
                inputs=topic,
                num_words=discover_count,
                min_similarity=min_similarity,
                multi_theme=False  # Single topic for focused results
            )

            # Filter by difficulty if needed
            filtered_results = self._filter_by_difficulty(results, difficulty)

            self.stats['words_discovered'] += len(filtered_results)
            return filtered_results

        except Exception as e:
            logger.error(f"āŒ Word discovery failed: {e}")
            return []

    def _filter_by_difficulty(self, results: List[Tuple[str, float, str]],
                              difficulty: str) -> List[Tuple[str, float, str]]:
        """Filter words by difficulty level using frequency tiers and length.

        Words outside the difficulty's length range are dropped. Words whose
        tier is not preferred are kept but have their similarity scaled by
        0.8. Unknown difficulty values use the "medium" settings. The result
        is sorted by adjusted similarity, highest first.
        """
        # Define difficulty criteria
        difficulty_config = {
            "easy": {
                "preferred_tiers": ["tier_2_extremely_common", "tier_3_very_common", "tier_4_highly_common"],
                "min_length": 3,
                "max_length": 6
            },
            "medium": {
                "preferred_tiers": ["tier_4_highly_common", "tier_5_common", "tier_6_moderately_common"],
                "min_length": 4,
                "max_length": 10
            },
            "hard": {
                "preferred_tiers": ["tier_6_moderately_common", "tier_7_somewhat_uncommon", "tier_8_uncommon"],
                "min_length": 5,
                "max_length": 15
            }
        }

        config = difficulty_config.get(difficulty, difficulty_config["medium"])

        # Apply filters
        filtered = []
        for word, similarity, tier in results:
            # Length filter
            if not (config["min_length"] <= len(word) <= config["max_length"]):
                continue

            # Tier preference (but don't exclude entirely - just prefer)
            tier_score = 1.0 if tier in config["preferred_tiers"] else 0.8
            adjusted_similarity = similarity * tier_score

            filtered.append((word, adjusted_similarity, tier))

        # Sort by adjusted similarity
        filtered.sort(key=lambda x: x[1], reverse=True)
        return filtered

    def _generate_clues_for_words(self, words_with_metadata: List[Tuple[str, float, str]],
                                  topic: str) -> List[CrosswordEntry]:
        """Generate clues for discovered words using API generator.

        Falls back to template clues when the API generator is not ready.
        Words for which no usable clue is produced, or whose clue generation
        raises, are logged and skipped.
        """
        if not self.api_ready:
            logger.error("āŒ API clue generator not ready - using basic clues")
            return self._generate_basic_clues(words_with_metadata, topic)

        logger.info(f"šŸ¤– Generating API clues for {len(words_with_metadata)} words...")

        crossword_entries = []

        for word, similarity, tier in words_with_metadata:
            try:
                # Generate clue using API
                clue_results = self.api_clue_generator.generate_clue(word, topic)

                # Find best clue from all models
                best_clue = None
                best_quality = "FAILED"
                best_model = "none"

                for model_key, clue in clue_results.items():
                    if clue:
                        quality, _score = self.api_clue_generator.evaluate_clue_quality(word, clue)
                        if self._is_better_quality(quality, best_quality):
                            best_clue = clue
                            best_quality = quality
                            best_model = model_key

                # Counts the non-empty clues returned for this word.
                self.stats['api_calls'] += sum(1 for c in clue_results.values() if c)

                # Create crossword entry
                if best_clue:
                    tier_desc = self._get_tier_description(tier)

                    entry = CrosswordEntry(
                        word=word.upper(),  # Crosswords typically use uppercase
                        clue=best_clue,
                        topic=topic,
                        similarity_score=similarity,
                        frequency_tier=tier,
                        tier_description=tier_desc,
                        clue_quality=best_quality,
                        clue_model=best_model
                    )
                    crossword_entries.append(entry)
                    self.stats['clues_generated'] += 1
                else:
                    logger.warning(f"āš ļø No valid clue generated for '{word}'")

            except Exception as e:
                logger.error(f"āŒ Failed to generate clue for '{word}': {e}")

        return crossword_entries

    def _generate_basic_clues(self, words_with_metadata: List[Tuple[str, float, str]],
                              topic: str) -> List[CrosswordEntry]:
        """Generate basic fallback clues when API is not available.

        Every word gets the same template clue built from the lower-cased
        topic, with quality "BASIC" and model "template".
        """
        logger.info(f"šŸ”„ Generating basic fallback clues for {len(words_with_metadata)} words...")

        crossword_entries = []

        for word, similarity, tier in words_with_metadata:
            # Simple template-based clue
            clue = f"Term related to {topic.lower()}"
            tier_desc = self._get_tier_description(tier)

            entry = CrosswordEntry(
                word=word.upper(),
                clue=clue,
                topic=topic,
                similarity_score=similarity,
                frequency_tier=tier,
                tier_description=tier_desc,
                clue_quality="BASIC",
                clue_model="template"
            )
            crossword_entries.append(entry)
            self.stats['clues_generated'] += 1

        return crossword_entries

    def _is_better_quality(self, quality1: str, quality2: str) -> bool:
        """Compare clue quality levels.

        Returns True when ``quality1`` ranks strictly above ``quality2``.
        Labels outside the known ordering compare as not better.
        """
        quality_order = ["FAILED", "POOR", "ACCEPTABLE", "GOOD", "EXCELLENT"]
        try:
            return quality_order.index(quality1) > quality_order.index(quality2)
        except ValueError:
            return False

    def _get_tier_description(self, tier: str) -> str:
        """Get human-readable tier description.

        Falls back to the raw tier key when the thematic generator is not
        ready, exposes no ``tier_descriptions``, or has no entry for it.
        """
        if self.thematic_ready and hasattr(self.thematic_generator, 'tier_descriptions'):
            return self.thematic_generator.tier_descriptions.get(tier, tier)
        return tier

    def generate_by_multiple_topics(self, topics: List[str], words_per_topic: int = 10,
                                    difficulty: str = "medium") -> Dict[str, List[CrosswordEntry]]:
        """Generate crossword entries for multiple topics.

        Args:
            topics: List of topics to generate words for
            words_per_topic: Number of words per topic
            difficulty: Difficulty level

        Returns:
            Dictionary mapping topics to their crossword entries
        """
        logger.info(f"šŸŽÆ Generating crossword entries for {len(topics)} topics")

        results = {}
        for topic in topics:
            logger.info(f"šŸ“ Processing topic: '{topic}'")
            entries = self.generate_crossword_entries(
                topic=topic,
                num_words=words_per_topic,
                difficulty=difficulty
            )
            results[topic] = entries

        return results

    def get_stats(self) -> Dict[str, Any]:
        """Get performance statistics.

        Returns the running counters plus readiness flags, vocabulary size and
        API model count (both 0 when the matching component is not ready).
        """
        return {
            **self.stats,
            'thematic_ready': self.thematic_ready,
            'api_ready': self.api_ready,
            'is_initialized': self.is_initialized,
            'vocab_size': self.thematic_generator.get_vocabulary_size() if self.thematic_ready else 0,
            'api_models': len(self.api_clue_generator.models) if self.api_ready else 0
        }

    def get_system_info(self) -> Dict[str, Any]:
        """Get comprehensive system information.

        Reports per-component availability and readiness together with the
        output of ``get_stats()``.
        """
        info = {
            'system': 'IntegratedCrosswordGenerator',
            'components': {
                'thematic_generator': {
                    'available': THEMATIC_AVAILABLE,
                    'ready': self.thematic_ready,
                    'vocab_size': self.thematic_generator.get_vocabulary_size() if self.thematic_ready else 0
                },
                'api_clue_generator': {
                    'available': API_AVAILABLE,
                    'ready': self.api_ready,
                    'models': list(self.api_clue_generator.models.keys()) if self.api_ready else []
                }
            },
            'stats': self.get_stats()
        }

        return info


def main():
    """Demo the integrated crossword generator."""
    print("šŸš€ Integrated Crossword Generator Demo")
    print("=" * 60)

    # Check if required token is available
    hf_token = os.getenv('HF_TOKEN')
    if not hf_token:
        print("āŒ HF_TOKEN environment variable not set")
        print("Set your token: export HF_TOKEN='your_token_here'")
        return

    print("šŸ”„ Initializing integrated system...")
    # Forward the token that was just validated so the clue generator
    # actually receives it.
    generator = IntegratedCrosswordGenerator(
        vocab_size_limit=50000,  # Smaller for demo
        hf_token=hf_token
    )
    generator.initialize()

    # Show system info
    system_info = generator.get_system_info()
    print(f"\nšŸ“Š System Status:")
    for component, info in system_info['components'].items():
        status = "āœ… Ready" if info['ready'] else "āŒ Not Ready"
        print(f" {component}: {status}")

    if not (generator.thematic_ready and generator.api_ready):
        print("\nāš ļø System not fully ready - some features may be limited")
        print("Continuing with demo using available components...")

    # Demo topics
    demo_topics = ["animals", "technology", "music"]

    print(f"\nšŸŽÆ Generating crossword entries for {len(demo_topics)} topics")
    print("=" * 60)

    for topic in demo_topics:
        print(f"\nšŸ“ Topic: '{topic.upper()}'")
        print("-" * 40)

        try:
            start_time = time.time()
            entries = generator.generate_crossword_entries(
                topic=topic,
                num_words=5,  # Small number for demo
                difficulty="medium"
            )
            generation_time = time.time() - start_time

            if entries:
                print(f"ā±ļø Generated {len(entries)} entries in {generation_time:.2f}s")
                print()

                for i, entry in enumerate(entries, 1):
                    quality_icon = {
                        "EXCELLENT": "šŸ†",
                        "GOOD": "āœ…",
                        "ACCEPTABLE": "šŸ”„",
                        "POOR": "āŒ",
                        "BASIC": "šŸ“"
                    }.get(entry.clue_quality, "?")

                    print(f" {i}. {entry.word:<12} | {quality_icon} {entry.clue}")
                    print(f" Similarity: {entry.similarity_score:.3f} | {entry.tier_description}")
                    print(f" Model: {entry.clue_model}")
                    print()
            else:
                print("āŒ No entries generated")

        except Exception as e:
            print(f"āŒ Error generating entries for '{topic}': {e}")

    # Show final stats
    print("=" * 60)
    print("šŸ“Š FINAL STATISTICS")
    print("=" * 60)

    stats = generator.get_stats()
    print(f"Words discovered: {stats['words_discovered']}")
    print(f"Clues generated: {stats['clues_generated']}")
    print(f"API calls made: {stats['api_calls']}")
    print(f"Total time: {stats['total_time']:.2f}s")

    print("\nāœ… Integrated crossword generator demo complete!")
    print("\nšŸ’” This system combines:")
    print(" šŸ” Smart word discovery (100K+ vocabulary, semantic analysis)")
    print(" šŸ¤– High-quality clue generation (multiple AI models)")
    print(" šŸ“Š Difficulty control (frequency tiers)")
    print(" šŸŽÆ Topic-focused generation")


if __name__ == "__main__":
    main()