abc123 / hack /integrated_crossword_generator.py
vimalk78's picture
feat(crossword): generated crosswords with clues
486eff6
#!/usr/bin/env python3
"""
Integrated Crossword Generator
Combines thematic word discovery with API-based clue generation for complete crossword creation.
This system integrates:
- UnifiedThematicWordGenerator: Smart word discovery using semantic embeddings
- APIClueGenerator: High-quality clue generation using multiple AI models
Creates a complete crossword generation pipeline with both intelligent word selection
and professional-quality clues.
"""
import sys
import os
import time
import logging
import asyncio
from typing import List, Dict, Optional, Tuple, Any
from pathlib import Path
from dataclasses import dataclass
# Put this file's own directory at the front of sys.path so the sibling
# generator modules below resolve no matter where the script is launched from.
sys.path.insert(0, str(Path(__file__).parent))

# Both generators are optional: a failed import is reported on stdout and
# recorded in a module-level flag instead of aborting the import of this module.
try:
    from thematic_word_generator import UnifiedThematicWordGenerator
except ImportError as exc:
    print(f"❌ Thematic generator import error: {exc}")
    THEMATIC_AVAILABLE = False
else:
    THEMATIC_AVAILABLE = True

try:
    from api_clue_generator import APIClueGenerator
except ImportError as exc:
    print(f"❌ API generator import error: {exc}")
    API_AVAILABLE = False
else:
    API_AVAILABLE = True

# Root logging configuration (INFO level, timestamped records) and the
# module-level logger used throughout this file.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
@dataclass
class CrosswordEntry:
    """One finished crossword item: the answer word, its clue, and metadata.

    Instances are built by IntegratedCrosswordGenerator, either from an API
    generated clue or from the template-based fallback clue.
    """

    # Answer word; the generator stores it upper-cased.
    word: str
    # Clue text shown to the solver.
    clue: str
    # Topic the word was discovered for.
    topic: str
    # Similarity to the topic after the generator's tier-preference adjustment.
    similarity_score: float
    # Frequency tier identifier as reported by the thematic word generator.
    frequency_tier: str
    # Human-readable form of the tier (falls back to the tier identifier).
    tier_description: str
    # Quality label of the clue, e.g. "EXCELLENT", "GOOD", or "BASIC".
    clue_quality: str
    # Key of the model that produced the clue, or "template" for fallback clues.
    clue_model: str
class IntegratedCrosswordGenerator:
    """
    Integrated crossword generator combining thematic word discovery with API clue generation.

    This class provides a complete pipeline from topic input to finished crossword entries
    with both intelligent word selection and high-quality clue generation.
    """

    # Clue quality labels ordered from worst to best. Labels outside this
    # tuple (e.g. the fallback "BASIC") are treated as unranked.
    _QUALITY_ORDER = ("FAILED", "POOR", "ACCEPTABLE", "GOOD", "EXCELLENT")

    def __init__(self,
                 cache_dir: Optional[str] = None,
                 vocab_size_limit: Optional[int] = None,
                 hf_token: Optional[str] = None):
        """Initialize the integrated crossword generator.

        Only constructs the component objects; call initialize() (or let
        generate_crossword_entries() do it lazily) before generating.

        Args:
            cache_dir: Directory for caching models and embeddings
            vocab_size_limit: Maximum vocabulary size for thematic generator
            hf_token: Hugging Face API token for clue generation
        """
        self.cache_dir = cache_dir or os.path.join(os.path.dirname(__file__), 'model_cache')
        self.vocab_size_limit = vocab_size_limit

        # Core components (left as None when the backing module failed to import)
        self.thematic_generator: Optional[UnifiedThematicWordGenerator] = None
        self.api_clue_generator: Optional[APIClueGenerator] = None

        # Initialization status
        self.is_initialized = False
        self.thematic_ready = False
        self.api_ready = False

        # Performance tracking
        self.stats = {
            'words_discovered': 0,
            'clues_generated': 0,
            'api_calls': 0,
            'cache_hits': 0,
            'total_time': 0.0
        }

        # Check availability
        if not THEMATIC_AVAILABLE:
            logger.error("❌ UnifiedThematicWordGenerator not available - word discovery disabled")
        if not API_AVAILABLE:
            logger.error("❌ APIClueGenerator not available - API clue generation disabled")

        # Initialize components if available
        if THEMATIC_AVAILABLE:
            # NOTE(review): the raw `cache_dir` argument (possibly None) is
            # forwarded here, not the defaulted `self.cache_dir` - confirm
            # that the thematic generator's own default is what is intended.
            self.thematic_generator = UnifiedThematicWordGenerator(
                cache_dir=cache_dir,
                vocab_size_limit=vocab_size_limit
            )
        if API_AVAILABLE:
            self.api_clue_generator = APIClueGenerator(hf_token=hf_token)

    def initialize(self):
        """Initialize both generators.

        Idempotent: returns immediately once a previous call has completed.
        A failure of the thematic generator is logged rather than raised, and
        leaves `thematic_ready` False.
        """
        if self.is_initialized:
            return

        start_time = time.time()
        logger.info("πŸš€ Initializing Integrated Crossword Generator...")

        # Initialize thematic word generator
        if self.thematic_generator:
            logger.info("πŸ”„ Initializing thematic word generator...")
            try:
                self.thematic_generator.initialize()
                self.thematic_ready = True
                logger.info("βœ… Thematic word generator ready")
            except Exception as e:
                logger.error(f"❌ Failed to initialize thematic generator: {e}")

        # API clue generator is ready immediately (no initialization needed),
        # but only counts as ready when it holds a token.
        if self.api_clue_generator:
            if self.api_clue_generator.hf_token:
                self.api_ready = True
                logger.info("βœ… API clue generator ready")
            else:
                logger.warning("⚠️ API clue generator has no token - clue generation may fail")

        self.is_initialized = True
        init_time = time.time() - start_time
        logger.info(f"πŸŽ‰ Integrated generator initialized in {init_time:.2f}s")

        # Log capabilities
        capabilities = []
        if self.thematic_ready:
            vocab_size = self.thematic_generator.get_vocabulary_size()
            capabilities.append(f"Word Discovery ({vocab_size:,} words)")
        if self.api_ready:
            model_count = len(self.api_clue_generator.models)
            capabilities.append(f"API Clues ({model_count} models)")
        logger.info(f"πŸ’‘ Capabilities: {', '.join(capabilities) if capabilities else 'Limited (check dependencies)'}")

    async def initialize_async(self):
        """Async initialization for backend compatibility.

        NOTE(review): this simply calls the synchronous initialize(), so the
        event loop is blocked for the duration of the initialization.
        """
        return self.initialize()

    def generate_crossword_entries(self,
                                   topic: str,
                                   num_words: int = 15,
                                   difficulty: str = "medium",
                                   min_similarity: float = 0.3) -> "List[CrosswordEntry]":
        """Generate complete crossword entries for a topic.

        Args:
            topic: Topic or theme for word generation
            num_words: Number of words to generate
            difficulty: Difficulty level (easy/medium/hard)
            min_similarity: Minimum similarity threshold for word discovery

        Returns:
            List of complete CrosswordEntry objects with words, clues, and metadata,
            best first, containing at most `num_words` items. Empty when
            `num_words` is not positive or no words were discovered.
        """
        if not self.is_initialized:
            self.initialize()

        # A non-positive request can never yield entries; returning early also
        # avoids `entries[:num_words]` dropping items from the tail for
        # negative values.
        if num_words <= 0:
            return []

        start_time = time.time()
        logger.info(f"🎯 Generating {num_words} crossword entries for topic: '{topic}' (difficulty: {difficulty})")

        # Step 1: Discover thematic words
        words_with_metadata = self._discover_words(topic, num_words, difficulty, min_similarity)
        if not words_with_metadata:
            logger.warning(f"⚠️ No words discovered for topic '{topic}'")
            return []
        logger.info(f"βœ… Discovered {len(words_with_metadata)} words")

        # Step 2: Generate clues for discovered words
        crossword_entries = self._generate_clues_for_words(words_with_metadata, topic)

        # Step 3: Sort by quality and similarity
        self._sort_by_quality(crossword_entries)

        total_time = time.time() - start_time
        self.stats['total_time'] += total_time
        logger.info(f"πŸŽ‰ Generated {len(crossword_entries)} complete crossword entries in {total_time:.2f}s")
        return crossword_entries[:num_words]  # Return requested number

    def _quality_rank(self, quality: str) -> int:
        """Return the position of a quality label in _QUALITY_ORDER, or -1 if unranked."""
        try:
            return self._QUALITY_ORDER.index(quality)
        except ValueError:
            return -1

    def _sort_by_quality(self, entries: list) -> None:
        """Sort entries in place: best clue quality first, then highest similarity.

        Every quality level is ranked, so a GOOD clue always precedes a POOR
        one regardless of their similarity scores.
        """
        entries.sort(
            key=lambda entry: (self._quality_rank(entry.clue_quality), entry.similarity_score),
            reverse=True,
        )

    def _discover_words(self,
                        topic: str,
                        num_words: int,
                        difficulty: str,
                        min_similarity: float) -> List[Tuple[str, float, str]]:
        """Discover thematic words using the thematic generator.

        Returns (word, adjusted_similarity, tier) tuples, or an empty list when
        the thematic generator is not ready or discovery raises.
        """
        if not self.thematic_ready:
            logger.error("❌ Thematic word generator not ready - cannot discover words")
            return []

        try:
            # Map difficulty to word count multiplier (get extra words for better selection)
            word_multipliers = {"easy": 2, "medium": 2.5, "hard": 3}
            multiplier = word_multipliers.get(difficulty, 2.5)
            discover_count = int(num_words * multiplier)
            logger.info(f"πŸ” Discovering {discover_count} candidate words for '{topic}'...")

            # Use thematic generator with difficulty mapping
            results = self.thematic_generator.generate_thematic_words(
                inputs=topic,
                num_words=discover_count,
                min_similarity=min_similarity,
                multi_theme=False  # Single topic for focused results
            )

            # Filter by difficulty if needed
            filtered_results = self._filter_by_difficulty(results, difficulty)
            self.stats['words_discovered'] += len(filtered_results)
            return filtered_results
        except Exception as e:
            logger.error(f"❌ Word discovery failed: {e}")
            return []

    def _filter_by_difficulty(self,
                              results: List[Tuple[str, float, str]],
                              difficulty: str) -> List[Tuple[str, float, str]]:
        """Filter words by difficulty level using frequency tiers and length.

        Words outside the difficulty's length range are dropped. Words whose
        tier is not preferred are kept but have their similarity scaled by 0.8.
        Unknown difficulty values use the "medium" settings. The result is
        sorted by adjusted similarity, highest first.
        """
        # Define difficulty criteria
        difficulty_config = {
            "easy": {
                "preferred_tiers": ["tier_2_extremely_common", "tier_3_very_common", "tier_4_highly_common"],
                "min_length": 3,
                "max_length": 6
            },
            "medium": {
                "preferred_tiers": ["tier_4_highly_common", "tier_5_common", "tier_6_moderately_common"],
                "min_length": 4,
                "max_length": 10
            },
            "hard": {
                "preferred_tiers": ["tier_6_moderately_common", "tier_7_somewhat_uncommon", "tier_8_uncommon"],
                "min_length": 5,
                "max_length": 15
            }
        }
        config = difficulty_config.get(difficulty, difficulty_config["medium"])

        # Apply filters
        filtered = []
        for word, similarity, tier in results:
            # Length filter
            if not (config["min_length"] <= len(word) <= config["max_length"]):
                continue
            # Tier preference (but don't exclude entirely - just prefer)
            tier_score = 1.0 if tier in config["preferred_tiers"] else 0.8
            adjusted_similarity = similarity * tier_score
            filtered.append((word, adjusted_similarity, tier))

        # Sort by adjusted similarity
        filtered.sort(key=lambda x: x[1], reverse=True)
        return filtered

    def _generate_clues_for_words(self,
                                  words_with_metadata: List[Tuple[str, float, str]],
                                  topic: str) -> "List[CrosswordEntry]":
        """Generate clues for discovered words using API generator.

        Falls back to template clues when the API generator is not ready.
        Words for which no model yields a clue rated above "FAILED" are
        skipped; per-word exceptions are logged and do not abort the batch.
        """
        if not self.api_ready:
            logger.error("❌ API clue generator not ready - using basic clues")
            return self._generate_basic_clues(words_with_metadata, topic)

        logger.info(f"πŸ€– Generating API clues for {len(words_with_metadata)} words...")
        crossword_entries = []
        for word, similarity, tier in words_with_metadata:
            try:
                # Generate clue using API
                clue_results = self.api_clue_generator.generate_clue(word, topic)

                # Find best clue from all models
                best_clue = None
                best_quality = "FAILED"
                best_model = "none"
                for model_key, clue in clue_results.items():
                    if clue:
                        quality, _ = self.api_clue_generator.evaluate_clue_quality(word, clue)
                        if self._is_better_quality(quality, best_quality):
                            best_clue = clue
                            best_quality = quality
                            best_model = model_key

                # NOTE(review): only non-empty results are counted here, so
                # this is the number of clues returned rather than requests
                # attempted - confirm which one the stat is meant to track.
                self.stats['api_calls'] += sum(1 for c in clue_results.values() if c)

                # Create crossword entry
                if best_clue:
                    tier_desc = self._get_tier_description(tier)
                    entry = CrosswordEntry(
                        word=word.upper(),  # Crosswords typically use uppercase
                        clue=best_clue,
                        topic=topic,
                        similarity_score=similarity,
                        frequency_tier=tier,
                        tier_description=tier_desc,
                        clue_quality=best_quality,
                        clue_model=best_model
                    )
                    crossword_entries.append(entry)
                    self.stats['clues_generated'] += 1
                else:
                    logger.warning(f"⚠️ No valid clue generated for '{word}'")
            except Exception as e:
                logger.error(f"❌ Failed to generate clue for '{word}': {e}")
        return crossword_entries

    def _generate_basic_clues(self,
                              words_with_metadata: List[Tuple[str, float, str]],
                              topic: str) -> "List[CrosswordEntry]":
        """Generate basic fallback clues when API is not available.

        Every word gets the same template clue, quality "BASIC" and model
        "template".
        """
        logger.info(f"πŸ”„ Generating basic fallback clues for {len(words_with_metadata)} words...")
        crossword_entries = []
        for word, similarity, tier in words_with_metadata:
            # Simple template-based clue
            clue = f"Term related to {topic.lower()}"
            tier_desc = self._get_tier_description(tier)
            entry = CrosswordEntry(
                word=word.upper(),
                clue=clue,
                topic=topic,
                similarity_score=similarity,
                frequency_tier=tier,
                tier_description=tier_desc,
                clue_quality="BASIC",
                clue_model="template"
            )
            crossword_entries.append(entry)
            self.stats['clues_generated'] += 1
        return crossword_entries

    def _is_better_quality(self, quality1: str, quality2: str) -> bool:
        """Compare clue quality levels.

        Returns True only when both labels are ranked and `quality1` ranks
        strictly above `quality2`; any unranked label yields False.
        """
        quality_order = self._QUALITY_ORDER
        try:
            return quality_order.index(quality1) > quality_order.index(quality2)
        except ValueError:
            return False

    def _get_tier_description(self, tier: str) -> str:
        """Get human-readable tier description, falling back to the tier id itself."""
        if self.thematic_ready and hasattr(self.thematic_generator, 'tier_descriptions'):
            return self.thematic_generator.tier_descriptions.get(tier, tier)
        return tier

    def generate_by_multiple_topics(self,
                                    topics: List[str],
                                    words_per_topic: int = 10,
                                    difficulty: str = "medium") -> "Dict[str, List[CrosswordEntry]]":
        """Generate crossword entries for multiple topics.

        Args:
            topics: List of topics to generate words for
            words_per_topic: Number of words per topic
            difficulty: Difficulty level

        Returns:
            Dictionary mapping topics to their crossword entries
        """
        logger.info(f"🎯 Generating crossword entries for {len(topics)} topics")
        results = {}
        for topic in topics:
            logger.info(f"πŸ“ Processing topic: '{topic}'")
            entries = self.generate_crossword_entries(
                topic=topic,
                num_words=words_per_topic,
                difficulty=difficulty
            )
            results[topic] = entries
        return results

    def get_stats(self) -> Dict[str, Any]:
        """Get performance statistics merged with the current readiness flags."""
        return {
            **self.stats,
            'thematic_ready': self.thematic_ready,
            'api_ready': self.api_ready,
            'is_initialized': self.is_initialized,
            'vocab_size': self.thematic_generator.get_vocabulary_size() if self.thematic_ready else 0,
            'api_models': len(self.api_clue_generator.models) if self.api_ready else 0
        }

    def get_system_info(self) -> Dict[str, Any]:
        """Get comprehensive system information (per-component status plus stats)."""
        info = {
            'system': 'IntegratedCrosswordGenerator',
            'components': {
                'thematic_generator': {
                    'available': THEMATIC_AVAILABLE,
                    'ready': self.thematic_ready,
                    'vocab_size': self.thematic_generator.get_vocabulary_size() if self.thematic_ready else 0
                },
                'api_clue_generator': {
                    'available': API_AVAILABLE,
                    'ready': self.api_ready,
                    'models': list(self.api_clue_generator.models.keys()) if self.api_ready else []
                }
            },
            'stats': self.get_stats()
        }
        return info
def main():
    """Demo the integrated crossword generator.

    Requires the HF_TOKEN environment variable; prints a hint and returns
    without doing anything else when it is missing. Otherwise initializes the
    generator, prints per-topic entries for a few demo topics, and finishes
    with the accumulated statistics.
    """
    print("πŸš€ Integrated Crossword Generator Demo")
    print("=" * 60)

    # Check if required token is available
    hf_token = os.getenv('HF_TOKEN')
    if not hf_token:
        print("❌ HF_TOKEN environment variable not set")
        print("Set your token: export HF_TOKEN='your_token_here'")
        return

    print("πŸ”„ Initializing integrated system...")
    # Hand the validated token to the generator explicitly instead of
    # discarding it after the check above.
    generator = IntegratedCrosswordGenerator(
        vocab_size_limit=50000,  # Smaller for demo
        hf_token=hf_token,
    )
    generator.initialize()

    # Show system info
    system_info = generator.get_system_info()
    print("\nπŸ“Š System Status:")
    for component, info in system_info['components'].items():
        status = "βœ… Ready" if info['ready'] else "❌ Not Ready"
        print(f" {component}: {status}")

    if not (generator.thematic_ready and generator.api_ready):
        print("\n⚠️ System not fully ready - some features may be limited")
        print("Continuing with demo using available components...")

    # Demo topics
    demo_topics = ["animals", "technology", "music"]
    print(f"\n🎯 Generating crossword entries for {len(demo_topics)} topics")
    print("=" * 60)

    # Marker shown next to each clue; built once, it does not vary per entry.
    quality_icons = {
        "EXCELLENT": "πŸ†",
        "GOOD": "βœ…",
        "ACCEPTABLE": "πŸ”„",
        "POOR": "❌",
        "BASIC": "πŸ“"
    }

    for topic in demo_topics:
        print(f"\nπŸ“ Topic: '{topic.upper()}'")
        print("-" * 40)
        try:
            start_time = time.time()
            entries = generator.generate_crossword_entries(
                topic=topic,
                num_words=5,  # Small number for demo
                difficulty="medium"
            )
            generation_time = time.time() - start_time

            if entries:
                print(f"⏱️ Generated {len(entries)} entries in {generation_time:.2f}s")
                print()
                for i, entry in enumerate(entries, 1):
                    quality_icon = quality_icons.get(entry.clue_quality, "?")
                    print(f" {i}. {entry.word:<12} | {quality_icon} {entry.clue}")
                    print(f" Similarity: {entry.similarity_score:.3f} | {entry.tier_description}")
                    print(f" Model: {entry.clue_model}")
                    print()
            else:
                print("❌ No entries generated")
        except Exception as e:
            # Keep the demo going for the remaining topics.
            print(f"❌ Error generating entries for '{topic}': {e}")

    # Show final stats
    print("=" * 60)
    print("πŸ“Š FINAL STATISTICS")
    print("=" * 60)
    stats = generator.get_stats()
    print(f"Words discovered: {stats['words_discovered']}")
    print(f"Clues generated: {stats['clues_generated']}")
    print(f"API calls made: {stats['api_calls']}")
    print(f"Total time: {stats['total_time']:.2f}s")

    print("\nβœ… Integrated crossword generator demo complete!")
    print("\nπŸ’‘ This system combines:")
    print(" πŸ” Smart word discovery (100K+ vocabulary, semantic analysis)")
    print(" πŸ€– High-quality clue generation (multiple AI models)")
    print(" πŸ“Š Difficulty control (frequency tiers)")
    print(" 🎯 Topic-focused generation")


if __name__ == "__main__":
    main()