|
|
|
""" |
|
Integrated Crossword Generator |
|
Combines thematic word discovery with API-based clue generation for complete crossword creation. |
|
|
|
This system integrates: |
|
- UnifiedThematicWordGenerator: Smart word discovery using semantic embeddings |
|
- APIClueGenerator: High-quality clue generation using multiple AI models |
|
|
|
Creates a complete crossword generation pipeline with both intelligent word selection |
|
and professional-quality clues. |
|
""" |
|
|
|
import sys |
|
import os |
|
import time |
|
import logging |
|
import asyncio |
|
from typing import List, Dict, Optional, Tuple, Any |
|
from pathlib import Path |
|
from dataclasses import dataclass |
|
|
|
|
|
# Put this file's directory first on sys.path so the sibling generator
# modules can be imported no matter where the script is launched from.
sys.path.insert(0, str(Path(__file__).parent))

# Both generators are optional dependencies. A failed import is reported on
# stdout and recorded in a module-level flag; the rest of the module checks
# the flag instead of failing at import time.
try:
    from thematic_word_generator import UnifiedThematicWordGenerator
    THEMATIC_AVAILABLE = True
except ImportError as e:
    print(f"❌ Thematic generator import error: {e}")
    THEMATIC_AVAILABLE = False

try:
    from api_clue_generator import APIClueGenerator
    API_AVAILABLE = True
except ImportError as e:
    print(f"❌ API generator import error: {e}")
    API_AVAILABLE = False

# Root logging configuration applied at import time, followed by the
# module-level logger used throughout this file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|
|
|
|
@dataclass
class CrosswordEntry:
    """One finished crossword entry: the answer word, its clue, and metadata.

    Instances are built by IntegratedCrosswordGenerator once a clue has been
    chosen for a discovered word.
    """

    # Answer word (the generator stores it upper-cased).
    word: str
    # Clue text shown to the solver.
    clue: str
    # Topic the word was discovered for.
    topic: str
    # Similarity score carried over from word discovery.
    similarity_score: float
    # Frequency tier identifier reported for the word.
    frequency_tier: str
    # Human-readable description of the frequency tier.
    tier_description: str
    # Quality label of the chosen clue (e.g. "EXCELLENT", "GOOD", "BASIC").
    clue_quality: str
    # Key of the model that produced the clue ("template" for fallback clues).
    clue_model: str
|
|
|
|
|
class IntegratedCrosswordGenerator:
    """Pipeline from a topic string to finished crossword entries.

    Word discovery is delegated to a UnifiedThematicWordGenerator and clue
    writing to an APIClueGenerator. Either component may be missing (failed
    import) or not ready (failed initialization / no API token); the class
    then degrades to returning no words or to template clues respectively.
    """

    # Clue quality labels ordered worst -> best. A label outside this tuple
    # (for example "BASIC") never compares as better than anything.
    _QUALITY_ORDER = ("FAILED", "POOR", "ACCEPTABLE", "GOOD", "EXCELLENT")

    # Difficulty used whenever the requested one is not recognised.
    _DEFAULT_DIFFICULTY = "medium"

    # How many candidates to request per wanted word; the surplus leaves
    # room for the length filter and for words that end up without a clue.
    _CANDIDATE_MULTIPLIERS = {"easy": 2, "medium": 2.5, "hard": 3}

    # Similarity multiplier for words whose tier is not preferred.
    _NON_PREFERRED_TIER_WEIGHT = 0.8

    # Per-difficulty word-length window and preferred frequency tiers.
    _DIFFICULTY_CONFIG = {
        "easy": {
            "preferred_tiers": ["tier_2_extremely_common", "tier_3_very_common", "tier_4_highly_common"],
            "min_length": 3,
            "max_length": 6,
        },
        "medium": {
            "preferred_tiers": ["tier_4_highly_common", "tier_5_common", "tier_6_moderately_common"],
            "min_length": 4,
            "max_length": 10,
        },
        "hard": {
            "preferred_tiers": ["tier_6_moderately_common", "tier_7_somewhat_uncommon", "tier_8_uncommon"],
            "min_length": 5,
            "max_length": 15,
        },
    }

    def __init__(self,
                 cache_dir: Optional[str] = None,
                 vocab_size_limit: Optional[int] = None,
                 hf_token: Optional[str] = None):
        """Create the generator and construct whichever components imported.

        Args:
            cache_dir: Directory for caching models and embeddings. When
                omitted, ``self.cache_dir`` defaults to ``model_cache`` next
                to this file.
            vocab_size_limit: Maximum vocabulary size for the thematic generator.
            hf_token: Hugging Face API token handed to the clue generator.
        """
        self.cache_dir = cache_dir or os.path.join(os.path.dirname(__file__), 'model_cache')
        self.vocab_size_limit = vocab_size_limit

        # Components stay None when their module could not be imported.
        self.thematic_generator: Optional["UnifiedThematicWordGenerator"] = None
        self.api_clue_generator: Optional["APIClueGenerator"] = None

        # Readiness flags; set by initialize().
        self.is_initialized = False
        self.thematic_ready = False
        self.api_ready = False

        # Running counters exposed through get_stats().
        self.stats = {
            'words_discovered': 0,
            'clues_generated': 0,
            'api_calls': 0,
            'cache_hits': 0,
            'total_time': 0.0
        }

        if not THEMATIC_AVAILABLE:
            logger.error("❌ UnifiedThematicWordGenerator not available - word discovery disabled")
        if not API_AVAILABLE:
            logger.error("❌ APIClueGenerator not available - API clue generation disabled")

        if THEMATIC_AVAILABLE:
            # NOTE(review): the caller's raw cache_dir (possibly None) is
            # forwarded here, not the defaulted self.cache_dir - confirm the
            # thematic generator applies an equivalent default.
            self.thematic_generator = UnifiedThematicWordGenerator(
                cache_dir=cache_dir,
                vocab_size_limit=vocab_size_limit
            )

        if API_AVAILABLE:
            self.api_clue_generator = APIClueGenerator(hf_token=hf_token)

    def initialize(self):
        """Initialize both generators; a no-op after the first call.

        A failure while initializing the thematic generator is logged and
        leaves ``thematic_ready`` False. The clue generator counts as ready
        only when it carries a token. ``is_initialized`` is set in every
        case, so a failed component is not retried on later calls.
        """
        if self.is_initialized:
            return

        start_time = time.time()
        logger.info("🚀 Initializing Integrated Crossword Generator...")

        if self.thematic_generator:
            logger.info("🔄 Initializing thematic word generator...")
            try:
                self.thematic_generator.initialize()
                self.thematic_ready = True
                logger.info("✅ Thematic word generator ready")
            except Exception as e:
                logger.error(f"❌ Failed to initialize thematic generator: {e}")

        if self.api_clue_generator:
            if self.api_clue_generator.hf_token:
                self.api_ready = True
                logger.info("✅ API clue generator ready")
            else:
                logger.warning("⚠️ API clue generator has no token - clue generation may fail")

        self.is_initialized = True
        init_time = time.time() - start_time
        logger.info(f"🎉 Integrated generator initialized in {init_time:.2f}s")

        # Summarise what is actually usable after initialization.
        capabilities = []
        if self.thematic_ready:
            vocab_size = self.thematic_generator.get_vocabulary_size()
            capabilities.append(f"Word Discovery ({vocab_size:,} words)")
        if self.api_ready:
            model_count = len(self.api_clue_generator.models)
            capabilities.append(f"API Clues ({model_count} models)")

        summary = ', '.join(capabilities) if capabilities else 'Limited (check dependencies)'
        logger.info(f"💡 Capabilities: {summary}")

    async def initialize_async(self):
        """Awaitable wrapper around initialize() for async backends.

        NOTE: initialize() runs inline, so the awaiting event loop is
        blocked until it completes.
        """
        return self.initialize()

    def generate_crossword_entries(self,
                                   topic: str,
                                   num_words: int = 15,
                                   difficulty: str = "medium",
                                   min_similarity: float = 0.3) -> List["CrosswordEntry"]:
        """Generate complete crossword entries for a topic.

        Args:
            topic: Topic or theme for word generation.
            num_words: Maximum number of entries to return. A value of zero
                or less yields an empty list.
            difficulty: Difficulty level (easy/medium/hard, case-insensitive;
                anything else is treated as medium).
            min_similarity: Minimum similarity threshold for word discovery.

        Returns:
            Up to ``num_words`` CrosswordEntry objects, entries with an
            "EXCELLENT" clue first and higher similarity first within each
            group. Empty when no words were discovered.
        """
        if not self.is_initialized:
            self.initialize()

        # Nothing was asked for; also keeps the final slice from being
        # handed a negative bound, which would drop entries from the end.
        if num_words <= 0:
            return []

        start_time = time.time()
        logger.info(f"🎯 Generating {num_words} crossword entries for topic: '{topic}' (difficulty: {difficulty})")

        words_with_metadata = self._discover_words(topic, num_words, difficulty, min_similarity)
        if not words_with_metadata:
            logger.warning(f"⚠️ No words discovered for topic '{topic}'")
            return []

        logger.info(f"✅ Discovered {len(words_with_metadata)} words")

        crossword_entries = self._generate_clues_for_words(words_with_metadata, topic)

        # Excellent clues first, then by descending similarity.
        crossword_entries.sort(key=lambda x: (x.clue_quality == "EXCELLENT", x.similarity_score), reverse=True)

        total_time = time.time() - start_time
        self.stats['total_time'] += total_time

        logger.info(f"🎉 Generated {len(crossword_entries)} complete crossword entries in {total_time:.2f}s")

        return crossword_entries[:num_words]

    @staticmethod
    def _normalize_difficulty(difficulty):
        """Return a trimmed, lower-cased difficulty key; non-strings pass through."""
        if isinstance(difficulty, str):
            return difficulty.strip().lower()
        return difficulty

    def _discover_words(self,
                        topic: str,
                        num_words: int,
                        difficulty: str,
                        min_similarity: float) -> List[Tuple[str, float, str]]:
        """Ask the thematic generator for candidates and filter them.

        More candidates than ``num_words`` are requested (scaled by the
        difficulty multiplier). Any exception during discovery is logged and
        turned into an empty result.

        Returns:
            ``(word, adjusted_similarity, tier)`` tuples, best first; empty
            when the thematic generator is not ready or discovery failed.
        """
        if not self.thematic_ready:
            logger.error("❌ Thematic word generator not ready - cannot discover words")
            return []

        try:
            multiplier = self._CANDIDATE_MULTIPLIERS.get(
                self._normalize_difficulty(difficulty),
                self._CANDIDATE_MULTIPLIERS[self._DEFAULT_DIFFICULTY],
            )
            discover_count = int(num_words * multiplier)

            logger.info(f"🔍 Discovering {discover_count} candidate words for '{topic}'...")

            results = self.thematic_generator.generate_thematic_words(
                inputs=topic,
                num_words=discover_count,
                min_similarity=min_similarity,
                multi_theme=False
            )

            filtered_results = self._filter_by_difficulty(results, difficulty)

            self.stats['words_discovered'] += len(filtered_results)
            return filtered_results

        except Exception as e:
            logger.error(f"❌ Word discovery failed: {e}")
            return []

    def _filter_by_difficulty(self,
                              results: List[Tuple[str, float, str]],
                              difficulty: str) -> List[Tuple[str, float, str]]:
        """Filter words by length and re-rank them by frequency tier.

        Words outside the difficulty's length window are dropped. The
        similarity of words in a non-preferred tier is scaled by 0.8, then
        the survivors are sorted by the adjusted similarity, highest first.
        Difficulty is matched case-insensitively; unknown values use the
        medium settings.
        """
        config = self._DIFFICULTY_CONFIG.get(
            self._normalize_difficulty(difficulty),
            self._DIFFICULTY_CONFIG[self._DEFAULT_DIFFICULTY],
        )
        shortest, longest = config["min_length"], config["max_length"]
        preferred = config["preferred_tiers"]
        penalty = self._NON_PREFERRED_TIER_WEIGHT

        filtered = [
            (word, similarity * (1.0 if tier in preferred else penalty), tier)
            for word, similarity, tier in results
            if shortest <= len(word) <= longest
        ]

        filtered.sort(key=lambda item: item[1], reverse=True)
        return filtered

    def _select_best_clue(self, word: str, clue_results: Dict[str, Any]) -> Tuple[Optional[str], str, str]:
        """Pick the highest-quality non-empty clue among the model outputs.

        Returns:
            ``(clue, quality, model_key)``; ``(None, "FAILED", "none")`` when
            no candidate rates above "FAILED". On equal quality the earlier
            candidate wins.
        """
        best_clue, best_quality, best_model = None, "FAILED", "none"
        for model_key, clue in clue_results.items():
            if not clue:
                continue
            # Only the quality label takes part in the comparison.
            quality, _score = self.api_clue_generator.evaluate_clue_quality(word, clue)
            if self._is_better_quality(quality, best_quality):
                best_clue, best_quality, best_model = clue, quality, model_key
        return best_clue, best_quality, best_model

    def _generate_clues_for_words(self,
                                  words_with_metadata: List[Tuple[str, float, str]],
                                  topic: str) -> List["CrosswordEntry"]:
        """Generate one clue per discovered word using the API generator.

        Falls back to template clues when the API generator is not ready.
        Words for which no usable clue is produced, or whose processing
        raises, are logged and skipped.
        """
        if not self.api_ready:
            logger.error("❌ API clue generator not ready - using basic clues")
            return self._generate_basic_clues(words_with_metadata, topic)

        logger.info(f"🤖 Generating API clues for {len(words_with_metadata)} words...")

        crossword_entries = []

        for word, similarity, tier in words_with_metadata:
            try:
                clue_results = self.api_clue_generator.generate_clue(word, topic)
                best_clue, best_quality, best_model = self._select_best_clue(word, clue_results)

                # Counts the non-empty clues that came back for this word.
                self.stats['api_calls'] += sum(1 for clue in clue_results.values() if clue)

                if not best_clue:
                    logger.warning(f"⚠️ No valid clue generated for '{word}'")
                    continue

                crossword_entries.append(CrosswordEntry(
                    word=word.upper(),
                    clue=best_clue,
                    topic=topic,
                    similarity_score=similarity,
                    frequency_tier=tier,
                    tier_description=self._get_tier_description(tier),
                    clue_quality=best_quality,
                    clue_model=best_model
                ))
                self.stats['clues_generated'] += 1

            except Exception as e:
                logger.error(f"❌ Failed to generate clue for '{word}': {e}")

        return crossword_entries

    def _generate_basic_clues(self,
                              words_with_metadata: List[Tuple[str, float, str]],
                              topic: str) -> List["CrosswordEntry"]:
        """Build template clues ("Term related to <topic>") without the API."""
        logger.info(f"📝 Generating basic fallback clues for {len(words_with_metadata)} words...")

        crossword_entries = []
        for word, similarity, tier in words_with_metadata:
            clue = f"Term related to {topic.lower()}"
            tier_desc = self._get_tier_description(tier)

            entry = CrosswordEntry(
                word=word.upper(),
                clue=clue,
                topic=topic,
                similarity_score=similarity,
                frequency_tier=tier,
                tier_description=tier_desc,
                clue_quality="BASIC",
                clue_model="template"
            )
            crossword_entries.append(entry)
            self.stats['clues_generated'] += 1

        return crossword_entries

    def _is_better_quality(self, quality1: str, quality2: str) -> bool:
        """Return True when ``quality1`` ranks strictly above ``quality2``.

        Either label being unknown makes the comparison False.
        """
        order = self._QUALITY_ORDER
        try:
            return order.index(quality1) > order.index(quality2)
        except ValueError:
            return False

    def _get_tier_description(self, tier: str) -> str:
        """Return the thematic generator's description of ``tier``.

        Falls back to the tier identifier itself when the generator is not
        ready, has no ``tier_descriptions``, or does not know the tier.
        """
        if self.thematic_ready and hasattr(self.thematic_generator, 'tier_descriptions'):
            return self.thematic_generator.tier_descriptions.get(tier, tier)
        return tier

    def generate_by_multiple_topics(self,
                                    topics: List[str],
                                    words_per_topic: int = 10,
                                    difficulty: str = "medium") -> Dict[str, List["CrosswordEntry"]]:
        """Generate crossword entries for several topics.

        Args:
            topics: Topics to generate words for.
            words_per_topic: Number of words per topic.
            difficulty: Difficulty level applied to every topic.

        Returns:
            Dictionary mapping each topic to its crossword entries. A topic
            listed more than once keeps the result of its last run.
        """
        logger.info(f"🎯 Generating crossword entries for {len(topics)} topics")

        results = {}
        for topic in topics:
            logger.info(f"📝 Processing topic: '{topic}'")
            results[topic] = self.generate_crossword_entries(
                topic=topic,
                num_words=words_per_topic,
                difficulty=difficulty
            )

        return results

    def get_stats(self) -> Dict[str, Any]:
        """Return the running counters plus readiness flags and sizes."""
        return {
            **self.stats,
            'thematic_ready': self.thematic_ready,
            'api_ready': self.api_ready,
            'is_initialized': self.is_initialized,
            'vocab_size': self.thematic_generator.get_vocabulary_size() if self.thematic_ready else 0,
            'api_models': len(self.api_clue_generator.models) if self.api_ready else 0
        }

    def get_system_info(self) -> Dict[str, Any]:
        """Return per-component availability/readiness and the stats."""
        thematic_info = {
            'available': THEMATIC_AVAILABLE,
            'ready': self.thematic_ready,
            'vocab_size': self.thematic_generator.get_vocabulary_size() if self.thematic_ready else 0
        }
        api_info = {
            'available': API_AVAILABLE,
            'ready': self.api_ready,
            'models': list(self.api_clue_generator.models.keys()) if self.api_ready else []
        }

        return {
            'system': 'IntegratedCrosswordGenerator',
            'components': {
                'thematic_generator': thematic_info,
                'api_clue_generator': api_info
            },
            'stats': self.get_stats()
        }
|
|
|
|
|
def main():
    """Run a console demo of the integrated crossword generator.

    Reads the Hugging Face token from the ``HF_TOKEN`` environment variable
    and returns early with a hint when it is not set. Otherwise it builds a
    generator with that token, prints component status, generates five
    medium-difficulty entries for each demo topic, and prints the final
    statistics. Errors for a single topic are printed and do not stop the
    remaining topics.
    """
    print("🚀 Integrated Crossword Generator Demo")
    print("=" * 60)

    hf_token = os.getenv('HF_TOKEN')
    if not hf_token:
        print("❌ HF_TOKEN environment variable not set")
        print("Set your token: export HF_TOKEN='your_token_here'")
        return

    print("🔄 Initializing integrated system...")
    # Hand the validated token to the generator explicitly; without it the
    # token checked above never reaches the clue generator from here.
    generator = IntegratedCrosswordGenerator(vocab_size_limit=50000, hf_token=hf_token)
    generator.initialize()

    system_info = generator.get_system_info()
    print("\n📊 System Status:")
    for component, info in system_info['components'].items():
        status = "✅ Ready" if info['ready'] else "❌ Not Ready"
        print(f" {component}: {status}")

    if not (generator.thematic_ready and generator.api_ready):
        print("\n⚠️ System not fully ready - some features may be limited")
        print("Continuing with demo using available components...")

    demo_topics = ["animals", "technology", "music"]

    # Display marker per clue quality label; unknown labels print "?".
    quality_icons = {
        "EXCELLENT": "🌟",
        "GOOD": "✅",
        "ACCEPTABLE": "👍",
        "POOR": "❌",
        "BASIC": "📝"
    }

    print(f"\n🎯 Generating crossword entries for {len(demo_topics)} topics")
    print("=" * 60)

    for topic in demo_topics:
        print(f"\n📝 Topic: '{topic.upper()}'")
        print("-" * 40)

        try:
            start_time = time.time()
            entries = generator.generate_crossword_entries(
                topic=topic,
                num_words=5,
                difficulty="medium"
            )
            generation_time = time.time() - start_time

            if entries:
                print(f"⏱️ Generated {len(entries)} entries in {generation_time:.2f}s")
                print()

                for i, entry in enumerate(entries, 1):
                    quality_icon = quality_icons.get(entry.clue_quality, "?")

                    print(f" {i}. {entry.word:<12} | {quality_icon} {entry.clue}")
                    print(f" Similarity: {entry.similarity_score:.3f} | {entry.tier_description}")
                    print(f" Model: {entry.clue_model}")
                    print()
            else:
                print("❌ No entries generated")

        except Exception as e:
            print(f"❌ Error generating entries for '{topic}': {e}")

    print("=" * 60)
    print("📊 FINAL STATISTICS")
    print("=" * 60)
    stats = generator.get_stats()
    print(f"Words discovered: {stats['words_discovered']}")
    print(f"Clues generated: {stats['clues_generated']}")
    print(f"API calls made: {stats['api_calls']}")
    print(f"Total time: {stats['total_time']:.2f}s")

    print("\n✅ Integrated crossword generator demo complete!")
    print("\n💡 This system combines:")
    print(" 🔍 Smart word discovery (100K+ vocabulary, semantic analysis)")
    print(" 🤖 High-quality clue generation (multiple AI models)")
    print(" 📊 Difficulty control (frequency tiers)")
    print(" 🎯 Topic-focused generation")
|
|
|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()