abc123 / hack /test_clue_generation.py
vimalk78's picture
feat(crossword): generated crosswords with clues
486eff6
raw
history blame
24.4 kB
#!/usr/bin/env python3
"""
Test Script: Thematic Word Generation + LLM Clue Generation
Integrates the existing thematic_word_generator.py with the new llm_clue_generator.py
to create a complete word-to-clue pipeline for crossword puzzles.
Tests various scenarios:
- Single topics
- Multiple topics
- Custom sentences
- Different difficulties
- Performance analysis
"""
import os
import sys
import time
import logging
from typing import List, Dict, Tuple, Any
from pathlib import Path
# Make sibling modules in this script's directory importable regardless of
# the current working directory.
sys.path.insert(0, str(Path(__file__).parent))

# The generators are optional at import time: when either module is missing
# the script still loads, and GENERATORS_AVAILABLE tells callers to bail out.
try:
    from thematic_word_generator import UnifiedThematicWordGenerator
    from llm_clue_generator import LLMClueGenerator
except ImportError as import_err:
    print(f"❌ Import error: {import_err}")
    print("Make sure thematic_word_generator.py and llm_clue_generator.py are in the same directory")
    GENERATORS_AVAILABLE = False
else:
    GENERATORS_AVAILABLE = True
# Configure logging for the script: INFO and above, each record stamped with
# time, logger name and source line.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by the test suite for error reporting.
logger = logging.getLogger(__name__)
class CrosswordClueTestSuite:
    """
    Test suite for integrated thematic word generation + LLM clue generation.

    Each ``run_*_test`` method asks the word generator for theme words, asks the
    clue generator for one clue per word, prints a report and returns a result
    dictionary. ``run_full_test_suite`` runs every scenario and stores the
    results in ``self.test_results`` keyed by test name.
    """

    def __init__(self, vocab_size_limit: int = 50000):
        """Initialize the test suite.

        Args:
            vocab_size_limit: Vocabulary size for thematic generator (smaller for faster testing)

        Raises:
            ImportError: If the generator modules could not be imported.
        """
        if not GENERATORS_AVAILABLE:
            raise ImportError("Required generators not available")
        self.vocab_size_limit = vocab_size_limit
        # Both generators are created later, by initialize().
        self.word_generator = None
        self.clue_generator = None
        # Test results storage
        self.test_results = {}
        self.performance_stats = {}

    def initialize(self):
        """Create and initialize both generators, recording how long each took.

        Timings are stored in ``self.performance_stats`` under
        ``word_gen_init_time``, ``clue_gen_init_time`` and ``total_init_time``.
        """
        print("🚀 Initializing Crossword Clue Test Suite")
        print("=" * 60)

        # Initialize thematic word generator
        print("\n🔄 Initializing thematic word generator...")
        start_time = time.time()
        self.word_generator = UnifiedThematicWordGenerator(
            vocab_size_limit=self.vocab_size_limit
        )
        self.word_generator.initialize()
        word_gen_time = time.time() - start_time
        print(f"✅ Word generator ready in {word_gen_time:.2f}s")

        # Initialize LLM clue generator
        print("\n🔄 Initializing LLM clue generator...")
        start_time = time.time()
        self.clue_generator = LLMClueGenerator()
        self.clue_generator.initialize()
        clue_gen_time = time.time() - start_time
        print(f"✅ Clue generator ready in {clue_gen_time:.2f}s")

        # Store initialization stats
        self.performance_stats['word_gen_init_time'] = word_gen_time
        self.performance_stats['clue_gen_init_time'] = clue_gen_time
        self.performance_stats['total_init_time'] = word_gen_time + clue_gen_time
        print(f"\n✅ Test suite initialized in {word_gen_time + clue_gen_time:.2f}s")

    # ------------------------------------------------------------------
    # Private helpers shared by the individual test scenarios
    # ------------------------------------------------------------------

    @staticmethod
    def _avg_clue_time(clue_gen_time: float, word_clue_pairs: List[Dict[str, Any]]) -> float:
        """Return the mean clue time per pair, or 0 when there are no pairs."""
        return clue_gen_time / len(word_clue_pairs) if word_clue_pairs else 0

    @staticmethod
    def _summarize(samples: List[float]) -> Dict[str, float]:
        """Return the avg/min/max of a non-empty list of timings."""
        return {
            "avg": sum(samples) / len(samples),
            "min": min(samples),
            "max": max(samples),
        }

    @staticmethod
    def _print_stats(title: str, stats: Dict[str, float]) -> None:
        """Print one avg/min/max section of the performance analysis."""
        print(title)
        print(f" Average: {stats['avg']:.2f}s")
        print(f" Min: {stats['min']:.2f}s")
        print(f" Max: {stats['max']:.2f}s")

    @staticmethod
    def _print_banner(title: str) -> None:
        """Print a section title framed by two 70-character rules."""
        print("\n" + "=" * 70)
        print(title)
        print("=" * 70)

    def _generate_words(self, announcement: str, **generator_kwargs: Any) -> Tuple[Any, float]:
        """Run the word generator once, timing and announcing the call.

        Args:
            announcement: Line printed before generation starts.
            **generator_kwargs: Passed unchanged to ``generate_thematic_words``.

        Returns:
            ``(thematic_results, elapsed_seconds)``.
        """
        print(announcement)
        word_start_time = time.time()
        thematic_results = self.word_generator.generate_thematic_words(**generator_kwargs)
        word_gen_time = time.time() - word_start_time
        print(f"✅ Generated {len(thematic_results)} words in {word_gen_time:.2f}s")
        return thematic_results, word_gen_time

    def _generate_clue_pairs(
        self,
        thematic_results: List[Tuple[str, float, Any]],
        topic: str,
        clue_style: str,
        difficulty: str,
        fallback_prefix: str,
        log_label: str = "clue",
        include_length: bool = True,
    ) -> List[Dict[str, Any]]:
        """Generate one clue entry per ``(word, similarity, tier)`` result.

        A failure for one word never aborts the batch: the error is logged and
        the entry gets the clue ``fallback_prefix + word`` plus an ``"error"``
        key holding the exception text.

        Args:
            thematic_results: Sequence of ``(word, similarity, tier)`` tuples.
            topic: Topic/context string handed to the clue generator.
            clue_style: ``clue_style`` argument for the clue generator.
            difficulty: ``difficulty`` argument for the clue generator.
            fallback_prefix: Text placed before the word in the fallback clue.
            log_label: Noun used in the failure log message.
            include_length: Whether each entry carries a ``"length"`` key.

        Returns:
            One dictionary per input word, in input order.
        """
        pairs = []
        for word, similarity, tier in thematic_results:
            error = None
            try:
                clue = self.clue_generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=clue_style,
                    difficulty=difficulty
                )
            except Exception as e:
                # Deliberately broad: any generator failure degrades to a
                # fallback clue so the remaining words are still processed.
                logger.error("Failed to generate %s for '%s': %s", log_label, word, e)
                clue = f"{fallback_prefix}{word}"
                error = str(e)

            entry = {
                "word": word.upper(),
                "clue": clue,
                "similarity": similarity,
                "tier": tier,
            }
            if include_length:
                entry["length"] = len(word)
            if error is not None:
                entry["error"] = error
            pairs.append(entry)
        return pairs

    def _print_report(
        self,
        header: str,
        rule_width: int,
        word_clue_pairs: List[Dict[str, Any]],
        word_gen_time: float,
        clue_gen_time: float,
        total_time: float,
    ) -> None:
        """Print the word/clue listing followed by the timing summary.

        Args:
            header: Heading line printed above the listing.
            rule_width: Length of the ``=`` rule under the heading.
            word_clue_pairs: Entries produced by ``_generate_clue_pairs``
                (must include ``"length"``).
            word_gen_time: Seconds spent generating words.
            clue_gen_time: Seconds spent generating clues.
            total_time: Seconds spent on the whole scenario.
        """
        print(f"✅ Generated {len(word_clue_pairs)} clues in {clue_gen_time:.2f}s")
        print(header)
        print("=" * rule_width)
        for i, item in enumerate(word_clue_pairs, 1):
            # Fall back to the raw tier value when no description is registered.
            tier_desc = self.word_generator.tier_descriptions.get(item['tier'], item['tier'])
            error_marker = " ⚠️" if 'error' in item else ""
            print(f"{i:2d}. {item['word']:<12} ({item['length']} letters) - {item['clue']}{error_marker}")
            print(f" Similarity: {item['similarity']:.3f} | {tier_desc}")

        # Performance summary
        per_clue = self._avg_clue_time(clue_gen_time, word_clue_pairs)
        print("\n⏱️ Performance:")
        print(f" Word generation: {word_gen_time:.2f}s")
        print(f" Clue generation: {clue_gen_time:.2f}s ({per_clue:.2f}s per clue)")
        print(f" Total time: {total_time:.2f}s")

    def _build_result(
        self,
        label_key: str,
        label_value: Any,
        num_words: int,
        word_clue_pairs: List[Dict[str, Any]],
        word_gen_time: float,
        clue_gen_time: float,
        total_time: float,
    ) -> Dict[str, Any]:
        """Assemble the result dictionary returned by a word+clue scenario.

        ``label_key``/``label_value`` record the scenario input (for example
        ``"topic"`` and the topic string).
        """
        return {
            label_key: label_value,
            "num_words_requested": num_words,
            "num_words_generated": len(word_clue_pairs),
            "word_clue_pairs": word_clue_pairs,
            "performance": {
                "word_gen_time": word_gen_time,
                "clue_gen_time": clue_gen_time,
                "total_time": total_time,
                "avg_clue_time": self._avg_clue_time(clue_gen_time, word_clue_pairs)
            }
        }

    # ------------------------------------------------------------------
    # Test scenarios
    # ------------------------------------------------------------------

    def run_single_topic_test(self, topic: str, num_words: int = 10) -> Dict[str, Any]:
        """Test single topic word+clue generation.

        Args:
            topic: Single topic string
            num_words: Number of words to generate

        Returns:
            Test results dictionary, or ``{"error": ...}`` when the word
            generator returned nothing.
        """
        print(f"\n🎯 Single Topic Test: '{topic}'")
        print("-" * 50)
        start_time = time.time()

        # Step 1: Generate thematic words
        thematic_results, word_gen_time = self._generate_words(
            f"📝 Generating {num_words} thematic words...",
            inputs=topic,
            num_words=num_words,
            min_similarity=0.3
        )
        if not thematic_results:
            return {"error": "No thematic words generated"}

        # Step 2: Generate clues for each word
        print(f"🎭 Generating clues for {len(thematic_results)} words...")
        clue_start_time = time.time()
        word_clue_pairs = self._generate_clue_pairs(
            thematic_results,
            topic=topic,
            clue_style="category",
            difficulty="medium",
            fallback_prefix=f"Related to {topic}: ",
        )
        clue_gen_time = time.time() - clue_start_time
        total_time = time.time() - start_time

        # Display results
        self._print_report(
            f"\n📋 Results for topic '{topic}':", 60,
            word_clue_pairs, word_gen_time, clue_gen_time, total_time,
        )
        return self._build_result(
            "topic", topic, num_words,
            word_clue_pairs, word_gen_time, clue_gen_time, total_time,
        )

    def run_multi_topic_test(self, topics: List[str], num_words: int = 12) -> Dict[str, Any]:
        """Test multi-topic word+clue generation.

        Args:
            topics: List of topic strings
            num_words: Number of words to generate

        Returns:
            Test results dictionary, or ``{"error": ...}`` when the word
            generator returned nothing.
        """
        print(f"\n🎯 Multi-Topic Test: {topics}")
        print("-" * 50)
        start_time = time.time()

        # Step 1: Generate thematic words (multi-theme enabled)
        thematic_results, word_gen_time = self._generate_words(
            f"📝 Generating {num_words} multi-thematic words...",
            inputs=topics,
            num_words=num_words,
            min_similarity=0.25,  # Lower threshold for multi-topic
            multi_theme=True  # Enable multi-theme processing
        )
        if not thematic_results:
            return {"error": "No thematic words generated"}

        # Step 2: Generate contextual clues
        print("🎭 Generating contextual clues...")
        clue_start_time = time.time()
        # Create topic context string for clue generation
        topic_context = " and ".join(topics)
        word_clue_pairs = self._generate_clue_pairs(
            thematic_results,
            topic=topic_context,
            clue_style="description",  # Use descriptive style for multi-topic
            difficulty="medium",
            fallback_prefix=f"Related to {topic_context}: ",
        )
        clue_gen_time = time.time() - clue_start_time
        total_time = time.time() - start_time

        # Display results
        self._print_report(
            f"\n📋 Results for topics {topics}:", 70,
            word_clue_pairs, word_gen_time, clue_gen_time, total_time,
        )
        return self._build_result(
            "topics", topics, num_words,
            word_clue_pairs, word_gen_time, clue_gen_time, total_time,
        )

    def run_custom_sentence_test(self, sentence: str, num_words: int = 10) -> Dict[str, Any]:
        """Test custom sentence word+clue generation.

        Args:
            sentence: Custom sentence input
            num_words: Number of words to generate

        Returns:
            Test results dictionary, or ``{"error": ...}`` when the word
            generator returned nothing.
        """
        print(f"\n🎯 Custom Sentence Test: '{sentence}'")
        print("-" * 60)
        start_time = time.time()

        # Step 1: Generate thematic words from sentence
        thematic_results, word_gen_time = self._generate_words(
            f"📝 Generating {num_words} words from sentence...",
            inputs=sentence,
            num_words=num_words,
            min_similarity=0.2  # Lower threshold for sentences
        )
        if not thematic_results:
            return {"error": "No thematic words generated"}

        # Step 2: Generate personalized clues
        print("🎭 Generating personalized clues...")
        clue_start_time = time.time()
        # Use the original sentence as context for more personalized clues
        word_clue_pairs = self._generate_clue_pairs(
            thematic_results,
            topic=f"theme: {sentence}",
            clue_style="description",
            difficulty="medium",
            fallback_prefix=f"From '{sentence[:30]}...': ",
        )
        clue_gen_time = time.time() - clue_start_time
        total_time = time.time() - start_time

        # Display results
        self._print_report(
            f"\n📋 Results for sentence: '{sentence}'", 70,
            word_clue_pairs, word_gen_time, clue_gen_time, total_time,
        )
        return self._build_result(
            "sentence", sentence, num_words,
            word_clue_pairs, word_gen_time, clue_gen_time, total_time,
        )

    def run_difficulty_comparison_test(self, topic: str, num_words: int = 6) -> Dict[str, Any]:
        """Test different difficulty levels for the same topic.

        The word list is generated once and reused, so the difficulty levels
        are compared on identical words.

        Args:
            topic: Topic to test
            num_words: Number of words to generate

        Returns:
            Comparison results, or ``{"error": ...}`` when the word generator
            returned nothing.
        """
        print(f"\n🎯 Difficulty Comparison Test: '{topic}'")
        print("-" * 50)
        difficulties = ["easy", "medium", "hard"]
        results = {}

        # Generate words once (reuse for all difficulties)
        thematic_results = self.word_generator.generate_thematic_words(
            inputs=topic,
            num_words=num_words,
            min_similarity=0.3
        )[:num_words]  # Take only requested number
        if not thematic_results:
            return {"error": "No thematic words generated"}

        print(f"📝 Testing {len(thematic_results)} words at different difficulty levels...")
        for difficulty in difficulties:
            print(f"\n--- {difficulty.upper()} Difficulty ---")
            start_time = time.time()
            clue_pairs = self._generate_clue_pairs(
                thematic_results,
                topic=topic,
                clue_style="category",
                difficulty=difficulty,
                fallback_prefix=f"{difficulty.title()} clue for ",
                log_label=f"{difficulty} clue",
                include_length=False,
            )
            generation_time = time.time() - start_time
            results[difficulty] = {
                "clue_pairs": clue_pairs,
                "generation_time": generation_time
            }

            # Display this difficulty's results
            for i, item in enumerate(clue_pairs, 1):
                error_marker = " ⚠️" if 'error' in item else ""
                print(f" {i}. {item['word']:<10} - {item['clue']}{error_marker}")

        return {
            "topic": topic,
            "difficulties_tested": difficulties,
            "results": results,
            "base_words": [{"word": w, "similarity": s, "tier": t} for w, s, t in thematic_results]
        }

    def run_performance_analysis(self) -> Dict[str, Any]:
        """Analyze overall performance characteristics.

        Aggregates the ``"performance"`` section of every entry in
        ``self.test_results``; entries without one are ignored.

        Returns:
            ``{"word_gen_stats", "clue_gen_stats", "total_stats"}``, each with
            ``avg``/``min``/``max``. An empty dict is returned when there are
            no results or none of them carries timing data, so callers never
            mistake "nothing measured" for "everything took 0 seconds".
        """
        print("\n📊 Performance Analysis")
        print("-" * 40)

        # Collect performance stats from previous tests
        if not self.test_results:
            print("⚠️ No test results available for performance analysis")
            return {}

        timings = [
            result['performance']
            for result in self.test_results.values()
            if 'performance' in result
        ]
        if not timings:
            print("⚠️ No timing data recorded by any test")
            return {}

        word_stats = self._summarize([perf.get('word_gen_time', 0) for perf in timings])
        clue_stats = self._summarize([perf.get('clue_gen_time', 0) for perf in timings])
        total_stats = self._summarize([perf.get('total_time', 0) for perf in timings])

        self._print_stats("📈 Word Generation Performance:", word_stats)
        self._print_stats("\n🎭 Clue Generation Performance:", clue_stats)
        self._print_stats("\n⏱️ Total Pipeline Performance:", total_stats)

        return {
            "word_gen_stats": word_stats,
            "clue_gen_stats": clue_stats,
            "total_stats": total_stats
        }

    def run_full_test_suite(self):
        """Run the complete test suite.

        Initializes both generators, runs every scenario, stores each result
        in ``self.test_results`` and prints a final summary.
        """
        print("🧪 CROSSWORD CLUE GENERATION TEST SUITE")
        print("=" * 70)
        if not GENERATORS_AVAILABLE:
            print("❌ Cannot run tests - generators not available")
            return

        # Initialize
        self.initialize()

        # Test 1: Single topics
        self._print_banner("TEST 1: SINGLE TOPIC TESTS")
        single_topics = ["animals", "technology", "music", "food"]
        for topic in single_topics:
            result = self.run_single_topic_test(topic, num_words=8)
            self.test_results[f"single_{topic}"] = result

        # Test 2: Multi-topic
        self._print_banner("TEST 2: MULTI-TOPIC TEST")
        multi_result = self.run_multi_topic_test(["science", "technology"], num_words=10)
        self.test_results["multi_science_tech"] = multi_result

        # Test 3: Custom sentence
        self._print_banner("TEST 3: CUSTOM SENTENCE TEST")
        sentence_result = self.run_custom_sentence_test("I love cats and playing guitar", num_words=8)
        self.test_results["sentence_cats_guitar"] = sentence_result

        # Test 4: Difficulty comparison
        self._print_banner("TEST 4: DIFFICULTY COMPARISON")
        difficulty_result = self.run_difficulty_comparison_test("sports", num_words=5)
        self.test_results["difficulty_sports"] = difficulty_result

        # Test 5: Performance analysis
        self._print_banner("TEST 5: PERFORMANCE ANALYSIS")
        perf_result = self.run_performance_analysis()
        self.test_results["performance"] = perf_result

        # Final summary
        self._print_banner("📋 FINAL SUMMARY")
        print("✅ Test suite completed!")
        print(f"📊 Tests run: {len(self.test_results)}")

        # Model info
        word_info = {
            "vocab_size": self.word_generator.get_vocabulary_size(),
            "tier_distribution": len(self.word_generator.get_tier_distribution())
        }
        clue_info = self.clue_generator.get_model_info()
        print("\n🔧 System Information:")
        print(f" Word vocabulary: {word_info['vocab_size']:,} words")
        print(f" Clue model: {clue_info['model_name']}")
        print(f" Model size: {clue_info.get('model_size_mb', 0):.1f} MB")
        if perf_result:
            avg_total = perf_result['total_stats']['avg']
            print(f" Average pipeline time: {avg_total:.2f}s")

        print("\n💡 Recommendations for HF Spaces:")
        # An empty perf_result (nothing measured) falls through to the warning.
        if perf_result and perf_result['total_stats']['avg'] < 15:
            print(" ✅ Performance suitable for interactive use")
        else:
            print(" ⚠️ Consider optimizations for better user experience")
        print("\n🎉 Test suite complete!")
def main() -> int:
    """Run the test suite.

    Returns:
        Exit status for the process: 0 when the suite ran to completion,
        1 when the generators are unavailable or the suite raised, and
        130 when the run was interrupted from the keyboard.
    """
    if not GENERATORS_AVAILABLE:
        print("❌ Cannot run tests - required generators not available")
        print("Make sure thematic_word_generator.py and llm_clue_generator.py are working")
        return 1

    # Create and run test suite
    test_suite = CrosswordClueTestSuite(vocab_size_limit=50000)  # Use existing cached embeddings
    try:
        test_suite.run_full_test_suite()
    except KeyboardInterrupt:
        print("\n\n⏹️ Test suite interrupted by user")
        return 130
    except Exception as e:
        # Top-level boundary: report the failure, keep the traceback in the
        # log, and signal it through the exit status.
        print(f"\n❌ Test suite failed: {e}")
        logger.error("Test suite error: %s", e, exc_info=True)
        return 1
    return 0


if __name__ == "__main__":
    # Propagate the result so shells and CI can detect a failed run.
    sys.exit(main())