|
|
|
""" |
|
Test Script: Thematic Word Generation + LLM Clue Generation |
|
|
|
Integrates the existing thematic_word_generator.py with the new llm_clue_generator.py |
|
to create a complete word-to-clue pipeline for crossword puzzles. |
|
|
|
Tests various scenarios: |
|
- Single topics |
|
- Multiple topics |
|
- Custom sentences |
|
- Different difficulties |
|
- Performance analysis |
|
""" |
|
|
|
import os |
|
import sys |
|
import time |
|
import logging |
|
from typing import List, Dict, Tuple, Any |
|
from pathlib import Path |
|
|
|
|
|
# Put this script's own directory at the front of the module search path so
# the two generator modules below resolve when they live next to this file.
sys.path.insert(0, str(Path(__file__).parent))

# GENERATORS_AVAILABLE records whether both generator modules imported
# cleanly; the rest of the file checks it before touching either class.
try:
    from thematic_word_generator import UnifiedThematicWordGenerator
    from llm_clue_generator import LLMClueGenerator
except ImportError as import_error:
    GENERATORS_AVAILABLE = False
    print(f"❌ Import error: {import_error}")
    print("Make sure thematic_word_generator.py and llm_clue_generator.py are in the same directory")
else:
    GENERATORS_AVAILABLE = True
|
|
|
|
|
# Root logging setup: INFO level, each record prefixed with a timestamp, the
# logger name and the source line number.
logging.basicConfig(
    format="%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)

# Module-level logger used for error reporting throughout this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
class CrosswordClueTestSuite:
    """Test suite for integrated thematic word generation + LLM clue generation.

    Each ``run_*_test`` method asks the word generator for themed words,
    requests one clue per word from the clue generator, prints a report and
    returns the collected data as a plain dictionary.

    All durations are measured with ``time.perf_counter()`` rather than
    ``time.time()``: the wall clock can be adjusted while a test is running,
    which would distort (or even negate) the reported timings.
    """

    def __init__(self, vocab_size_limit: int = 50000):
        """Create an uninitialized suite; call ``initialize()`` before running tests.

        Args:
            vocab_size_limit: Vocabulary size for thematic generator
                (smaller for faster testing).

        Raises:
            ImportError: If the generator modules could not be imported.
        """
        if not GENERATORS_AVAILABLE:
            raise ImportError("Required generators not available")

        self.vocab_size_limit = vocab_size_limit
        self.word_generator = None
        self.clue_generator = None

        # Filled in by run_full_test_suite() / initialize().
        self.test_results = {}
        self.performance_stats = {}

    def initialize(self):
        """Construct and initialize both generators, recording start-up times.

        The timings are stored in ``self.performance_stats`` under
        ``word_gen_init_time``, ``clue_gen_init_time`` and ``total_init_time``.
        """
        print("🚀 Initializing Crossword Clue Test Suite")
        print("=" * 60)

        print("\n🔄 Initializing thematic word generator...")
        started = time.perf_counter()
        self.word_generator = UnifiedThematicWordGenerator(
            vocab_size_limit=self.vocab_size_limit
        )
        self.word_generator.initialize()
        word_gen_time = time.perf_counter() - started
        print(f"✅ Word generator ready in {word_gen_time:.2f}s")

        print("\n🔄 Initializing LLM clue generator...")
        started = time.perf_counter()
        self.clue_generator = LLMClueGenerator()
        self.clue_generator.initialize()
        clue_gen_time = time.perf_counter() - started
        print(f"✅ Clue generator ready in {clue_gen_time:.2f}s")

        self.performance_stats['word_gen_init_time'] = word_gen_time
        self.performance_stats['clue_gen_init_time'] = clue_gen_time
        self.performance_stats['total_init_time'] = word_gen_time + clue_gen_time

        print(f"\n✅ Test suite initialized in {word_gen_time + clue_gen_time:.2f}s")

    # ------------------------------------------------------------------
    # Private helpers shared by the individual tests
    # ------------------------------------------------------------------

    def _timed_word_generation(self, **generator_kwargs) -> Tuple[List[Tuple], float]:
        """Call the word generator, print a one-line summary, return (words, seconds)."""
        started = time.perf_counter()
        thematic_results = self.word_generator.generate_thematic_words(**generator_kwargs)
        elapsed = time.perf_counter() - started
        print(f"✅ Generated {len(thematic_results)} words in {elapsed:.2f}s")
        return thematic_results, elapsed

    def _build_clue_pairs(self, thematic_results, *, topic: str, clue_style: str,
                          difficulty: str, fallback, include_length: bool = True,
                          log_label: str = "clue") -> List[Dict[str, Any]]:
        """Generate one clue entry per ``(word, similarity, tier)`` tuple.

        A failing clue request does not abort the run: the failure is logged,
        ``fallback(word)`` supplies a placeholder clue and the entry gains an
        ``"error"`` key holding the exception text.

        Args:
            thematic_results: Iterable of ``(word, similarity, tier)`` tuples.
            topic: Topic string forwarded to the clue generator.
            clue_style: Clue style forwarded to the clue generator.
            difficulty: Difficulty forwarded to the clue generator.
            fallback: Callable taking the word and returning a placeholder clue.
            include_length: Whether each entry carries a ``"length"`` key.
            log_label: Noun phrase used in the error log message.

        Returns:
            List of entry dictionaries, in the order of ``thematic_results``.
        """
        pairs = []
        for word, similarity, tier in thematic_results:
            failure = None
            try:
                clue = self.clue_generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=clue_style,
                    difficulty=difficulty,
                )
            except Exception as exc:  # deliberate best-effort: keep testing other words
                logger.error("Failed to generate %s for '%s': %s", log_label, word, exc)
                clue = fallback(word)
                failure = str(exc)

            entry = {
                "word": word.upper(),
                "clue": clue,
                "similarity": similarity,
                "tier": tier,
            }
            if include_length:
                entry["length"] = len(word)
            if failure is not None:
                entry["error"] = failure
            pairs.append(entry)
        return pairs

    def _report_results(self, heading: str, rule_width: int,
                        word_clue_pairs: List[Dict[str, Any]], word_gen_time: float,
                        clue_gen_time: float, total_time: float) -> Dict[str, float]:
        """Print the word/clue listing and timings; return the performance dict."""
        # Guarded so an empty listing can never cause a division by zero.
        avg_clue_time = clue_gen_time / len(word_clue_pairs) if word_clue_pairs else 0

        print(f"✅ Generated {len(word_clue_pairs)} clues in {clue_gen_time:.2f}s")
        print(heading)
        print("=" * rule_width)

        tier_descriptions = self.word_generator.tier_descriptions
        for i, item in enumerate(word_clue_pairs, 1):
            # Fall back to the raw tier value when no description is known.
            tier_desc = tier_descriptions.get(item['tier'], item['tier'])
            error_marker = " ⚠️" if 'error' in item else ""
            print(f"{i:2d}. {item['word']:<12} ({item['length']} letters) - {item['clue']}{error_marker}")
            print(f" Similarity: {item['similarity']:.3f} | {tier_desc}")

        print("\n⏱️ Performance:")
        print(f" Word generation: {word_gen_time:.2f}s")
        print(f" Clue generation: {clue_gen_time:.2f}s ({avg_clue_time:.2f}s per clue)")
        print(f" Total time: {total_time:.2f}s")

        return {
            "word_gen_time": word_gen_time,
            "clue_gen_time": clue_gen_time,
            "total_time": total_time,
            "avg_clue_time": avg_clue_time,
        }

    @staticmethod
    def _summarize_times(samples: List[float]) -> Dict[str, float]:
        """Return the avg/min/max of ``samples`` (all zero for an empty list)."""
        if not samples:
            return {"avg": 0, "min": 0, "max": 0}
        return {
            "avg": sum(samples) / len(samples),
            "min": min(samples),
            "max": max(samples),
        }

    @staticmethod
    def _print_section(title: str) -> None:
        """Print a section title framed by two 70-character rules."""
        print("\n" + "=" * 70)
        print(title)
        print("=" * 70)

    # ------------------------------------------------------------------
    # Individual tests
    # ------------------------------------------------------------------

    def run_single_topic_test(self, topic: str, num_words: int = 10) -> Dict[str, Any]:
        """Test single topic word+clue generation.

        Args:
            topic: Single topic string.
            num_words: Number of words to generate.

        Returns:
            Test results dictionary, or ``{"error": ...}`` when the word
            generator returned nothing.
        """
        print(f"\n🎯 Single Topic Test: '{topic}'")
        print("-" * 50)
        start_time = time.perf_counter()

        print(f"📝 Generating {num_words} thematic words...")
        thematic_results, word_gen_time = self._timed_word_generation(
            inputs=topic,
            num_words=num_words,
            min_similarity=0.3,
        )
        if not thematic_results:
            return {"error": "No thematic words generated"}

        print(f"🎭 Generating clues for {len(thematic_results)} words...")
        clue_start_time = time.perf_counter()
        word_clue_pairs = self._build_clue_pairs(
            thematic_results,
            topic=topic,
            clue_style="category",
            difficulty="medium",
            fallback=lambda word: f"Related to {topic}: {word}",
        )
        clue_gen_time = time.perf_counter() - clue_start_time
        total_time = time.perf_counter() - start_time

        performance = self._report_results(
            f"\n📋 Results for topic '{topic}':", 60,
            word_clue_pairs, word_gen_time, clue_gen_time, total_time,
        )
        return {
            "topic": topic,
            "num_words_requested": num_words,
            "num_words_generated": len(word_clue_pairs),
            "word_clue_pairs": word_clue_pairs,
            "performance": performance,
        }

    def run_multi_topic_test(self, topics: List[str], num_words: int = 12) -> Dict[str, Any]:
        """Test multi-topic word+clue generation.

        Args:
            topics: List of topic strings.
            num_words: Number of words to generate.

        Returns:
            Test results dictionary, or ``{"error": ...}`` when the word
            generator returned nothing.
        """
        print(f"\n🎯 Multi-Topic Test: {topics}")
        print("-" * 50)
        start_time = time.perf_counter()

        print(f"📝 Generating {num_words} multi-thematic words...")
        thematic_results, word_gen_time = self._timed_word_generation(
            inputs=topics,
            num_words=num_words,
            min_similarity=0.25,
            multi_theme=True,
        )
        if not thematic_results:
            return {"error": "No thematic words generated"}

        print("🎭 Generating contextual clues...")
        clue_start_time = time.perf_counter()
        # The clue generator takes a single topic string, so join the topics.
        topic_context = " and ".join(topics)
        word_clue_pairs = self._build_clue_pairs(
            thematic_results,
            topic=topic_context,
            clue_style="description",
            difficulty="medium",
            fallback=lambda word: f"Related to {topic_context}: {word}",
        )
        clue_gen_time = time.perf_counter() - clue_start_time
        total_time = time.perf_counter() - start_time

        performance = self._report_results(
            f"\n📋 Results for topics {topics}:", 70,
            word_clue_pairs, word_gen_time, clue_gen_time, total_time,
        )
        return {
            "topics": topics,
            "num_words_requested": num_words,
            "num_words_generated": len(word_clue_pairs),
            "word_clue_pairs": word_clue_pairs,
            "performance": performance,
        }

    def run_custom_sentence_test(self, sentence: str, num_words: int = 10) -> Dict[str, Any]:
        """Test custom sentence word+clue generation.

        Args:
            sentence: Custom sentence input.
            num_words: Number of words to generate.

        Returns:
            Test results dictionary, or ``{"error": ...}`` when the word
            generator returned nothing.
        """
        print(f"\n🎯 Custom Sentence Test: '{sentence}'")
        print("-" * 60)
        start_time = time.perf_counter()

        print(f"📝 Generating {num_words} words from sentence...")
        thematic_results, word_gen_time = self._timed_word_generation(
            inputs=sentence,
            num_words=num_words,
            min_similarity=0.2,
        )
        if not thematic_results:
            return {"error": "No thematic words generated"}

        print("🎭 Generating personalized clues...")
        clue_start_time = time.perf_counter()
        word_clue_pairs = self._build_clue_pairs(
            thematic_results,
            topic=f"theme: {sentence}",
            clue_style="description",
            difficulty="medium",
            fallback=lambda word: f"From '{sentence[:30]}...': {word}",
        )
        clue_gen_time = time.perf_counter() - clue_start_time
        total_time = time.perf_counter() - start_time

        performance = self._report_results(
            f"\n📋 Results for sentence: '{sentence}'", 70,
            word_clue_pairs, word_gen_time, clue_gen_time, total_time,
        )
        return {
            "sentence": sentence,
            "num_words_requested": num_words,
            "num_words_generated": len(word_clue_pairs),
            "word_clue_pairs": word_clue_pairs,
            "performance": performance,
        }

    def run_difficulty_comparison_test(self, topic: str, num_words: int = 6) -> Dict[str, Any]:
        """Test different difficulty levels for the same topic.

        The word list is generated once and reused for every difficulty so the
        clues are directly comparable.

        Args:
            topic: Topic to test.
            num_words: Number of words to generate.

        Returns:
            Comparison results, or ``{"error": ...}`` when the word generator
            returned nothing.
        """
        print(f"\n🎯 Difficulty Comparison Test: '{topic}'")
        print("-" * 50)

        difficulties = ["easy", "medium", "hard"]
        results = {}

        thematic_results = self.word_generator.generate_thematic_words(
            inputs=topic,
            num_words=num_words,
            min_similarity=0.3,
        )[:num_words]
        if not thematic_results:
            return {"error": "No thematic words generated"}

        print(f"📝 Testing {len(thematic_results)} words at different difficulty levels...")

        for difficulty in difficulties:
            print(f"\n--- {difficulty.upper()} Difficulty ---")

            started = time.perf_counter()
            clue_pairs = self._build_clue_pairs(
                thematic_results,
                topic=topic,
                clue_style="category",
                difficulty=difficulty,
                # Bind the level now so the placeholder names this difficulty.
                fallback=lambda word, level=difficulty: f"{level.title()} clue for {word}",
                include_length=False,
                log_label=f"{difficulty} clue",
            )
            results[difficulty] = {
                "clue_pairs": clue_pairs,
                "generation_time": time.perf_counter() - started,
            }

            for i, item in enumerate(clue_pairs, 1):
                error_marker = " ⚠️" if 'error' in item else ""
                print(f" {i}. {item['word']:<10} - {item['clue']}{error_marker}")

        return {
            "topic": topic,
            "difficulties_tested": difficulties,
            "results": results,
            "base_words": [
                {"word": word, "similarity": similarity, "tier": tier}
                for word, similarity, tier in thematic_results
            ],
        }

    def run_performance_analysis(self) -> Dict[str, Any]:
        """Analyze overall performance characteristics.

        Aggregates the ``"performance"`` entries found in ``self.test_results``.

        Returns:
            Dictionary with ``word_gen_stats``, ``clue_gen_stats`` and
            ``total_stats`` (each holding ``avg``/``min``/``max``), or an empty
            dictionary when no stored result carries timing data. Returning
            ``{}`` in that case keeps callers from mistaking "nothing was
            measured" for "everything took 0 seconds".
        """
        print("\n📊 Performance Analysis")
        print("-" * 40)

        timed_runs = [
            result['performance']
            for result in self.test_results.values()
            if 'performance' in result
        ]
        if not timed_runs:
            print("⚠️ No test results available for performance analysis")
            return {}

        stats = {
            "word_gen_stats": self._summarize_times(
                [perf.get('word_gen_time', 0) for perf in timed_runs]),
            "clue_gen_stats": self._summarize_times(
                [perf.get('clue_gen_time', 0) for perf in timed_runs]),
            "total_stats": self._summarize_times(
                [perf.get('total_time', 0) for perf in timed_runs]),
        }

        sections = (
            ("📈 Word Generation Performance:", "word_gen_stats"),
            ("\n🎭 Clue Generation Performance:", "clue_gen_stats"),
            ("\n⏱️ Total Pipeline Performance:", "total_stats"),
        )
        for title, key in sections:
            print(title)
            print(f" Average: {stats[key]['avg']:.2f}s")
            print(f" Min: {stats[key]['min']:.2f}s")
            print(f" Max: {stats[key]['max']:.2f}s")

        return stats

    def run_full_test_suite(self):
        """Run the complete test suite.

        Initializes both generators, runs every test with fixed sample inputs,
        stores each result in ``self.test_results`` and prints a final summary.
        """
        print("🧪 CROSSWORD CLUE GENERATION TEST SUITE")
        print("=" * 70)

        if not GENERATORS_AVAILABLE:
            print("❌ Cannot run tests - generators not available")
            return

        self.initialize()

        self._print_section("TEST 1: SINGLE TOPIC TESTS")
        for topic in ["animals", "technology", "music", "food"]:
            self.test_results[f"single_{topic}"] = self.run_single_topic_test(topic, num_words=8)

        self._print_section("TEST 2: MULTI-TOPIC TEST")
        self.test_results["multi_science_tech"] = self.run_multi_topic_test(
            ["science", "technology"], num_words=10)

        self._print_section("TEST 3: CUSTOM SENTENCE TEST")
        self.test_results["sentence_cats_guitar"] = self.run_custom_sentence_test(
            "I love cats and playing guitar", num_words=8)

        self._print_section("TEST 4: DIFFICULTY COMPARISON")
        self.test_results["difficulty_sports"] = self.run_difficulty_comparison_test(
            "sports", num_words=5)

        self._print_section("TEST 5: PERFORMANCE ANALYSIS")
        perf_result = self.run_performance_analysis()
        self.test_results["performance"] = perf_result

        self._print_section("📋 FINAL SUMMARY")
        print("✅ Test suite completed!")
        print(f"📊 Tests run: {len(self.test_results)}")

        word_info = {
            "vocab_size": self.word_generator.get_vocabulary_size(),
            "tier_distribution": len(self.word_generator.get_tier_distribution()),
        }
        clue_info = self.clue_generator.get_model_info()

        print("\n🔧 System Information:")
        print(f" Word vocabulary: {word_info['vocab_size']:,} words")
        print(f" Clue model: {clue_info['model_name']}")
        print(f" Model size: {clue_info.get('model_size_mb', 0):.1f} MB")

        if perf_result:
            avg_total = perf_result['total_stats']['avg']
            print(f" Average pipeline time: {avg_total:.2f}s")

        print("\n💡 Recommendations for HF Spaces:")
        # perf_result is empty when nothing was timed, so the positive verdict
        # is only printed when it is backed by real measurements.
        if perf_result and perf_result['total_stats']['avg'] < 15:
            print(" ✅ Performance suitable for interactive use")
        else:
            print(" ⚠️ Consider optimizations for better user experience")

        print("\n🎉 Test suite complete!")
|
|
|
|
|
def main():
    """Entry point: build the test suite and run it end to end.

    Exits early with a hint when the generator modules failed to import.
    A Ctrl-C during the run is reported quietly; any other exception is
    printed and logged with its traceback instead of propagating.
    """
    if not GENERATORS_AVAILABLE:
        print("❌ Cannot run tests - required generators not available")
        print("Make sure thematic_word_generator.py and llm_clue_generator.py are working")
        return

    suite = CrosswordClueTestSuite(vocab_size_limit=50000)

    try:
        suite.run_full_test_suite()
    except KeyboardInterrupt:
        print("\n\n⏹️ Test suite interrupted by user")
    except Exception as exc:
        print(f"\n❌ Test suite failed: {exc}")
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception(f"Test suite error: {exc}")


if __name__ == "__main__":
    main()