#!/usr/bin/env python3
"""
Test Script: Thematic Word Generation + LLM Clue Generation

Integrates the existing thematic_word_generator.py with the new llm_clue_generator.py
to create a complete word-to-clue pipeline for crossword puzzles.

Tests various scenarios:
- Single topics
- Multiple topics
- Custom sentences
- Different difficulties
- Performance analysis
"""

import os
import sys
import time
import logging
from typing import List, Dict, Tuple, Any, Callable
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from thematic_word_generator import UnifiedThematicWordGenerator
    from llm_clue_generator import LLMClueGenerator
    GENERATORS_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Make sure thematic_word_generator.py and llm_clue_generator.py are in the same directory")
    GENERATORS_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


class CrosswordClueTestSuite:
    """
    Test suite for integrated thematic word generation + LLM clue generation.

    Each ``run_*`` method prints a human-readable report to stdout and returns
    a plain dictionary so that ``run_full_test_suite`` can collect the results
    in ``self.test_results``.
    """

    def __init__(self, vocab_size_limit: int = 50000):
        """Initialize the test suite.

        Args:
            vocab_size_limit: Vocabulary size for thematic generator
                (smaller for faster testing)

        Raises:
            ImportError: If the generator modules could not be imported.
        """
        if not GENERATORS_AVAILABLE:
            raise ImportError("Required generators not available")

        self.vocab_size_limit = vocab_size_limit
        # Both generators are created lazily by initialize().
        self.word_generator = None
        self.clue_generator = None

        # Test results storage
        self.test_results = {}
        self.performance_stats = {}

    def initialize(self):
        """Initialize both generators and record how long each one took."""
        print("🚀 Initializing Crossword Clue Test Suite")
        print("=" * 60)

        # Initialize thematic word generator
        print("\n🔄 Initializing thematic word generator...")
        start_time = time.time()
        self.word_generator = UnifiedThematicWordGenerator(
            vocab_size_limit=self.vocab_size_limit
        )
        self.word_generator.initialize()
        word_gen_time = time.time() - start_time
        print(f"✅ Word generator ready in {word_gen_time:.2f}s")

        # Initialize LLM clue generator
        print("\n🔄 Initializing LLM clue generator...")
        start_time = time.time()
        self.clue_generator = LLMClueGenerator()
        self.clue_generator.initialize()
        clue_gen_time = time.time() - start_time
        print(f"✅ Clue generator ready in {clue_gen_time:.2f}s")

        # Store initialization stats
        self.performance_stats['word_gen_init_time'] = word_gen_time
        self.performance_stats['clue_gen_init_time'] = clue_gen_time
        self.performance_stats['total_init_time'] = word_gen_time + clue_gen_time

        print(f"\n✅ Test suite initialized in {word_gen_time + clue_gen_time:.2f}s")

    # ------------------------------------------------------------------
    # Private helpers shared by the individual test scenarios
    # ------------------------------------------------------------------

    def _generate_words(self, **generator_kwargs) -> Tuple[List[Tuple[str, float, Any]], float]:
        """Call the word generator, print a timing line, and return (results, seconds).

        Args:
            **generator_kwargs: Passed through unchanged to
                ``word_generator.generate_thematic_words``.
        """
        started = time.time()
        thematic_results = self.word_generator.generate_thematic_words(**generator_kwargs)
        elapsed = time.time() - started
        print(f"✅ Generated {len(thematic_results)} words in {elapsed:.2f}s")
        return thematic_results, elapsed

    def _generate_clue_pairs(
        self,
        thematic_results: List[Tuple[str, float, Any]],
        *,
        topic: str,
        clue_style: str,
        difficulty: str,
        fallback_clue: Callable[[str], str],
        include_length: bool = True,
        log_label: str = "clue",
    ) -> List[Dict[str, Any]]:
        """Generate one clue entry per (word, similarity, tier) tuple.

        A failing clue generation never aborts the run: the error is logged,
        ``fallback_clue(word)`` is used as the clue, and the entry gets an
        extra ``"error"`` key holding the exception text.

        Args:
            thematic_results: Tuples of (word, similarity, tier).
            topic: Topic/context string handed to the clue generator.
            clue_style: ``clue_style`` argument for the clue generator.
            difficulty: ``difficulty`` argument for the clue generator.
            fallback_clue: Builds the placeholder clue for a word on failure.
            include_length: Whether each entry carries a ``"length"`` key.
            log_label: Noun used in the error log line (e.g. ``"hard clue"``).

        Returns:
            List of entry dictionaries with the word upper-cased.
        """
        pairs = []
        for word, similarity, tier in thematic_results:
            failure = None
            try:
                clue = self.clue_generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=clue_style,
                    difficulty=difficulty
                )
            except Exception as exc:
                # Deliberately broad: one bad word must not abort the whole test run.
                logger.error("Failed to generate %s for '%s': %s", log_label, word, exc)
                clue = fallback_clue(word)
                failure = str(exc)

            entry = {
                "word": word.upper(),
                "clue": clue,
                "similarity": similarity,
                "tier": tier,
            }
            if include_length:
                entry["length"] = len(word)
            if failure is not None:
                entry["error"] = failure
            pairs.append(entry)
        return pairs

    def _report_pipeline_test(
        self,
        *,
        subject_key: str,
        subject: Any,
        heading: str,
        separator_width: int,
        num_words: int,
        word_clue_pairs: List[Dict[str, Any]],
        word_gen_time: float,
        clue_gen_time: float,
        total_time: float,
    ) -> Dict[str, Any]:
        """Print the results/performance report and build the result dictionary.

        Args:
            subject_key: Name of the first key in the returned dictionary
                (``"topic"``, ``"topics"`` or ``"sentence"``).
            subject: Value stored under ``subject_key``.
            heading: Line printed above the result listing.
            separator_width: Number of ``=`` characters under the heading.
            num_words: Number of words that were requested.
            word_clue_pairs: Entries produced by ``_generate_clue_pairs``.
            word_gen_time: Seconds spent generating words.
            clue_gen_time: Seconds spent generating clues.
            total_time: Seconds spent on the whole scenario.

        Returns:
            Test results dictionary including a ``"performance"`` section.
        """
        # Guard once so the printed and the returned average always agree.
        avg_clue_time = clue_gen_time / len(word_clue_pairs) if word_clue_pairs else 0

        # Display results
        print(f"✅ Generated {len(word_clue_pairs)} clues in {clue_gen_time:.2f}s")
        print(heading)
        print("=" * separator_width)

        tier_descriptions = self.word_generator.tier_descriptions
        for i, item in enumerate(word_clue_pairs, 1):
            tier_desc = tier_descriptions.get(item['tier'], item['tier'])
            error_marker = " ⚠️" if 'error' in item else ""
            print(f"{i:2d}. {item['word']:<12} ({item['length']} letters) - {item['clue']}{error_marker}")
            print(f" Similarity: {item['similarity']:.3f} | {tier_desc}")

        # Performance summary
        print("\n⏱️ Performance:")
        print(f" Word generation: {word_gen_time:.2f}s")
        print(f" Clue generation: {clue_gen_time:.2f}s ({avg_clue_time:.2f}s per clue)")
        print(f" Total time: {total_time:.2f}s")

        return {
            subject_key: subject,
            "num_words_requested": num_words,
            "num_words_generated": len(word_clue_pairs),
            "word_clue_pairs": word_clue_pairs,
            "performance": {
                "word_gen_time": word_gen_time,
                "clue_gen_time": clue_gen_time,
                "total_time": total_time,
                "avg_clue_time": avg_clue_time
            }
        }

    @staticmethod
    def _summarize_times(times: List[float]) -> Dict[str, float]:
        """Return the avg/min/max of ``times`` (all zero for an empty list)."""
        if not times:
            return {"avg": 0, "min": 0, "max": 0}
        return {
            "avg": sum(times) / len(times),
            "min": min(times),
            "max": max(times)
        }

    @staticmethod
    def _print_section_header(title: str) -> None:
        """Print a 70-column banner with ``title`` between two rules."""
        print("\n" + "=" * 70)
        print(title)
        print("=" * 70)

    # ------------------------------------------------------------------
    # Test scenarios
    # ------------------------------------------------------------------

    def run_single_topic_test(self, topic: str, num_words: int = 10) -> Dict[str, Any]:
        """Test single topic word+clue generation.

        Args:
            topic: Single topic string
            num_words: Number of words to generate

        Returns:
            Test results dictionary, or ``{"error": ...}`` when the word
            generator returned nothing.
        """
        print(f"\n🎯 Single Topic Test: '{topic}'")
        print("-" * 50)

        start_time = time.time()

        # Step 1: Generate thematic words
        print(f"📝 Generating {num_words} thematic words...")
        thematic_results, word_gen_time = self._generate_words(
            inputs=topic,
            num_words=num_words,
            min_similarity=0.3
        )
        if not thematic_results:
            return {"error": "No thematic words generated"}

        # Step 2: Generate clues for each word
        print(f"🎭 Generating clues for {len(thematic_results)} words...")
        clue_start_time = time.time()
        word_clue_pairs = self._generate_clue_pairs(
            thematic_results,
            topic=topic,
            clue_style="category",
            difficulty="medium",
            fallback_clue=lambda word: f"Related to {topic}: {word}",  # Fallback
        )
        clue_gen_time = time.time() - clue_start_time
        total_time = time.time() - start_time

        return self._report_pipeline_test(
            subject_key="topic",
            subject=topic,
            heading=f"\n📋 Results for topic '{topic}':",
            separator_width=60,
            num_words=num_words,
            word_clue_pairs=word_clue_pairs,
            word_gen_time=word_gen_time,
            clue_gen_time=clue_gen_time,
            total_time=total_time,
        )

    def run_multi_topic_test(self, topics: List[str], num_words: int = 12) -> Dict[str, Any]:
        """Test multi-topic word+clue generation.

        Args:
            topics: List of topic strings
            num_words: Number of words to generate

        Returns:
            Test results dictionary, or ``{"error": ...}`` when the word
            generator returned nothing.
        """
        print(f"\n🎯 Multi-Topic Test: {topics}")
        print("-" * 50)

        start_time = time.time()

        # Step 1: Generate thematic words (multi-theme enabled)
        print(f"📝 Generating {num_words} multi-thematic words...")
        thematic_results, word_gen_time = self._generate_words(
            inputs=topics,
            num_words=num_words,
            min_similarity=0.25,  # Lower threshold for multi-topic
            multi_theme=True  # Enable multi-theme processing
        )
        if not thematic_results:
            return {"error": "No thematic words generated"}

        # Step 2: Generate contextual clues
        print("🎭 Generating contextual clues...")
        clue_start_time = time.time()

        # Create topic context string for clue generation
        topic_context = " and ".join(topics)
        word_clue_pairs = self._generate_clue_pairs(
            thematic_results,
            topic=topic_context,
            clue_style="description",  # Use descriptive style for multi-topic
            difficulty="medium",
            fallback_clue=lambda word: f"Related to {topic_context}: {word}",
        )
        clue_gen_time = time.time() - clue_start_time
        total_time = time.time() - start_time

        return self._report_pipeline_test(
            subject_key="topics",
            subject=topics,
            heading=f"\n📋 Results for topics {topics}:",
            separator_width=70,
            num_words=num_words,
            word_clue_pairs=word_clue_pairs,
            word_gen_time=word_gen_time,
            clue_gen_time=clue_gen_time,
            total_time=total_time,
        )

    def run_custom_sentence_test(self, sentence: str, num_words: int = 10) -> Dict[str, Any]:
        """Test custom sentence word+clue generation.

        Args:
            sentence: Custom sentence input
            num_words: Number of words to generate

        Returns:
            Test results dictionary, or ``{"error": ...}`` when the word
            generator returned nothing.
        """
        print(f"\n🎯 Custom Sentence Test: '{sentence}'")
        print("-" * 60)

        start_time = time.time()

        # Step 1: Generate thematic words from sentence
        print(f"📝 Generating {num_words} words from sentence...")
        thematic_results, word_gen_time = self._generate_words(
            inputs=sentence,
            num_words=num_words,
            min_similarity=0.2  # Lower threshold for sentences
        )
        if not thematic_results:
            return {"error": "No thematic words generated"}

        # Step 2: Generate personalized clues
        print("🎭 Generating personalized clues...")
        clue_start_time = time.time()
        word_clue_pairs = self._generate_clue_pairs(
            thematic_results,
            # Use the original sentence as context for more personalized clues
            topic=f"theme: {sentence}",
            clue_style="description",
            difficulty="medium",
            fallback_clue=lambda word: f"From '{sentence[:30]}...': {word}",
        )
        clue_gen_time = time.time() - clue_start_time
        total_time = time.time() - start_time

        return self._report_pipeline_test(
            subject_key="sentence",
            subject=sentence,
            heading=f"\n📋 Results for sentence: '{sentence}'",
            separator_width=70,
            num_words=num_words,
            word_clue_pairs=word_clue_pairs,
            word_gen_time=word_gen_time,
            clue_gen_time=clue_gen_time,
            total_time=total_time,
        )

    def run_difficulty_comparison_test(self, topic: str, num_words: int = 6) -> Dict[str, Any]:
        """Test different difficulty levels for the same topic.

        The word list is generated once and reused, so the three difficulty
        levels are compared on identical words.

        Args:
            topic: Topic to test
            num_words: Number of words to generate

        Returns:
            Comparison results, or ``{"error": ...}`` when the word generator
            returned nothing.
        """
        print(f"\n🎯 Difficulty Comparison Test: '{topic}'")
        print("-" * 50)

        difficulties = ["easy", "medium", "hard"]
        results = {}

        # Generate words once (reuse for all difficulties)
        thematic_results = self.word_generator.generate_thematic_words(
            inputs=topic,
            num_words=num_words,
            min_similarity=0.3
        )[:num_words]  # Take only requested number

        if not thematic_results:
            return {"error": "No thematic words generated"}

        print(f"📝 Testing {len(thematic_results)} words at different difficulty levels...")

        for difficulty in difficulties:
            print(f"\n--- {difficulty.upper()} Difficulty ---")

            start_time = time.time()
            clue_pairs = self._generate_clue_pairs(
                thematic_results,
                topic=topic,
                clue_style="category",
                difficulty=difficulty,
                fallback_clue=lambda word, level=difficulty: f"{level.title()} clue for {word}",
                include_length=False,
                log_label=f"{difficulty} clue",
            )
            generation_time = time.time() - start_time

            results[difficulty] = {
                "clue_pairs": clue_pairs,
                "generation_time": generation_time
            }

            # Display this difficulty's results
            for i, item in enumerate(clue_pairs, 1):
                error_marker = " ⚠️" if 'error' in item else ""
                print(f" {i}. {item['word']:<10} - {item['clue']}{error_marker}")

        return {
            "topic": topic,
            "difficulties_tested": difficulties,
            "results": results,
            "base_words": [
                {"word": w, "similarity": s, "tier": t} for w, s, t in thematic_results
            ]
        }

    def run_performance_analysis(self) -> Dict[str, Any]:
        """Analyze overall performance characteristics.

        Aggregates the ``"performance"`` section of every entry in
        ``self.test_results`` that has one.

        Returns:
            Dictionary with ``word_gen_stats``, ``clue_gen_stats`` and
            ``total_stats`` (each holding ``avg``/``min``/``max``), or an empty
            dictionary when there is no timing data to aggregate.
        """
        print("\n📊 Performance Analysis")
        print("-" * 40)

        # Collect performance stats from previous tests
        if not self.test_results:
            print("⚠️ No test results available for performance analysis")
            return {}

        timings = [
            result['performance']
            for result in self.test_results.values()
            if 'performance' in result
        ]
        if not timings:
            # Returning all-zero stats here would read as "0.00s average" and
            # make the pipeline look fast when nothing was actually measured.
            print("⚠️ No timing data available for performance analysis")
            return {}

        sections = (
            ("📈 Word Generation Performance:", 'word_gen_time', "word_gen_stats"),
            ("\n🎭 Clue Generation Performance:", 'clue_gen_time', "clue_gen_stats"),
            ("\n⏱️ Total Pipeline Performance:", 'total_time', "total_stats"),
        )

        summary = {}
        for heading, time_key, stats_key in sections:
            stats = self._summarize_times([perf.get(time_key, 0) for perf in timings])
            print(heading)
            print(f" Average: {stats['avg']:.2f}s")
            print(f" Min: {stats['min']:.2f}s")
            print(f" Max: {stats['max']:.2f}s")
            summary[stats_key] = stats
        return summary

    def run_full_test_suite(self):
        """Run the complete test suite and print a final summary."""
        print("🧪 CROSSWORD CLUE GENERATION TEST SUITE")
        print("=" * 70)

        if not GENERATORS_AVAILABLE:
            print("❌ Cannot run tests - generators not available")
            return

        # Initialize
        self.initialize()

        # Test 1: Single topics
        self._print_section_header("TEST 1: SINGLE TOPIC TESTS")
        single_topics = ["animals", "technology", "music", "food"]
        for topic in single_topics:
            result = self.run_single_topic_test(topic, num_words=8)
            self.test_results[f"single_{topic}"] = result

        # Test 2: Multi-topic
        self._print_section_header("TEST 2: MULTI-TOPIC TEST")
        multi_result = self.run_multi_topic_test(["science", "technology"], num_words=10)
        self.test_results["multi_science_tech"] = multi_result

        # Test 3: Custom sentence
        self._print_section_header("TEST 3: CUSTOM SENTENCE TEST")
        sentence_result = self.run_custom_sentence_test("I love cats and playing guitar", num_words=8)
        self.test_results["sentence_cats_guitar"] = sentence_result

        # Test 4: Difficulty comparison
        self._print_section_header("TEST 4: DIFFICULTY COMPARISON")
        difficulty_result = self.run_difficulty_comparison_test("sports", num_words=5)
        self.test_results["difficulty_sports"] = difficulty_result

        # Test 5: Performance analysis
        self._print_section_header("TEST 5: PERFORMANCE ANALYSIS")
        perf_result = self.run_performance_analysis()
        self.test_results["performance"] = perf_result

        # Final summary
        self._print_section_header("📋 FINAL SUMMARY")
        print("✅ Test suite completed!")
        print(f"📊 Tests run: {len(self.test_results)}")

        # Model info
        word_info = {
            "vocab_size": self.word_generator.get_vocabulary_size(),
            "tier_distribution": len(self.word_generator.get_tier_distribution())
        }
        clue_info = self.clue_generator.get_model_info()

        print("\n🔧 System Information:")
        print(f" Word vocabulary: {word_info['vocab_size']:,} words")
        print(f" Clue model: {clue_info['model_name']}")
        print(f" Model size: {clue_info.get('model_size_mb', 0):.1f} MB")

        if perf_result:
            avg_total = perf_result['total_stats']['avg']
            print(f" Average pipeline time: {avg_total:.2f}s")

        print("\n💡 Recommendations for HF Spaces:")
        if perf_result and perf_result['total_stats']['avg'] < 15:
            print(" ✅ Performance suitable for interactive use")
        else:
            print(" ⚠️ Consider optimizations for better user experience")

        print("\n🎉 Test suite complete!")


def main():
    """Run the test suite."""
    if not GENERATORS_AVAILABLE:
        print("❌ Cannot run tests - required generators not available")
        print("Make sure thematic_word_generator.py and llm_clue_generator.py are working")
        return

    # Create and run test suite
    test_suite = CrosswordClueTestSuite(vocab_size_limit=50000)  # Use existing cached embeddings

    try:
        test_suite.run_full_test_suite()
    except KeyboardInterrupt:
        print("\n\n⏹️ Test suite interrupted by user")
    except Exception as e:
        # Top-level boundary: report to the user and log the full traceback.
        print(f"\n❌ Test suite failed: {e}")
        logger.exception("Test suite error: %s", e)


if __name__ == "__main__":
    main()