#!/usr/bin/env python3
"""
Quick Test: Semantic Template Clue Generation

Test the semantic template approach against the same problematic examples
that failed with LLM.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from semantic_clue_generator import SemanticClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    # Keep the module importable; test_semantic_clues() checks this flag
    # and bails out with a message instead of raising NameError.
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Clue styles requested for every word, in this order. The first non-empty
# clue returned is the one that gets quality-checked.
_CLUE_STYLES = ("category", "definition", "description")

# Substrings that mark a clue as junk in the quality check.
_JUNK_PHRASES = (
    'trick and treating',
    'gritting your teeth',
    'fender',
    'occurrence',
)


def _collect_candidates(generator, word, topic):
    """Return the distinct, non-empty clues generated for *word*, one per style."""
    candidates = []
    for style in _CLUE_STYLES:
        clue = generator.generate_clue(
            word=word,
            topic=topic,
            clue_style=style
        )
        if clue and clue not in candidates:
            candidates.append(clue)
    return candidates


def _is_good_clue(word, clue):
    """Return True when *clue* passes the quality heuristics for *word*.

    A clue is good when it is non-empty, longer than three characters, does
    not contain the answer word itself (case-insensitive), and contains none
    of the known junk phrases.
    """
    if not clue or len(clue) <= 3:
        return False
    lowered = clue.lower()
    if word.lower() in lowered:
        # A clue that spells out its own answer is useless.
        return False
    return not any(junk in lowered for junk in _JUNK_PHRASES)


def test_semantic_clues():
    """Test semantic template clue generation with problematic examples.

    Initializes a ``SemanticClueGenerator``, generates clues for a fixed list
    of (word, topic) pairs, prints each candidate and a GOOD/POOR verdict,
    and finishes with a summary of the success rate. Returns ``None``; it
    returns early (after printing a message) when the generator could not be
    imported or fails to initialize.
    """
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - Semantic generator not available")
        return

    print("🧪 Testing Semantic Template Clue Generation")
    print("=" * 60)

    # Initialize generator
    print("🔄 Initializing semantic clue generator...")
    generator = SemanticClueGenerator()
    try:
        generator.initialize()
        print("✅ Generator initialized successfully")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that failed with LLM - same examples user reported as junk
    test_cases = [
        # Previously bad LLM examples
        ("CAT", "animals"),
        ("KITTY", "animals"),
        ("MEAL", "food"),
        ("HUNGER", "food"),
        ("TECH", "technology"),
        ("SCIENTIST", "science"),

        # Additional test cases
        ("DOG", "animals"),
        ("PYTHON", "technology"),
        ("GUITAR", "music"),
        ("OCEAN", "geography"),
        ("ATOM", "science"),
        ("PIZZA", "food"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations")
    print("=" * 60)

    successful_clues = 0
    total_tests = len(test_cases)

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        try:
            # Generate multiple clues with different styles for variety
            candidates = _collect_candidates(generator, word, topic)

            print(f"Generated {len(candidates)} candidates:")
            for i, candidate in enumerate(candidates, 1):
                print(f" {i}. {candidate}")

            # Use the first/best clue
            best_clue = candidates[0] if candidates else None
            print(f"\n🏆 Best clue: {best_clue}")

            # Quality evaluation - more comprehensive than LLM test
            if _is_good_clue(word, best_clue):
                successful_clues += 1
                print("✅ Quality: GOOD")
            else:
                print("❌ Quality: POOR")

        except Exception as e:
            # One failing word must not abort the rest of the run; the
            # traceback goes to the log for later inspection.
            print(f"❌ Error generating clue: {e}")
            logger.exception("Detailed error:")

    print("\n" + "=" * 60)
    print("📊 SEMANTIC TEMPLATE RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Successful clues: {successful_clues}")
    print(f"Success rate: {(successful_clues/total_tests)*100:.1f}%")

    # Compare with LLM performance (which was ~0% success)
    if successful_clues >= total_tests * 0.8:  # 80% success rate
        print("🎉 Semantic templates show MAJOR improvement over LLM!")
    elif successful_clues >= total_tests * 0.6:  # 60% success rate
        print("🔄 Good improvement, semantic approach is viable")
    elif successful_clues >= total_tests * 0.3:  # 30% success rate
        print("⚠️ Some improvement, but templates need refinement")
    else:
        print("❌ Semantic approach also struggling, may need hybrid method")


def main():
    """Run the semantic template test."""
    test_semantic_clues()


if __name__ == "__main__":
    main()