#!/usr/bin/env python3
"""
Quick Test: Improved Prompt Engineering

Test the improved prompts and validation on a few examples
to see if clue quality improved.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

# The generator is optional at import time: when it cannot be imported the
# script still loads and the test reports that it cannot run.
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging to see debug output
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


def test_improved_prompts():
    """Test improved prompt engineering with problematic examples.

    For each hard-coded (word, topic, style) case this prints the clue
    candidates returned by ``generate_clue_candidates`` and the clue returned
    by ``generate_clue``, applies a simple quality heuristic to that clue,
    and finally prints a summary with the overall success rate.

    Returns:
        None. All results are reported on stdout. The function returns early
        when the generator module could not be imported or when
        ``initialize()`` raises.
    """
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing Improved Prompt Engineering")
    print("=" * 60)

    # Initialize generator
    print("🔄 Initializing LLM clue generator...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print("✅ Generator initialized successfully")
    except Exception as e:
        # Nothing below can run without an initialized generator.
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that previously produced bad clues
    test_cases = [
        # Previously bad examples
        ("CAT", "animals", "definition"),
        ("KITTY", "animals", "description"),
        ("MEAL", "food", "category"),
        ("HUNGER", "food", "simple"),
        ("TECH", "technology", "category"),
        ("SCIENTIST", "science", "trivia"),

        # Additional test cases
        ("DOG", "animals", "definition"),
        ("PYTHON", "technology", "description"),
        ("GUITAR", "music", "category"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations")
    print("=" * 60)

    successful_clues = 0
    total_tests = len(test_cases)

    for word, topic, style in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}' (style: {style})")
        print("-" * 40)

        try:
            # Generate clue candidates to see the process
            candidates = generator.generate_clue_candidates(
                word=word,
                topic=topic,
                clue_style=style,
                difficulty="medium",
                num_candidates=3
            )

            print(f"Generated {len(candidates)} candidates:")
            for i, candidate in enumerate(candidates, 1):
                print(f"  {i}. {candidate}")

            # Get best clue
            best_clue = generator.generate_clue(
                word=word,
                topic=topic,
                clue_style=style,
                difficulty="medium"
            )

            print(f"\n🏆 Best clue: {best_clue}")

            # Evaluate quality: a clue counts as good when it is non-empty,
            # longer than 5 characters, and does not contain the answer word
            # (compared case-insensitively).
            if best_clue and len(best_clue) > 5 and word.lower() not in best_clue.lower():
                successful_clues += 1
                print("✅ Quality: GOOD")
            else:
                print("❌ Quality: POOR")

        except Exception as e:
            # A failure on one case is reported and counted as unsuccessful;
            # the remaining cases still run.
            print(f"❌ Error generating clue: {e}")

    print("\n" + "=" * 60)
    print("📊 RESULTS SUMMARY")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Successful clues: {successful_clues}")
    print(f"Success rate: {(successful_clues/total_tests)*100:.1f}%")

    if successful_clues >= total_tests * 0.7:  # 70% success rate
        print("🎉 Improved prompts show significant improvement!")
    elif successful_clues >= total_tests * 0.4:  # 40% success rate
        print("🔄 Some improvement, but may need model upgrade")
    else:
        print("❌ Prompts still not effective, recommend semantic template approach")


def main():
    """Run the prompt improvement test."""
    test_improved_prompts()


if __name__ == "__main__":
    main()