#!/usr/bin/env python3
"""
Quick Test: Improved Prompt Engineering
Test the improved prompts and validation on a few examples to see if clue quality improved.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

# The clue generator is imported defensively: an ImportError is reported and
# recorded in GENERATOR_AVAILABLE rather than aborting the script at import
# time. test_improved_prompts() checks this flag before doing any work.
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    # Show the underlying reason so a missing module/dependency is easy to spot.
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging to see debug output.
# DEBUG is the most verbose standard level; each record is formatted as
# "<timestamp> - <logger name>:<line number> - <level> - <message>".
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


def _clue_passes_quality_check(word, clue):
    """Return True when *clue* looks usable as a clue for *word*.

    The heuristic requires that the clue is a string, has more than five
    characters once surrounding whitespace is removed, and does not contain
    the answer word itself (compared case-insensitively).
    """
    if not isinstance(clue, str):
        # Covers None and any unexpected non-text return value.
        return False
    text = clue.strip()  # padding must not count toward the length threshold
    return len(text) > 5 and word.lower() not in text.lower()


def _verdict_for(successful, total):
    """Return the one-line verdict for *successful* good clues out of *total*."""
    if total == 0:
        # Without this guard the 70% branch would trivially succeed (0 >= 0).
        return "⚠️ No test cases were run"
    if successful >= total * 0.7:  # 70% success rate
        return "🎉 Improved prompts show significant improvement!"
    if successful >= total * 0.4:  # 40% success rate
        return "🔄 Some improvement, but may need model upgrade"
    return "❌ Prompts still not effective, recommend semantic template approach"


def test_improved_prompts():
    """Generate clues for a fixed set of words and print a quality summary.

    For every (word, topic, style) case the generator is asked for three
    candidate clues and then for its single best clue, both at "medium"
    difficulty. The best clue is rated GOOD or POOR by a simple heuristic
    (see ``_clue_passes_quality_check``), and an overall success rate plus
    verdict is printed at the end.

    The function prints its results and returns None. It returns early when
    the generator could not be imported or fails to initialize. Exceptions
    raised for an individual case are reported and that case is not counted
    as successful; the full traceback is sent to the module logger at DEBUG
    level.
    """
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing Improved Prompt Engineering")
    print("=" * 60)

    # Initialize generator
    print("🔄 Initializing LLM clue generator...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        # Keep the traceback available for debugging without changing stdout.
        logger.debug("Generator initialization failed", exc_info=True)
        return
    else:
        print("✅ Generator initialized successfully")

    # Test cases that previously produced bad clues
    test_cases = [
        # Previously bad examples
        ("CAT", "animals", "definition"),
        ("KITTY", "animals", "description"),
        ("MEAL", "food", "category"),
        ("HUNGER", "food", "simple"),
        ("TECH", "technology", "category"),
        ("SCIENTIST", "science", "trivia"),

        # Additional test cases
        ("DOG", "animals", "definition"),
        ("PYTHON", "technology", "description"),
        ("GUITAR", "music", "category"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations")
    print("=" * 60)

    successful_clues = 0
    total_tests = len(test_cases)

    for word, topic, style in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}' (style: {style})")
        print("-" * 40)

        try:
            # Generate clue candidates to see the process
            candidates = generator.generate_clue_candidates(
                word=word,
                topic=topic,
                clue_style=style,
                difficulty="medium",
                num_candidates=3
            )

            print(f"Generated {len(candidates)} candidates:")
            for i, candidate in enumerate(candidates, 1):
                print(f"  {i}. {candidate}")

            # Get best clue
            best_clue = generator.generate_clue(
                word=word,
                topic=topic,
                clue_style=style,
                difficulty="medium"
            )

            print(f"\n🏆 Best clue: {best_clue}")

            # Evaluate quality
            if _clue_passes_quality_check(word, best_clue):
                successful_clues += 1
                print("✅ Quality: GOOD")
            else:
                print("❌ Quality: POOR")

        except Exception as e:
            # One failing case must not abort the run; it simply is not
            # counted as a success.
            print(f"❌ Error generating clue: {e}")
            logger.debug("Clue generation failed for %r", word, exc_info=True)

    # Guard the division so an emptied case list cannot raise ZeroDivisionError.
    success_rate = (successful_clues / total_tests) * 100 if total_tests else 0.0

    print("\n" + "=" * 60)
    print("📊 RESULTS SUMMARY")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Successful clues: {successful_clues}")
    print(f"Success rate: {success_rate:.1f}%")

    print(_verdict_for(successful_clues, total_tests))


def main() -> None:
    """Script entry point: execute the improved-prompt quick test once."""
    test_improved_prompts()


# Run the test only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()