#!/usr/bin/env python3
"""
Quick Test: Semantic Template Clue Generation
Tests the semantic template approach against the same problematic examples that failed with the LLM approach.
"""

import sys
import logging
from pathlib import Path

# Put this script's own directory first on the import path so the sibling
# generator module can be imported below.
sys.path.insert(0, str(Path(__file__).parent))

# The generator is optional at import time: record whether it could be loaded
# so the test can bail out with a message instead of crashing.
try:
    from semantic_clue_generator import SemanticClueGenerator
except ImportError as import_error:
    print(f"❌ Import error: {import_error}")
    GENERATOR_AVAILABLE = False
else:
    GENERATOR_AVAILABLE = True

# Timestamped INFO-level logging; the module logger is used for tracebacks.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)


# Clue styles requested for every word, in the order candidates are listed.
_CLUE_STYLES = ("category", "definition", "description")

# A clue containing any of these substrings (case-insensitive) is rated POOR.
_JUNK_PHRASES = (
    'trick and treating',
    'gritting your teeth',
    'fender',
    'occurrence',
)


def _collect_candidates(generator, word, topic):
    """Return the distinct, non-empty clues for *word*, one attempt per style."""
    candidates = []
    for style in _CLUE_STYLES:
        clue = generator.generate_clue(
            word=word,
            topic=topic,
            clue_style=style,
        )
        # Skip empty results and clues already produced by an earlier style.
        if clue and clue not in candidates:
            candidates.append(clue)
    return candidates


def _is_good_clue(word, clue):
    """Return True when *clue* passes the quality heuristics for *word*.

    A clue is good when it is non-empty, longer than three characters, does
    not contain the answer word itself, and contains no known junk phrase.
    All text comparisons are case-insensitive.
    """
    if not clue or len(clue) <= 3:
        return False
    lowered = clue.lower()
    if word.lower() in lowered:
        # The clue gives away the answer.
        return False
    return not any(junk in lowered for junk in _JUNK_PHRASES)


def _overall_verdict(successful, total):
    """Return the summary message for *successful* good clues out of *total*."""
    if successful >= total * 0.8:  # 80% success rate
        return "🎉 Semantic templates show MAJOR improvement over LLM!"
    if successful >= total * 0.6:  # 60% success rate
        return "🔄 Good improvement, semantic approach is viable"
    if successful >= total * 0.3:  # 30% success rate
        return "⚠️  Some improvement, but templates need refinement"
    return "❌ Semantic approach also struggling, may need hybrid method"


def test_semantic_clues():
    """Test semantic template clue generation with problematic examples.

    Builds and initializes a ``SemanticClueGenerator``, requests one clue per
    style for each hard-coded (word, topic) pair, prints the candidates, rates
    the first candidate with simple heuristics, and prints a summary with the
    success rate and an overall verdict.

    Returns:
        None. All results are reported on stdout. The function returns early
        (after printing a message) when the generator module could not be
        imported or when ``initialize()`` raises.
    """
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - Semantic generator not available")
        return

    print("🧪 Testing Semantic Template Clue Generation")
    print("=" * 60)

    # Initialize generator
    print("🔄 Initializing semantic clue generator...")
    generator = SemanticClueGenerator()

    try:
        generator.initialize()
        print("✅ Generator initialized successfully")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that failed with LLM - same examples user reported as junk
    test_cases = [
        # Previously bad LLM examples
        ("CAT", "animals"),
        ("KITTY", "animals"),
        ("MEAL", "food"),
        ("HUNGER", "food"),
        ("TECH", "technology"),
        ("SCIENTIST", "science"),

        # Additional test cases
        ("DOG", "animals"),
        ("PYTHON", "technology"),
        ("GUITAR", "music"),
        ("OCEAN", "geography"),
        ("ATOM", "science"),
        ("PIZZA", "food"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations")
    print("=" * 60)

    successful_clues = 0
    total_tests = len(test_cases)

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        # A failure for one word is reported and logged, then the run
        # continues with the next word; the failed word counts as not good.
        try:
            candidates = _collect_candidates(generator, word, topic)

            print(f"Generated {len(candidates)} candidates:")
            for i, candidate in enumerate(candidates, 1):
                print(f"  {i}. {candidate}")

            # Use the first/best clue
            best_clue = candidates[0] if candidates else None

            print(f"\n🏆 Best clue: {best_clue}")

            if _is_good_clue(word, best_clue):
                successful_clues += 1
                print("✅ Quality: GOOD")
            else:
                print("❌ Quality: POOR")

        except Exception as e:
            print(f"❌ Error generating clue: {e}")
            logger.exception("Detailed error:")

    print("\n" + "=" * 60)
    print("📊 SEMANTIC TEMPLATE RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Successful clues: {successful_clues}")
    print(f"Success rate: {(successful_clues/total_tests)*100:.1f}%")

    # Compare with LLM performance (which was ~0% success)
    print(_overall_verdict(successful_clues, total_tests))


def main() -> None:
    """Run the semantic template test.

    Script entry point: calls ``test_semantic_clues()``, which prints its
    results to stdout.
    """
    test_semantic_clues()


# Run the test only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()