#!/usr/bin/env python3
"""
Enhanced Test: Semantic Clue Generator with ThematicWordGenerator Integration
Test semantic clue generation with proper embedding integration for better quality.
"""

import sys
import logging
from pathlib import Path

# Put this script's own directory at the front of sys.path so the
# generator modules imported below are looked up there first.
sys.path.insert(0, str(Path(__file__).parent))

# The flag stays False unless both generator imports succeed.
GENERATOR_AVAILABLE = False
try:
    from semantic_clue_generator import SemanticClueGenerator
    from thematic_word_generator import UnifiedThematicWordGenerator
except ImportError as import_error:
    # Report the problem but keep the module importable; the entry points
    # consult GENERATOR_AVAILABLE before touching the generators.
    print(f"❌ Import error: {import_error}")
else:
    GENERATOR_AVAILABLE = True

# Configure the root logger: INFO and above, with timestamp, logger name
# and level in front of every message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger named after this module, used for detailed error reports.
logger = logging.getLogger(__name__)


def _grade_clue(word, clue):
    """Classify a generated clue using this script's quality heuristics.

    Args:
        word: The answer word the clue was generated for.
        clue: The non-empty clue text to evaluate.

    Returns:
        One of ``"poor"``, ``"acceptable"``, ``"good"`` or ``"excellent"``.
    """
    lowered = clue.lower()
    # Trivially short clues, or clues that contain the answer, are unusable.
    if len(clue) <= 3 or word.lower() in lowered:
        return "poor"
    # These phrases are treated as markers of a generic fallback clue.
    if "term related to" in lowered or "associated with" in lowered:
        return "acceptable"
    # Three or more words counts as a descriptive clue.
    if len(clue.split()) >= 3:
        return "excellent"
    return "good"


def test_enhanced_semantic_clues():
    """Run the batch clue-generation check and print a quality report.

    Initializes a ``UnifiedThematicWordGenerator`` and a
    ``SemanticClueGenerator`` wired to it, generates clues for a fixed list
    of word/topic pairs in three styles, grades the first candidate of each
    pair and prints per-word results followed by an overall summary.

    Returns early (after printing a message) when the generator modules
    could not be imported or when either generator fails to initialize.
    A failure in one clue style is reported and skipped so the remaining
    styles for that word are still evaluated.

    Returns:
        None. All results are written to stdout.
    """
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - Enhanced generators not available")
        return

    print("🧪 Testing Enhanced Semantic Clue Generation")
    print("=" * 60)

    # Initialize thematic word generator first
    print("🔄 Initializing thematic word generator...")
    try:
        word_gen = UnifiedThematicWordGenerator(vocab_size_limit=50000)
        word_gen.initialize()
        print("✅ Thematic word generator initialized successfully")
    except Exception as e:
        print(f"❌ Failed to initialize thematic word generator: {e}")
        return

    # Initialize semantic clue generator with thematic integration
    print("🔄 Initializing semantic clue generator with thematic integration...")
    clue_gen = SemanticClueGenerator(thematic_word_generator=word_gen)

    try:
        clue_gen.initialize()
        print("✅ Semantic clue generator initialized successfully")
    except Exception as e:
        print(f"❌ Failed to initialize semantic clue generator: {e}")
        return

    # Test cases that previously failed with LLM
    test_cases = [
        # Previously problematic examples
        ("CAT", "animals"),
        ("KITTY", "animals"),
        ("MEAL", "food"),
        ("HUNGER", "food"),
        ("TECH", "technology"),
        ("SCIENTIST", "science"),

        # Additional challenging cases
        ("DOG", "animals"),
        ("PYTHON", "technology"),
        ("GUITAR", "music"),
        ("OCEAN", "geography"),
        ("ATOM", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
        ("VIOLIN", "music"),
        ("DATABASE", "technology"),
    ]

    # Every word is tried with each of these styles for variety.
    styles = ("category", "definition", "description")

    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations with enhanced semantic analysis")
    print("=" * 60)

    successful_clues = 0
    total_tests = len(test_cases)
    high_quality_clues = 0

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        try:
            candidates = []
            for style in styles:
                # Isolate each style: one failing style must not discard the
                # candidates produced by the others.
                try:
                    clue = clue_gen.generate_clue(
                        word=word,
                        topic=topic,
                        clue_style=style,
                        difficulty="medium"
                    )
                except Exception as e:
                    print(f"❌ Error generating clue ({style} style): {e}")
                    logger.exception("Detailed error:")
                    continue
                # Keep only non-empty clues, without duplicates.
                if clue and clue not in candidates:
                    candidates.append(clue)

            print(f"Generated {len(candidates)} candidates:")
            for i, candidate in enumerate(candidates, 1):
                print(f"  {i}. {candidate}")

            # Use the best clue (first one)
            best_clue = candidates[0] if candidates else None

            print(f"\n🏆 Best clue: {best_clue}")

            if not best_clue:
                print("❌ No clue generated")
                continue

            # Everything except "poor" counts as a successful clue; only
            # "excellent" additionally counts as high quality.
            grade = _grade_clue(word, best_clue)
            if grade == "excellent":
                high_quality_clues += 1
                successful_clues += 1
                print("✅ Quality: EXCELLENT")
            elif grade == "good":
                successful_clues += 1
                print("✅ Quality: GOOD")
            elif grade == "acceptable":
                successful_clues += 1
                print("🔄 Quality: ACCEPTABLE (generic)")
            else:
                print("❌ Quality: POOR")

        except Exception as e:
            # Safety net so an unexpected problem with one word does not
            # abort the remaining test cases.
            print(f"❌ Error generating clue: {e}")
            logger.exception("Detailed error:")

    print("\n" + "=" * 60)
    print("📊 ENHANCED SEMANTIC RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Successful clues: {successful_clues}")
    print(f"High quality clues: {high_quality_clues}")
    print(f"Overall success rate: {(successful_clues/total_tests)*100:.1f}%")
    print(f"High quality rate: {(high_quality_clues/total_tests)*100:.1f}%")

    # Enhanced evaluation criteria
    if high_quality_clues >= total_tests * 0.6:  # 60% high quality
        print("🎉 Enhanced semantic approach produces excellent clues!")
        print("🚀 Ready for integration into main crossword application")
    elif successful_clues >= total_tests * 0.8:  # 80% acceptable
        print("🔄 Good improvement over LLM, suitable for production use")
    elif successful_clues >= total_tests * 0.6:  # 60% acceptable
        print("⚠️  Decent improvement, may need more template refinement")
    else:
        print("❌ Still struggling, consider alternative approaches")


def interactive_test():
    """Prompt for ``word,topic`` pairs and show the clues generated for each.

    Initializes a ``UnifiedThematicWordGenerator`` and a
    ``SemanticClueGenerator`` wired to it, then reads lines from stdin until
    the user types ``quit``/``exit``/``q``, presses Ctrl-C, or stdin reaches
    end-of-file. Typing ``batch`` runs ``test_enhanced_semantic_clues()`` and
    then returns to the prompt.

    Returns early (after printing a message) when the generator modules
    could not be imported or when either generator fails to initialize.

    Returns:
        None. All results are written to stdout.
    """
    # Same availability guard as the batch test, so a failed import is
    # reported clearly instead of surfacing as a NameError further down.
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - Enhanced generators not available")
        return

    print("🧪 Interactive Semantic Clue Testing")
    print("=" * 60)

    # Initialize thematic word generator first
    print("🔄 Initializing thematic word generator...")
    try:
        word_gen = UnifiedThematicWordGenerator(vocab_size_limit=50000)
        word_gen.initialize()
        print("✅ Thematic word generator initialized successfully")
    except Exception as e:
        print(f"❌ Failed to initialize thematic word generator: {e}")
        return

    # Initialize semantic clue generator with thematic integration
    print("🔄 Initializing semantic clue generator with thematic integration...")
    clue_gen = SemanticClueGenerator(thematic_word_generator=word_gen)

    try:
        clue_gen.initialize()
        print("✅ Semantic clue generator initialized successfully")
    except Exception as e:
        print(f"❌ Failed to initialize semantic clue generator: {e}")
        return

    print("\n" + "=" * 60)
    print("🎯 INTERACTIVE MODE")
    print("=" * 60)
    print("Enter word-topic pairs to test clue generation.")
    print("Format: word,topic (e.g., 'cat,animals')")
    print("Type 'quit' or 'exit' to stop.")
    print("Type 'batch' to run the full test suite.")
    print("-" * 60)

    # Every word is tried with each of these styles for variety.
    styles = ("category", "definition", "description")

    while True:
        try:
            user_input = input("\n📝 Enter word,topic: ").strip()
            command = user_input.lower()

            if command in ('quit', 'exit', 'q'):
                print("👋 Goodbye!")
                break
            if command == 'batch':
                print("\n🔄 Running full test suite...")
                test_enhanced_semantic_clues()
                print("\n" + "=" * 60)
                print("🎯 Back to interactive mode")
                print("-" * 60)
                continue
            if not user_input or ',' not in user_input:
                print("❌ Invalid format. Use: word,topic (e.g., 'cat,animals')")
                continue

            # Split on the first comma only, so the topic may contain commas.
            raw_word, raw_topic = user_input.split(',', 1)
            word = raw_word.strip().upper()
            topic = raw_topic.strip().lower()

            if not word or not topic:
                print("❌ Both word and topic are required")
                continue

            print(f"\n📝 Testing: '{word}' + '{topic}'")
            print("-" * 40)

            candidates = []
            for style in styles:
                try:
                    clue = clue_gen.generate_clue(
                        word=word,
                        topic=topic,
                        clue_style=style,
                        difficulty="medium"
                    )
                except Exception as e:
                    # Best effort per style, but make the failure visible at
                    # the configured INFO log level.
                    logger.warning("Error with style %s: %s", style, e)
                    continue
                # Keep only non-empty clues, without duplicates.
                if clue and clue not in candidates:
                    candidates.append(clue)

            if not candidates:
                print("❌ No clues generated")
                continue

            print(f"Generated {len(candidates)} candidates:")
            for i, candidate in enumerate(candidates, 1):
                print(f"  {i}. {candidate}")

            # The first candidate is treated as the best clue.
            best_clue = candidates[0]
            print(f"\n🏆 Best clue: {best_clue}")

            # Quality evaluation
            lowered = best_clue.lower()
            if len(best_clue) <= 3 or word.lower() in lowered:
                # Too short, or the clue gives away the answer.
                print("❌ Quality: POOR")
            elif "term related to" in lowered or "associated with" in lowered:
                print("🔄 Quality: ACCEPTABLE (generic)")
            elif len(best_clue.split()) >= 3:
                print("✅ Quality: EXCELLENT")
            else:
                print("✅ Quality: GOOD")

        except (KeyboardInterrupt, EOFError):
            # EOFError means stdin is exhausted (Ctrl-D or piped input).
            # Retrying input() would fail again forever, so leave the loop
            # exactly as for Ctrl-C.
            print("\n👋 Goodbye!")
            break
        except Exception as e:
            print(f"❌ Error: {e}")


def main():
    """Dispatch to interactive mode or the batch test from the command line.

    When the first command-line argument is exactly ``--interactive`` the
    interactive prompt is started; in every other case a usage hint is
    printed and the full batch test runs.
    """
    import sys

    cli_args = sys.argv[1:]
    wants_interactive = bool(cli_args) and cli_args[0] == '--interactive'

    if wants_interactive:
        interactive_test()
        return

    print("Run with --interactive for user input mode, or without args for full test.")
    test_enhanced_semantic_clues()


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()