#!/usr/bin/env python3
"""
Test: Upgraded flan-t5-base Model for Crossword Clue Generation

Compare flan-t5-base performance against the previous flan-t5-small results.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_flan_t5_base():
    """Test flan-t5-base model with problematic examples that failed with flan-t5-small."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing Upgraded flan-t5-base Model")
    print("=" * 60)

    # Initialize generator with base model
    print("🔄 Initializing flan-t5-base clue generator...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~1GB (vs ~80MB for flan-t5-small)")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that produced terrible results with flan-t5-small
    test_cases = [
        # Previous failures with flan-t5-small:
        #   CAT + animals → "Tryon", "Trick and treating"
        #   MEAL + food   → "Jack nixt", "fender"
        #   SONG + music  → "Gritting your teeth"
        ("CAT", "animals"),
        ("KITTY", "animals"),
        ("MEAL", "food"),
        ("HUNGER", "food"),
        ("SONG", "music"),
        ("GUITAR", "music"),

        # Your specific problematic examples
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),

        # Additional challenging cases
        ("DATABASE", "technology"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations with flan-t5-base")
    print("=" * 60)

    excellent_clues = 0
    good_clues = 0
    poor_clues = 0
    failed_clues = 0

    # Track specific improvements over flan-t5-small
    major_improvements = []

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        try:
            # Test multiple clue styles to get best result
            styles = ["definition", "description", "category", "function", "context"]
            candidates = []

            for style in styles:
                clue = generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=style,
                    difficulty="medium"
                )
                if clue and len(clue) > 5:
                    candidates.append((style, clue))

            if candidates:
                print(f"Generated {len(candidates)} candidates:")
                for i, (style, clue) in enumerate(candidates, 1):
                    print(f"  {i}. [{style}] {clue}")

                # Use the first valid clue as best
                best_style, best_clue = candidates[0]
                print(f"\n🏆 Best clue [{best_style}]: {best_clue}")

                # Enhanced quality evaluation
                word_lower = word.lower()
                clue_lower = best_clue.lower()

                # Check if clue contains the target word (it should not)
                contains_word = word_lower in clue_lower

                # Check for nonsense patterns from flan-t5-small
                old_nonsense = any(bad in clue_lower for bad in [
                    "trick and treating", "gritting your teeth", "jack nixt",
                    "fender", "tryon", "nicolas", "occurrence", "sludge"
                ])

                # Check for descriptive quality
                is_descriptive = (
                    len(best_clue.split()) >= 2 and
                    len(best_clue) >= 8 and
                    not contains_word and
                    not old_nonsense
                )

                # Check for definitional quality
                is_definitional = (
                    any(def_word in clue_lower for def_word in [
                        "player", "sport", "instrument", "device", "system",
                        "food", "language", "place", "animal", "creature", "location"
                    ]) and not contains_word
                )

                if contains_word:
                    print("❌ Quality: POOR (contains target word)")
                    poor_clues += 1
                elif old_nonsense:
                    print("❌ Quality: POOR (nonsensical)")
                    poor_clues += 1
                elif is_definitional:
                    print("✅ Quality: EXCELLENT (definitional)")
                    excellent_clues += 1
                    major_improvements.append((word, topic, best_clue))
                elif is_descriptive:
                    print("✅ Quality: GOOD (descriptive)")
                    good_clues += 1
                    major_improvements.append((word, topic, best_clue))
                else:
                    print("🔄 Quality: ACCEPTABLE")
                    good_clues += 1
            else:
                print("❌ No valid clues generated")
                failed_clues += 1

        except Exception as e:
            print(f"❌ Error generating clue: {e}")
            failed_clues += 1

    total_tests = len(test_cases)

    print("\n" + "=" * 60)
    print("📊 FLAN-T5-BASE RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_clues}")
    print(f"Good clues: {good_clues}")
    print(f"Poor clues: {poor_clues}")
    print(f"Failed clues: {failed_clues}")
    print(f"Success rate: {((excellent_clues + good_clues) / total_tests) * 100:.1f}%")
    print(f"Excellence rate: {(excellent_clues / total_tests) * 100:.1f}%")

    # Show major improvements
    if major_improvements:
        print("\n🎉 MAJOR IMPROVEMENTS OVER FLAN-T5-SMALL:")
        print("-" * 60)
        for word, topic, clue in major_improvements[:5]:  # Show top 5
            print(f"  {word} + {topic}: \"{clue}\"")

    # Evaluation compared to flan-t5-small (which had ~0% success)
    if excellent_clues >= total_tests * 0.4:  # 40% excellent
        print("🎉 MAJOR SUCCESS! flan-t5-base produces excellent clues!")
        print("🚀 Ready for production use - significant upgrade from flan-t5-small")
    elif (excellent_clues + good_clues) >= total_tests * 0.6:  # 60% good + excellent
        print("🔄 Good improvement! Much better than flan-t5-small")
        print("✅ Suitable for production with semantic fallback")
    elif (excellent_clues + good_clues) >= total_tests * 0.3:  # 30% success
        print("⚠️ Some improvement over flan-t5-small, but still limited")
    else:
        print("❌ Still struggling - may need even larger model or external knowledge")


def main():
    """Run the flan-t5-base upgrade test."""
    test_flan_t5_base()


if __name__ == "__main__":
    main()