#!/usr/bin/env python3
"""
Test: flan-t5-large Model for Superior Crossword Clue Generation

Test the most capable model to eliminate generic responses and achieve excellence.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_flan_t5_large():
    """Test flan-t5-large model for superior crossword clue quality."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing flan-t5-large Model (No Fallbacks)")
    print("=" * 60)

    # Initialize generator with large model
    print("🔄 Initializing flan-t5-large clue generator...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~3GB (3x larger than base, 37x larger than small)")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        print("💡 Note: flan-t5-large requires ~3GB RAM and longer initialization time")
        return

    # Challenging test cases that should be handled well by a large model
    test_cases = [
        # Basic cases that failed with smaller models
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),

        # More challenging cases requiring world knowledge
        ("VIOLIN", "music"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
        ("HELICOPTER", "transportation"),
        ("DEMOCRACY", "politics"),
        ("PHOTOSYNTHESIS", "science"),

        # Abstract concepts
        ("HAPPINESS", "emotions"),
        ("ALGORITHM", "technology"),
        ("METAPHOR", "literature"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} challenging word-topic combinations")
    print("=" * 60)

    excellent_clues = 0
    good_clues = 0
    generic_clues = 0
    poor_clues = 0

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        try:
            # Test the best-performing clue style
            best_clue = generator.generate_clue(
                word=word,
                topic=topic,
                clue_style="definition",  # Usually produces the best results
                difficulty="medium"
            )

            if best_clue and len(best_clue) > 3:
                print(f"🏆 Generated clue: {best_clue}")

                # Comprehensive quality evaluation
                word_lower = word.lower()
                clue_lower = best_clue.lower()

                # Critical quality checks
                contains_word = word_lower in clue_lower
                is_generic = any(generic in clue_lower for generic in [
                    "make it moderately challenging", "make it challenging",
                    "make it difficult", "make it easier",
                    "moderately challenging", "difficult", "easy"
                ])
                is_nonsensical = any(nonsense in clue_lower for nonsense in [
                    "a) a) a)", "trick and treating", "gritting your teeth",
                    "jack nixt", "fender", "tryon"
                ])

                # Positive quality indicators
                has_definition = any(def_word in clue_lower for def_word in [
                    "player", "instrument", "device", "system", "place", "location",
                    "animal", "creature", "building", "process", "method", "concept",
                    "sport", "activity", "food", "dish", "language", "tool"
                ])

                is_descriptive = (
                    len(best_clue.split()) >= 3 and
                    len(best_clue) >= 10 and
                    not contains_word and
                    not is_generic and
                    not is_nonsensical
                )

                # Quality scoring
                if contains_word:
                    print("❌ Quality: POOR (contains target word)")
                    poor_clues += 1
                elif is_nonsensical:
                    print("❌ Quality: POOR (nonsensical)")
                    poor_clues += 1
                elif is_generic:
                    print("⚠️ Quality: GENERIC (template response)")
                    generic_clues += 1
                elif has_definition and is_descriptive:
                    print("✅ Quality: EXCELLENT (definitional & descriptive)")
                    excellent_clues += 1
                elif is_descriptive:
                    print("✅ Quality: GOOD (descriptive)")
                    good_clues += 1
                elif has_definition:
                    print("🔄 Quality: ACCEPTABLE (basic definition)")
                    good_clues += 1
                else:
                    print("⚠️ Quality: GENERIC (basic)")
                    generic_clues += 1
            else:
                print("❌ No valid clue generated")
                poor_clues += 1

        except Exception as e:
            print(f"❌ Error generating clue: {e}")
            poor_clues += 1

    total_tests = len(test_cases)
    print("\n" + "=" * 60)
    print("📊 FLAN-T5-LARGE RESULTS (NO FALLBACKS)")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_clues}")
    print(f"Good clues: {good_clues}")
    print(f"Generic clues: {generic_clues}")
    print(f"Poor clues: {poor_clues}")
    print(f"Success rate: {((excellent_clues + good_clues) / total_tests) * 100:.1f}%")
    print(f"Excellence rate: {(excellent_clues / total_tests) * 100:.1f}%")
    print(f"Generic rate: {(generic_clues / total_tests) * 100:.1f}%")

    # Final evaluation - high standards for large model
    if excellent_clues >= total_tests * 0.6:  # 60% excellent
        print("🎉 SUCCESS! flan-t5-large produces excellent crossword clues!")
        print("🚀 Ready for production - no fallbacks needed!")
    elif excellent_clues >= total_tests * 0.4 and generic_clues <= total_tests * 0.2:  # 40% excellent, <20% generic
        print("🔄 Very good! flan-t5-large is suitable for production")
        print("✅ Significant improvement over smaller models")
    elif (excellent_clues + good_clues) >= total_tests * 0.7:  # 70% good+excellent
        print("⚠️ Good results, but some generic responses remain")
        print("💡 Consider prompt engineering refinements")
    else:
        print("❌ Still not meeting quality standards")
        print("💡 May need flan-t5-xl (~11GB) or different approach")


def main():
    """Run the flan-t5-large test."""
    test_flan_t5_large()


if __name__ == "__main__":
    main()