#!/usr/bin/env python3
"""
Test: Superior Prompt Engineering with flan-t5-base

Test if better prompts with examples can achieve excellence without larger models.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_superior_prompts():
    """Test superior prompt engineering with flan-t5-base."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing Superior Prompt Engineering")
    print("=" * 60)
    print("🎯 Strategy: Better prompts with examples vs larger models")

    # Initialize generator
    print("🔄 Initializing flan-t5-base with superior prompts...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~1GB with enhanced example-based prompts")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that should work well with good examples
    test_cases = [
        # Cases that failed with poor prompting
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),
        # Additional challenging cases
        ("VIOLIN", "music"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
        ("ELEPHANT", "animals"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} cases with superior prompts")
    print("=" * 60)

    excellent_results = []
    good_results = []
    poor_results = []

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        try:
            # Test different prompt styles
            results = {}
            for style in ["definition", "description", "simple"]:
                clue = generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=style,
                    difficulty="medium"
                )
                if clue and len(clue) > 3:
                    results[style] = clue

            if results:
                print("Generated clues:")
                for style, clue in results.items():
                    print(f"  [{style}] {clue}")

                # Use the best result
                best_style = "definition" if "definition" in results else list(results.keys())[0]
                best_clue = results[best_style]
                print(f"\n🏆 Best clue [{best_style}]: {best_clue}")

                # Quality evaluation
                word_lower = word.lower()
                clue_lower = best_clue.lower()

                # Quality checks
                contains_word = word_lower in clue_lower
                is_generic = any(bad in clue_lower for bad in [
                    "make it", "moderately challenging", "difficult", "easy"
                ])
                is_descriptive = len(best_clue.split()) >= 2 and len(best_clue) >= 6
                has_quality_words = any(good in clue_lower for good in [
                    "instrument", "player", "animal", "device", "system",
                    "terminal", "companion", "professional", "activity",
                    "dish", "creature"
                ])

                # Scoring
                if contains_word:
                    print("❌ Quality: POOR (contains target word)")
                    poor_results.append((word, topic, best_clue, "contains word"))
                elif is_generic:
                    print("⚠️ Quality: GENERIC (template response)")
                    poor_results.append((word, topic, best_clue, "generic"))
                elif has_quality_words and is_descriptive:
                    print("✅ Quality: EXCELLENT (specific & descriptive)")
                    excellent_results.append((word, topic, best_clue))
                elif is_descriptive:
                    print("✅ Quality: GOOD (descriptive)")
                    good_results.append((word, topic, best_clue))
                else:
                    print("🔄 Quality: ACCEPTABLE")
                    good_results.append((word, topic, best_clue))
            else:
                print("❌ No valid clues generated")
                poor_results.append((word, topic, "No clue", "failed"))

        except Exception as e:
            print(f"❌ Error: {e}")
            poor_results.append((word, topic, "Error", str(e)))

    # Results analysis
    total_tests = len(test_cases)
    excellent_count = len(excellent_results)
    good_count = len(good_results)
    poor_count = len(poor_results)

    print("\n" + "=" * 60)
    print("📊 SUPERIOR PROMPTS RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_count}")
    print(f"Good clues: {good_count}")
    print(f"Poor/Failed clues: {poor_count}")
    print(f"Success rate: {((excellent_count + good_count) / total_tests) * 100:.1f}%")
    print(f"Excellence rate: {(excellent_count / total_tests) * 100:.1f}%")

    # Show best results
    if excellent_results:
        print("\n🎉 EXCELLENT CLUES:")
        print("-" * 40)
        for word, topic, clue in excellent_results:
            print(f"  {word} + {topic}: \"{clue}\"")

    if good_results and len(good_results) <= 5:
        print("\n✅ GOOD CLUES:")
        print("-" * 40)
        for word, topic, clue in good_results:
            print(f"  {word} + {topic}: \"{clue}\"")

    # Final evaluation
    if excellent_count >= total_tests * 0.6:  # 60% excellent
        print("\n🎉 SUCCESS! Superior prompts achieve excellent results!")
        print("🚀 Ready for production - proof that better prompts > bigger models!")
    elif excellent_count >= total_tests * 0.4:  # 40% excellent
        print("\n🔄 Very promising! Superior prompts show major improvement")
        print("✅ Much better than previous attempts")
    elif (excellent_count + good_count) >= total_tests * 0.7:  # 70% success
        print("\n⚠️ Good results with superior prompts")
        print("💡 Demonstrates prompt engineering is key to success")
    else:
        print("\n❌ Still struggling even with better prompts")
        print("💡 May need combination of larger model + superior prompts")


def main():
    """Run the superior prompts test."""
    test_superior_prompts()


if __name__ == "__main__":
    main()