#!/usr/bin/env python3
"""
Test Script: Thematic Word Generation + LLM Clue Generation

Integrates the existing thematic_word_generator.py with the new llm_clue_generator.py
to create a complete word-to-clue pipeline for crossword puzzles.

Tests various scenarios:
- Single topics
- Multiple topics  
- Custom sentences
- Different difficulties
- Performance analysis
"""

import os
import sys
import time
import logging
from typing import List, Dict, Tuple, Any
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

# Import the two project generators. A failed import is reported and recorded in
# GENERATORS_AVAILABLE rather than raised, so the rest of the module can still
# be loaded and can refuse to run with a readable message.
try:
    from thematic_word_generator import UnifiedThematicWordGenerator
    from llm_clue_generator import LLMClueGenerator
    GENERATORS_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("Make sure thematic_word_generator.py and llm_clue_generator.py are in the same directory")
    GENERATORS_AVAILABLE = False

# Set up logging: INFO level, each record prefixed with timestamp, logger name,
# source line number and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger used for clue-generation failures and fatal suite errors.
logger = logging.getLogger(__name__)


class CrosswordClueTestSuite:
    """
    Test suite for integrated thematic word generation + LLM clue generation.

    Each ``run_*_test`` method asks the word generator for themed words, asks
    the clue generator for one clue per word, prints a report and returns the
    collected data as a dictionary.  ``initialize()`` must be called before any
    ``run_*_test`` method because the generators are only created there.
    """

    # Average pipeline time (seconds) below which the final summary reports the
    # system as suitable for interactive use.
    INTERACTIVE_TIME_LIMIT_S = 15

    # Number of sentence characters echoed inside a fallback clue.
    SENTENCE_PREVIEW_CHARS = 30

    def __init__(self, vocab_size_limit: int = 50000):
        """Initialize the test suite.

        Args:
            vocab_size_limit: Vocabulary size for thematic generator (smaller for faster testing)

        Raises:
            ImportError: If the generator modules could not be imported.
        """
        if not GENERATORS_AVAILABLE:
            raise ImportError("Required generators not available")

        self.vocab_size_limit = vocab_size_limit
        # Both generators are created by initialize(), not here.
        self.word_generator = None
        self.clue_generator = None

        # Test results storage
        self.test_results = {}
        self.performance_stats = {}

    def initialize(self):
        """Create and initialize both generators and record their start-up times.

        The timings are stored in ``self.performance_stats`` under
        ``word_gen_init_time``, ``clue_gen_init_time`` and ``total_init_time``.
        """
        print("🚀 Initializing Crossword Clue Test Suite")
        print("=" * 60)

        # Initialize thematic word generator
        print("\n🔄 Initializing thematic word generator...")
        # perf_counter() is monotonic, so a system clock adjustment during the
        # (potentially long) model load cannot distort the measured duration.
        start_time = time.perf_counter()

        self.word_generator = UnifiedThematicWordGenerator(
            vocab_size_limit=self.vocab_size_limit
        )
        self.word_generator.initialize()

        word_gen_time = time.perf_counter() - start_time
        print(f"✅ Word generator ready in {word_gen_time:.2f}s")

        # Initialize LLM clue generator
        print("\n🔄 Initializing LLM clue generator...")
        start_time = time.perf_counter()

        self.clue_generator = LLMClueGenerator()
        self.clue_generator.initialize()

        clue_gen_time = time.perf_counter() - start_time
        print(f"✅ Clue generator ready in {clue_gen_time:.2f}s")

        # Store initialization stats
        total_init_time = word_gen_time + clue_gen_time
        self.performance_stats['word_gen_init_time'] = word_gen_time
        self.performance_stats['clue_gen_init_time'] = clue_gen_time
        self.performance_stats['total_init_time'] = total_init_time

        print(f"\n✅ Test suite initialized in {total_init_time:.2f}s")

    @staticmethod
    def _preview(text: str, limit: int = 30) -> str:
        """Return ``text`` cut to ``limit`` characters, adding "..." only if it was cut."""
        if len(text) <= limit:
            return text
        return text[:limit] + "..."

    @staticmethod
    def _print_banner(title: str) -> None:
        """Print a section title framed by 70-character rules."""
        print("\n" + "=" * 70)
        print(title)
        print("=" * 70)

    def _generate_clue_pairs(self, thematic_results, *, topic: str, clue_style: str,
                             difficulty: str, fallback_clue,
                             include_length: bool = True,
                             failure_label: str = "clue") -> List[Dict[str, Any]]:
        """Generate one clue entry per ``(word, similarity, tier)`` result.

        Args:
            thematic_results: Iterable of ``(word, similarity, tier)`` tuples.
            topic: Topic/context string passed to the clue generator.
            clue_style: Clue style passed to the clue generator.
            difficulty: Difficulty passed to the clue generator.
            fallback_clue: Callable ``word -> str`` giving the clue to use when
                the clue generator raises.
            include_length: Whether each entry carries a ``length`` key.
            failure_label: Wording used in the error log message.

        Returns:
            A list of dictionaries with ``word`` (upper-cased), ``clue``,
            ``similarity``, ``tier``, optionally ``length``, and ``error`` when
            the fallback clue had to be used.
        """
        pairs = []
        for word, similarity, tier in thematic_results:
            entry = {
                "word": word.upper(),
                "clue": None,  # filled in below
                "similarity": similarity,
                "tier": tier,
            }
            if include_length:
                entry["length"] = len(word)

            try:
                entry["clue"] = self.clue_generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=clue_style,
                    difficulty=difficulty
                )
            except Exception as e:
                # Deliberately broad: one failed clue must not abort the whole
                # test, so the word is kept with a fallback clue and flagged.
                logger.error(f"Failed to generate {failure_label} for '{word}': {e}")
                entry["clue"] = fallback_clue(word)
                entry["error"] = str(e)

            pairs.append(entry)
        return pairs

    def _print_word_clue_pairs(self, word_clue_pairs: List[Dict[str, Any]]) -> None:
        """Print a numbered word/clue listing with similarity and tier description."""
        for i, item in enumerate(word_clue_pairs, 1):
            # Fall back to the raw tier value when no description is registered.
            tier_desc = self.word_generator.tier_descriptions.get(item['tier'], item['tier'])
            error_marker = " ⚠️" if 'error' in item else ""
            print(f"{i:2d}. {item['word']:<12} ({item['length']} letters) - {item['clue']}{error_marker}")
            print(f"    Similarity: {item['similarity']:.3f} | {tier_desc}")

    def _run_pipeline(self, *, label_key: str, label_value: Any, num_words: int,
                      word_gen_kwargs: Dict[str, Any], words_message: str,
                      clue_topic: str, clue_style: str, fallback_clue,
                      results_header: str, rule_width: int,
                      clues_message=None) -> Dict[str, Any]:
        """Run word generation followed by clue generation, print and return the results.

        Args:
            label_key: Key under which the test input is stored in the result.
            label_value: The test input itself (topic, topic list or sentence).
            num_words: Number of words requested from the word generator.
            word_gen_kwargs: Extra keyword arguments for ``generate_thematic_words``.
            words_message: Progress line printed before word generation.
            clue_topic: Topic/context string passed to the clue generator.
            clue_style: Clue style passed to the clue generator.
            fallback_clue: Callable ``word -> str`` used when clue generation fails.
            results_header: Heading printed above the result listing.
            rule_width: Width of the "=" rule printed under the heading.
            clues_message: Progress line printed before clue generation; when
                ``None`` a default mentioning the word count is used.

        Returns:
            The test results dictionary, or ``{"error": ...}`` when the word
            generator returned nothing.
        """
        start_time = time.perf_counter()

        # Step 1: Generate thematic words
        print(words_message)
        word_start_time = time.perf_counter()
        thematic_results = self.word_generator.generate_thematic_words(
            num_words=num_words,
            **word_gen_kwargs
        )
        word_gen_time = time.perf_counter() - word_start_time
        print(f"✅ Generated {len(thematic_results)} words in {word_gen_time:.2f}s")

        if not thematic_results:
            return {"error": "No thematic words generated"}

        # Step 2: Generate clues for each word
        if clues_message is None:
            clues_message = f"🎭 Generating clues for {len(thematic_results)} words..."
        print(clues_message)
        clue_start_time = time.perf_counter()

        word_clue_pairs = self._generate_clue_pairs(
            thematic_results,
            topic=clue_topic,
            clue_style=clue_style,
            difficulty="medium",
            fallback_clue=fallback_clue
        )

        clue_gen_time = time.perf_counter() - clue_start_time
        total_time = time.perf_counter() - start_time
        # Computed once so the printed and the returned average cannot diverge.
        avg_clue_time = clue_gen_time / len(word_clue_pairs) if word_clue_pairs else 0

        # Display results
        print(f"✅ Generated {len(word_clue_pairs)} clues in {clue_gen_time:.2f}s")
        print(results_header)
        print("=" * rule_width)
        self._print_word_clue_pairs(word_clue_pairs)

        # Performance summary
        print("\n⏱️  Performance:")
        print(f"   Word generation: {word_gen_time:.2f}s")
        print(f"   Clue generation: {clue_gen_time:.2f}s ({avg_clue_time:.2f}s per clue)")
        print(f"   Total time: {total_time:.2f}s")

        return {
            label_key: label_value,
            "num_words_requested": num_words,
            "num_words_generated": len(word_clue_pairs),
            "word_clue_pairs": word_clue_pairs,
            "performance": {
                "word_gen_time": word_gen_time,
                "clue_gen_time": clue_gen_time,
                "total_time": total_time,
                "avg_clue_time": avg_clue_time
            }
        }

    def run_single_topic_test(self, topic: str, num_words: int = 10) -> Dict[str, Any]:
        """Test single topic word+clue generation.

        Args:
            topic: Single topic string
            num_words: Number of words to generate

        Returns:
            Test results dictionary (keys ``topic``, ``num_words_requested``,
            ``num_words_generated``, ``word_clue_pairs``, ``performance``), or
            ``{"error": ...}`` when no words were generated.
        """
        print(f"\n🎯 Single Topic Test: '{topic}'")
        print("-" * 50)

        return self._run_pipeline(
            label_key="topic",
            label_value=topic,
            num_words=num_words,
            word_gen_kwargs={"inputs": topic, "min_similarity": 0.3},
            words_message=f"📝 Generating {num_words} thematic words...",
            clue_topic=topic,
            clue_style="category",
            fallback_clue=lambda word: f"Related to {topic}: {word}",
            results_header=f"\n📋 Results for topic '{topic}':",
            rule_width=60
        )

    def run_multi_topic_test(self, topics: List[str], num_words: int = 12) -> Dict[str, Any]:
        """Test multi-topic word+clue generation.

        Args:
            topics: List of topic strings
            num_words: Number of words to generate

        Returns:
            Test results dictionary (keys ``topics``, ``num_words_requested``,
            ``num_words_generated``, ``word_clue_pairs``, ``performance``), or
            ``{"error": ...}`` when no words were generated.
        """
        print(f"\n🎯 Multi-Topic Test: {topics}")
        print("-" * 50)

        # Create topic context string for clue generation
        topic_context = " and ".join(topics)

        return self._run_pipeline(
            label_key="topics",
            label_value=topics,
            num_words=num_words,
            word_gen_kwargs={
                "inputs": topics,
                "min_similarity": 0.25,  # Lower threshold for multi-topic
                "multi_theme": True,  # Enable multi-theme processing
            },
            words_message=f"📝 Generating {num_words} multi-thematic words...",
            clues_message="🎭 Generating contextual clues...",
            clue_topic=topic_context,
            clue_style="description",  # Use descriptive style for multi-topic
            fallback_clue=lambda word: f"Related to {topic_context}: {word}",
            results_header=f"\n📋 Results for topics {topics}:",
            rule_width=70
        )

    def run_custom_sentence_test(self, sentence: str, num_words: int = 10) -> Dict[str, Any]:
        """Test custom sentence word+clue generation.

        Args:
            sentence: Custom sentence input
            num_words: Number of words to generate

        Returns:
            Test results dictionary (keys ``sentence``, ``num_words_requested``,
            ``num_words_generated``, ``word_clue_pairs``, ``performance``), or
            ``{"error": ...}`` when no words were generated.
        """
        print(f"\n🎯 Custom Sentence Test: '{sentence}'")
        print("-" * 60)

        # Only mark the sentence as shortened in fallback clues when it really was.
        preview = self._preview(sentence, self.SENTENCE_PREVIEW_CHARS)

        return self._run_pipeline(
            label_key="sentence",
            label_value=sentence,
            num_words=num_words,
            word_gen_kwargs={
                "inputs": sentence,
                "min_similarity": 0.2,  # Lower threshold for sentences
            },
            words_message=f"📝 Generating {num_words} words from sentence...",
            clues_message="🎭 Generating personalized clues...",
            # Use the original sentence as context for more personalized clues
            clue_topic=f"theme: {sentence}",
            clue_style="description",
            fallback_clue=lambda word: f"From '{preview}': {word}",
            results_header=f"\n📋 Results for sentence: '{sentence}'",
            rule_width=70
        )

    def run_difficulty_comparison_test(self, topic: str, num_words: int = 6) -> Dict[str, Any]:
        """Test different difficulty levels for the same topic.

        Args:
            topic: Topic to test
            num_words: Number of words to generate

        Returns:
            Comparison results with keys ``topic``, ``difficulties_tested``,
            ``results`` (per difficulty: ``clue_pairs`` and ``generation_time``)
            and ``base_words``; or ``{"error": ...}`` when no words were generated.
        """
        print(f"\n🎯 Difficulty Comparison Test: '{topic}'")
        print("-" * 50)

        difficulties = ["easy", "medium", "hard"]
        results = {}

        # Generate words once (reuse for all difficulties)
        thematic_results = self.word_generator.generate_thematic_words(
            inputs=topic,
            num_words=num_words,
            min_similarity=0.3
        )[:num_words]  # Take only requested number

        if not thematic_results:
            return {"error": "No thematic words generated"}

        print(f"📝 Testing {len(thematic_results)} words at different difficulty levels...")

        for difficulty in difficulties:
            print(f"\n--- {difficulty.upper()} Difficulty ---")

            start_time = time.perf_counter()
            clue_pairs = self._generate_clue_pairs(
                thematic_results,
                topic=topic,
                clue_style="category",
                difficulty=difficulty,
                # Bind the current level as a default so the callable does not
                # depend on the loop variable.
                fallback_clue=lambda word, level=difficulty: f"{level.title()} clue for {word}",
                include_length=False,
                failure_label=f"{difficulty} clue"
            )
            generation_time = time.perf_counter() - start_time

            results[difficulty] = {
                "clue_pairs": clue_pairs,
                "generation_time": generation_time
            }

            # Display this difficulty's results
            for i, item in enumerate(clue_pairs, 1):
                error_marker = " ⚠️" if 'error' in item else ""
                print(f"  {i}. {item['word']:<10} - {item['clue']}{error_marker}")

        return {
            "topic": topic,
            "difficulties_tested": difficulties,
            "results": results,
            "base_words": [{"word": w, "similarity": s, "tier": t} for w, s, t in thematic_results]
        }

    def run_performance_analysis(self) -> Dict[str, Any]:
        """Analyze overall performance characteristics.

        Aggregates the ``performance`` entries of everything stored in
        ``self.test_results``; results without such an entry are skipped.

        Returns:
            A dictionary with ``word_gen_stats``, ``clue_gen_stats`` and
            ``total_stats``, each holding ``avg``/``min``/``max`` in seconds.
            Returns ``{}`` when there are no test results or none of them
            recorded timing data, so callers never mistake "no data" for a
            measured 0-second pipeline.
        """
        print("\n📊 Performance Analysis")
        print("-" * 40)

        # Collect performance stats from previous tests
        if not self.test_results:
            print("⚠️ No test results available for performance analysis")
            return {}

        timings = [
            result['performance']
            for result in self.test_results.values()
            if 'performance' in result
        ]
        if not timings:
            print("⚠️ No timing data recorded by the completed tests")
            return {}

        sections = (
            ("word_gen_stats", "word_gen_time", "📈 Word Generation Performance:"),
            ("clue_gen_stats", "clue_gen_time", "\n🎭 Clue Generation Performance:"),
            ("total_stats", "total_time", "\n⏱️  Total Pipeline Performance:"),
        )

        summary = {}
        for stats_key, time_key, heading in sections:
            samples = [perf.get(time_key, 0) for perf in timings]
            stats = {
                "avg": sum(samples) / len(samples),
                "min": min(samples),
                "max": max(samples)
            }

            print(heading)
            print(f"   Average: {stats['avg']:.2f}s")
            print(f"   Min: {stats['min']:.2f}s")
            print(f"   Max: {stats['max']:.2f}s")

            summary[stats_key] = stats

        return summary

    def run_full_test_suite(self):
        """Run the complete test suite.

        Initializes the generators, runs every test with fixed sample inputs,
        stores each result in ``self.test_results`` and prints a final summary.
        """
        print("🧪 CROSSWORD CLUE GENERATION TEST SUITE")
        print("=" * 70)

        if not GENERATORS_AVAILABLE:
            print("❌ Cannot run tests - generators not available")
            return

        # Initialize
        self.initialize()

        # Test 1: Single topics
        self._print_banner("TEST 1: SINGLE TOPIC TESTS")
        for topic in ("animals", "technology", "music", "food"):
            self.test_results[f"single_{topic}"] = self.run_single_topic_test(topic, num_words=8)

        # Test 2: Multi-topic
        self._print_banner("TEST 2: MULTI-TOPIC TEST")
        self.test_results["multi_science_tech"] = self.run_multi_topic_test(
            ["science", "technology"], num_words=10
        )

        # Test 3: Custom sentence
        self._print_banner("TEST 3: CUSTOM SENTENCE TEST")
        self.test_results["sentence_cats_guitar"] = self.run_custom_sentence_test(
            "I love cats and playing guitar", num_words=8
        )

        # Test 4: Difficulty comparison
        self._print_banner("TEST 4: DIFFICULTY COMPARISON")
        self.test_results["difficulty_sports"] = self.run_difficulty_comparison_test(
            "sports", num_words=5
        )

        # Test 5: Performance analysis
        self._print_banner("TEST 5: PERFORMANCE ANALYSIS")
        perf_result = self.run_performance_analysis()
        self.test_results["performance"] = perf_result

        # Final summary
        self._print_banner("📋 FINAL SUMMARY")

        print("✅ Test suite completed!")
        print(f"📊 Tests run: {len(self.test_results)}")

        # Model info
        vocab_size = self.word_generator.get_vocabulary_size()
        clue_info = self.clue_generator.get_model_info()

        print("\n🔧 System Information:")
        print(f"   Word vocabulary: {vocab_size:,} words")
        # .get() on both fields: a missing key must not crash the summary after
        # the whole suite has already run.
        print(f"   Clue model: {clue_info.get('model_name', 'unknown')}")
        print(f"   Model size: {clue_info.get('model_size_mb', 0):.1f} MB")

        if perf_result:
            avg_total = perf_result['total_stats']['avg']
            print(f"   Average pipeline time: {avg_total:.2f}s")

        print("\n💡 Recommendations for HF Spaces:")
        if perf_result and perf_result['total_stats']['avg'] < self.INTERACTIVE_TIME_LIMIT_S:
            print("   ✅ Performance suitable for interactive use")
        else:
            print("   ⚠️ Consider optimizations for better user experience")

        print("\n🎉 Test suite complete!")


def main():
    """Build the test suite and run it end to end.

    When the generator modules could not be imported, only a hint is printed.
    A Ctrl-C or any other failure during the run is reported rather than
    propagated.
    """
    if GENERATORS_AVAILABLE:
        # Create and run test suite
        suite = CrosswordClueTestSuite(vocab_size_limit=50000)  # Use existing cached embeddings

        try:
            suite.run_full_test_suite()
        except KeyboardInterrupt:
            print("\n\n⏹️ Test suite interrupted by user")
        except Exception as exc:
            # Top-level boundary: report to the console and log the traceback.
            print(f"\n❌ Test suite failed: {exc}")
            logger.error(f"Test suite error: {exc}", exc_info=True)
        return

    print("❌ Cannot run tests - required generators not available")
    print("Make sure thematic_word_generator.py and llm_clue_generator.py are working")

# Run the suite only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()