#!/usr/bin/env python3
"""
Test: flan-t5-large Model for Superior Crossword Clue Generation

Test the most capable model to eliminate generic responses and achieve excellence.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_flan_t5_large():
    """Test flan-t5-large model for superior crossword clue quality."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing flan-t5-large Model (No Fallbacks)")
    print("=" * 60)

    # Initialize generator with large model
    print("🔄 Initializing flan-t5-large clue generator...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~3GB (3x larger than base, 37x larger than small)")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        print("💡 Note: flan-t5-large requires ~3GB RAM and longer initialization time")
        return

    # Challenging test cases that should be handled well by a large model
    test_cases = [
        # Basic cases that failed with smaller models
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),

        # More challenging cases requiring world knowledge
        ("VIOLIN", "music"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
        ("HELICOPTER", "transportation"),
        ("DEMOCRACY", "politics"),
        ("PHOTOSYNTHESIS", "science"),

        # Abstract concepts
        ("HAPPINESS", "emotions"),
        ("ALGORITHM", "technology"),
        ("METAPHOR", "literature"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} challenging word-topic combinations")
    print("=" * 60)

    excellent_clues = 0
    good_clues = 0
    generic_clues = 0
    poor_clues = 0

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        try:
            # Test the best-performing clue style
            best_clue = generator.generate_clue(
                word=word,
                topic=topic,
                clue_style="definition",  # Usually produces the best results
                difficulty="medium"
            )

            if best_clue and len(best_clue) > 3:
                print(f"🏆 Generated clue: {best_clue}")

                # Comprehensive quality evaluation
                word_lower = word.lower()
                clue_lower = best_clue.lower()

                # Critical quality checks
                contains_word = word_lower in clue_lower
                is_generic = any(generic in clue_lower for generic in [
                    "make it moderately challenging", "make it challenging",
                    "make it difficult", "make it easier",
                    "moderately challenging", "difficult", "easy"
                ])
                is_nonsensical = any(nonsense in clue_lower for nonsense in [
                    "a) a) a)", "trick and treating", "gritting your teeth",
                    "jack nixt", "fender", "tryon"
                ])

                # Positive quality indicators
                has_definition = any(def_word in clue_lower for def_word in [
                    "player", "instrument", "device", "system", "place", "location",
                    "animal", "creature", "building", "process", "method", "concept",
                    "sport", "activity", "food", "dish", "language", "tool"
                ])

                is_descriptive = (
                    len(best_clue.split()) >= 3 and
                    len(best_clue) >= 10 and
                    not contains_word and
                    not is_generic and
                    not is_nonsensical
                )

                # Quality scoring
                if contains_word:
                    print("❌ Quality: POOR (contains target word)")
                    poor_clues += 1
                elif is_nonsensical:
                    print("❌ Quality: POOR (nonsensical)")
                    poor_clues += 1
                elif is_generic:
                    print("⚠️ Quality: GENERIC (template response)")
                    generic_clues += 1
                elif has_definition and is_descriptive:
                    print("✅ Quality: EXCELLENT (definitional & descriptive)")
                    excellent_clues += 1
                elif is_descriptive:
                    print("✅ Quality: GOOD (descriptive)")
                    good_clues += 1
                elif has_definition:
                    print("🔄 Quality: ACCEPTABLE (basic definition)")
                    good_clues += 1
                else:
                    print("⚠️ Quality: GENERIC (basic)")
                    generic_clues += 1
            else:
                print("❌ No valid clue generated")
                poor_clues += 1

        except Exception as e:
            print(f"❌ Error generating clue: {e}")
            poor_clues += 1

    total_tests = len(test_cases)
    print("\n" + "=" * 60)
    print("📊 FLAN-T5-LARGE RESULTS (NO FALLBACKS)")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_clues}")
    print(f"Good clues: {good_clues}")
    print(f"Generic clues: {generic_clues}")
    print(f"Poor clues: {poor_clues}")
    print(f"Success rate: {((excellent_clues + good_clues) / total_tests) * 100:.1f}%")
    print(f"Excellence rate: {(excellent_clues / total_tests) * 100:.1f}%")
    print(f"Generic rate: {(generic_clues / total_tests) * 100:.1f}%")

    # Final evaluation - high standards for large model
    if excellent_clues >= total_tests * 0.6:  # 60% excellent
        print("🎉 SUCCESS! flan-t5-large produces excellent crossword clues!")
        print("🚀 Ready for production - no fallbacks needed!")
    elif excellent_clues >= total_tests * 0.4 and generic_clues <= total_tests * 0.2:  # 40% excellent, <20% generic
        print("🔄 Very good! flan-t5-large is suitable for production")
        print("✅ Significant improvement over smaller models")
    elif (excellent_clues + good_clues) >= total_tests * 0.7:  # 70% good+excellent
        print("⚠️ Good results, but some generic responses remain")
        print("💡 Consider prompt engineering refinements")
    else:
        print("❌ Still not meeting quality standards")
        print("💡 May need flan-t5-xl (~11GB) or different approach")


def main():
    """Run the flan-t5-large test."""
    test_flan_t5_large()


if __name__ == "__main__":
    main()