#!/usr/bin/env python3
"""
Test: Upgraded flan-t5-base Model for Crossword Clue Generation

Compare flan-t5-base performance against the previous flan-t5-small results.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_flan_t5_base():
    """Test flan-t5-base model with problematic examples that failed with flan-t5-small."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing Upgraded flan-t5-base Model")
    print("=" * 60)

    # Initialize generator with base model
    print("🔄 Initializing flan-t5-base clue generator...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~1GB (vs ~80MB for flan-t5-small)")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that produced terrible results with flan-t5-small
    test_cases = [
        # Previous failures with flan-t5-small:
        #   CAT + animals → "Tryon", "Trick and treating"
        #   MEAL + food   → "Jack nixt", "fender"
        #   SONG + music  → "Gritting your teeth"
        ("CAT", "animals"),
        ("KITTY", "animals"),
        ("MEAL", "food"),
        ("HUNGER", "food"),
        ("SONG", "music"),
        ("GUITAR", "music"),

        # Your specific problematic examples
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),

        # Additional challenging cases
        ("DATABASE", "technology"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations with flan-t5-base")
    print("=" * 60)

    excellent_clues = 0
    good_clues = 0
    poor_clues = 0
    failed_clues = 0

    # Track specific improvements over flan-t5-small
    major_improvements = []

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        try:
            # Test multiple clue styles to get best result
            styles = ["definition", "description", "category", "function", "context"]
            candidates = []

            for style in styles:
                clue = generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=style,
                    difficulty="medium"
                )
                if clue and len(clue) > 5:
                    candidates.append((style, clue))

            if candidates:
                print(f"Generated {len(candidates)} candidates:")
                for i, (style, clue) in enumerate(candidates, 1):
                    print(f"  {i}. [{style}] {clue}")

                # Use the first valid clue as best
                best_style, best_clue = candidates[0]
                print(f"\n🏆 Best clue [{best_style}]: {best_clue}")

                # Enhanced quality evaluation
                word_lower = word.lower()
                clue_lower = best_clue.lower()

                # Check if clue contains the target word (it should not)
                contains_word = word_lower in clue_lower

                # Check for nonsense patterns from flan-t5-small
                old_nonsense = any(bad in clue_lower for bad in [
                    "trick and treating", "gritting your teeth", "jack nixt",
                    "fender", "tryon", "nicolas", "occurrence", "sludge"
                ])

                # Check for descriptive quality
                is_descriptive = (
                    len(best_clue.split()) >= 2 and
                    len(best_clue) >= 8 and
                    not contains_word and
                    not old_nonsense
                )

                # Check for definitional quality
                is_definitional = (
                    any(def_word in clue_lower for def_word in [
                        "player", "sport", "instrument", "device", "system",
                        "food", "language", "place", "animal", "creature", "location"
                    ]) and not contains_word
                )

                if contains_word:
                    print("❌ Quality: POOR (contains target word)")
                    poor_clues += 1
                elif old_nonsense:
                    print("❌ Quality: POOR (nonsensical)")
                    poor_clues += 1
                elif is_definitional:
                    print("✅ Quality: EXCELLENT (definitional)")
                    excellent_clues += 1
                    major_improvements.append((word, topic, best_clue))
                elif is_descriptive:
                    print("✅ Quality: GOOD (descriptive)")
                    good_clues += 1
                    major_improvements.append((word, topic, best_clue))
                else:
                    print("🔄 Quality: ACCEPTABLE")
                    good_clues += 1
            else:
                print("❌ No valid clues generated")
                failed_clues += 1

        except Exception as e:
            print(f"❌ Error generating clue: {e}")
            failed_clues += 1

    total_tests = len(test_cases)

    print("\n" + "=" * 60)
    print("📊 FLAN-T5-BASE RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_clues}")
    print(f"Good clues: {good_clues}")
    print(f"Poor clues: {poor_clues}")
    print(f"Failed clues: {failed_clues}")
    print(f"Success rate: {((excellent_clues + good_clues) / total_tests) * 100:.1f}%")
    print(f"Excellence rate: {(excellent_clues / total_tests) * 100:.1f}%")

    # Show major improvements
    if major_improvements:
        print("\n🎉 MAJOR IMPROVEMENTS OVER FLAN-T5-SMALL:")
        print("-" * 60)
        for word, topic, clue in major_improvements[:5]:  # Show top 5
            print(f"  {word} + {topic}: \"{clue}\"")

    # Evaluation compared to flan-t5-small (which had ~0% success)
    if excellent_clues >= total_tests * 0.4:  # 40% excellent
        print("🎉 MAJOR SUCCESS! flan-t5-base produces excellent clues!")
        print("🚀 Ready for production use - significant upgrade from flan-t5-small")
    elif (excellent_clues + good_clues) >= total_tests * 0.6:  # 60% good + excellent
        print("🔄 Good improvement! Much better than flan-t5-small")
        print("✅ Suitable for production with semantic fallback")
    elif (excellent_clues + good_clues) >= total_tests * 0.3:  # 30% success
        print("⚠️ Some improvement over flan-t5-small, but still limited")
    else:
        print("❌ Still struggling - may need even larger model or external knowledge")


def main():
    """Run the flan-t5-base upgrade test."""
    test_flan_t5_base()


if __name__ == "__main__":
    main()