# Source: hack/test_flan_t5_base.py (Hugging Face repo page, commit 486eff6)
# Author: vimalk78 — feat(crossword): generated crosswords with clues
# (page chrome "raw / history / blame", file size 7.63 kB, removed so the file parses)
#!/usr/bin/env python3
"""
Test: Upgraded flan-t5-base Model for Crossword Clue Generation
Compare flan-t5-base performance against the previous flan-t5-small results.
"""
import sys
import logging
from pathlib import Path

# Make sibling modules in the hack/ directory importable when this file is
# run directly as a script (rather than as part of an installed package).
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    # Degrade gracefully: remember that the generator is missing so the
    # test function can bail out with a clear message instead of crashing.
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
def _generate_candidates(generator, word, topic):
    """Generate one clue per prompt style and return the usable ones.

    Tries each of the five clue styles at medium difficulty and keeps any
    result that is non-empty and longer than five characters.

    Returns:
        list[tuple[str, str]]: (style, clue) pairs in style order.
    """
    styles = ("definition", "description", "category", "function", "context")
    candidates = []
    for style in styles:
        clue = generator.generate_clue(
            word=word,
            topic=topic,
            clue_style=style,
            difficulty="medium"
        )
        # Minimal sanity check: drop empty / trivially short outputs.
        if clue and len(clue) > 5:
            candidates.append((style, clue))
    return candidates


def _classify_clue(word, clue):
    """Classify a generated clue's quality for the summary tallies.

    Ordering matters and mirrors the original elif chain: hard failures
    (answer leakage, known flan-t5-small nonsense) are checked first, then
    definitional quality, then generic descriptiveness.

    Returns:
        str: one of 'contains_word', 'nonsense', 'excellent', 'good',
        'acceptable'.
    """
    word_lower = word.lower()
    clue_lower = clue.lower()

    # A crossword clue must never contain its own answer.
    if word_lower in clue_lower:
        return "contains_word"

    # Garbage phrases previously emitted by flan-t5-small.
    nonsense_markers = (
        "trick and treating", "gritting your teeth", "jack nixt",
        "fender", "tryon", "nicolas", "occurrence", "sludge"
    )
    if any(bad in clue_lower for bad in nonsense_markers):
        return "nonsense"

    # Category nouns that indicate a proper definitional clue.
    definitional_words = (
        "player", "sport", "instrument", "device", "system", "food",
        "language", "place", "animal", "creature", "location"
    )
    if any(def_word in clue_lower for def_word in definitional_words):
        return "excellent"

    # Descriptive: multi-word and reasonably long.
    if len(clue.split()) >= 2 and len(clue) >= 8:
        return "good"

    return "acceptable"


def test_flan_t5_base():
    """Test flan-t5-base model with problematic examples that failed with flan-t5-small.

    Initializes the LLM clue generator, generates candidate clues for a
    fixed list of word/topic pairs, grades the best clue of each pair, and
    prints a summary comparing quality against the earlier flan-t5-small
    results. Prints and returns None; all output goes to stdout.
    """
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("πŸ§ͺ Testing Upgraded flan-t5-base Model")
    print("=" * 60)

    # Initialize generator with base model
    print("πŸ”„ Initializing flan-t5-base clue generator...")
    generator = LLMClueGenerator()
    try:
        generator.initialize()
        print(f"βœ… Generator initialized successfully with {generator.model_name}")
        print("πŸ“Š Model size: ~1GB (vs ~80MB for flan-t5-small)")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that produced terrible results with flan-t5-small, e.g.:
    #   CAT + animals β†’ "Tryon", "Trick and treating"
    #   MEAL + food   β†’ "Jack nixt", "fender"
    #   SONG + music  β†’ "Gritting your teeth"
    test_cases = [
        ("CAT", "animals"),
        ("KITTY", "animals"),
        ("MEAL", "food"),
        ("HUNGER", "food"),
        ("SONG", "music"),
        ("GUITAR", "music"),
        # Specific problematic examples reported by the user
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        # Additional challenging cases
        ("DATABASE", "technology"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations with flan-t5-base")
    print("=" * 60)

    excellent_clues = 0
    good_clues = 0
    poor_clues = 0
    failed_clues = 0
    # (word, topic, clue) triples that clearly beat flan-t5-small.
    major_improvements = []

    for word, topic in test_cases:
        print(f"\nπŸ“ Testing: '{word}' + '{topic}'")
        print("-" * 40)
        try:
            candidates = _generate_candidates(generator, word, topic)
            if not candidates:
                print("❌ No valid clues generated")
                failed_clues += 1
                continue

            print(f"Generated {len(candidates)} candidates:")
            for i, (style, clue) in enumerate(candidates, 1):
                print(f" {i}. [{style}] {clue}")

            # Use the first valid clue as best.
            best_style, best_clue = candidates[0]
            print(f"\nπŸ† Best clue [{best_style}]: {best_clue}")

            quality = _classify_clue(word, best_clue)
            if quality == "contains_word":
                print("❌ Quality: POOR (contains target word)")
                poor_clues += 1
            elif quality == "nonsense":
                print("❌ Quality: POOR (nonsensical)")
                poor_clues += 1
            elif quality == "excellent":
                print("βœ… Quality: EXCELLENT (definitional)")
                excellent_clues += 1
                major_improvements.append((word, topic, best_clue))
            elif quality == "good":
                print("βœ… Quality: GOOD (descriptive)")
                good_clues += 1
                major_improvements.append((word, topic, best_clue))
            else:
                print("πŸ”„ Quality: ACCEPTABLE")
                good_clues += 1
        except Exception as e:
            print(f"❌ Error generating clue: {e}")
            failed_clues += 1

    total_tests = len(test_cases)
    print("\n" + "=" * 60)
    print("πŸ“Š FLAN-T5-BASE RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_clues}")
    print(f"Good clues: {good_clues}")
    print(f"Poor clues: {poor_clues}")
    print(f"Failed clues: {failed_clues}")
    print(f"Success rate: {((excellent_clues + good_clues)/total_tests)*100:.1f}%")
    print(f"Excellence rate: {(excellent_clues/total_tests)*100:.1f}%")

    # Show major improvements
    if major_improvements:
        print("\nπŸŽ‰ MAJOR IMPROVEMENTS OVER FLAN-T5-SMALL:")
        print("-" * 60)
        for word, topic, clue in major_improvements[:5]:  # Show top 5
            print(f" {word} + {topic}: \"{clue}\"")

    # Overall verdict relative to flan-t5-small (which had ~0% success).
    if excellent_clues >= total_tests * 0.4:  # 40% excellent
        print("πŸŽ‰ MAJOR SUCCESS! flan-t5-base produces excellent clues!")
        print("πŸš€ Ready for production use - significant upgrade from flan-t5-small")
    elif (excellent_clues + good_clues) >= total_tests * 0.6:  # 60% good+excellent
        print("πŸ”„ Good improvement! Much better than flan-t5-small")
        print("βœ… Suitable for production with semantic fallback")
    elif (excellent_clues + good_clues) >= total_tests * 0.3:  # 30% success
        print("⚠️ Some improvement over flan-t5-small, but still limited")
    else:
        print("❌ Still struggling - may need even larger model or external knowledge")
def main():
    """Entry point: execute the flan-t5-base upgrade comparison run."""
    return test_flan_t5_base()


if __name__ == "__main__":
    main()