# Provenance: hack/test_flan_t5_large.py — commit 486eff6,
# "feat(crossword): generated crosswords with clues" (vimalk78).
#!/usr/bin/env python3
"""
Test: flan-t5-large Model for Superior Crossword Clue Generation
Test the most capable model to eliminate generic responses and achieve excellence.
"""
import sys
import logging
from pathlib import Path

# Make sibling modules in the hack/ directory importable when run as a script.
sys.path.insert(0, str(Path(__file__).parent))

# The generator lives in the project tree; degrade gracefully when it is
# missing so the test can report the problem instead of crashing on import.
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Module-wide logging configuration for the test run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
def test_flan_t5_large():
"""Test flan-t5-large model for superior crossword clue quality."""
if not GENERATOR_AVAILABLE:
print("❌ Cannot run test - LLM generator not available")
return
print("πŸ§ͺ Testing flan-t5-large Model (No Fallbacks)")
print("=" * 60)
# Initialize generator with large model
print("πŸ”„ Initializing flan-t5-large clue generator...")
generator = LLMClueGenerator()
try:
generator.initialize()
print(f"βœ… Generator initialized successfully with {generator.model_name}")
print(f"πŸ“Š Model size: ~3GB (3x larger than base, 37x larger than small)")
except Exception as e:
print(f"❌ Failed to initialize generator: {e}")
print("πŸ’‘ Note: flan-t5-large requires ~3GB RAM and longer initialization time")
return
# Challenging test cases that should be handled well by a large model
test_cases = [
# Basic cases that failed with smaller models
("CAT", "animals"),
("BATSMAN", "cricket"),
("SWIMMING", "sports"),
("AIRPORT", "transportation"),
("DATABASE", "technology"),
# More challenging cases requiring world knowledge
("VIOLIN", "music"),
("SCIENTIST", "science"),
("PIZZA", "food"),
("MOUNTAIN", "geography"),
("HELICOPTER", "transportation"),
("DEMOCRACY", "politics"),
("PHOTOSYNTHESIS", "science"),
# Abstract concepts
("HAPPINESS", "emotions"),
("ALGORITHM", "technology"),
("METAPHOR", "literature"),
]
print(f"\n🎯 Testing {len(test_cases)} challenging word-topic combinations")
print("=" * 60)
excellent_clues = 0
good_clues = 0
generic_clues = 0
poor_clues = 0
for word, topic in test_cases:
print(f"\nπŸ“ Testing: '{word}' + '{topic}'")
print("-" * 40)
try:
# Test the best-performing clue style
best_clue = generator.generate_clue(
word=word,
topic=topic,
clue_style="definition", # Usually produces the best results
difficulty="medium"
)
if best_clue and len(best_clue) > 3:
print(f"πŸ† Generated clue: {best_clue}")
# Comprehensive quality evaluation
word_lower = word.lower()
clue_lower = best_clue.lower()
# Critical quality checks
contains_word = word_lower in clue_lower
is_generic = any(generic in clue_lower for generic in [
"make it moderately challenging", "make it challenging",
"make it difficult", "make it easier", "moderately challenging",
"difficult", "easy"
])
is_nonsensical = any(nonsense in clue_lower for nonsense in [
"a) a) a)", "trick and treating", "gritting your teeth",
"jack nixt", "fender", "tryon"
])
# Positive quality indicators
has_definition = any(def_word in clue_lower for def_word in [
"player", "instrument", "device", "system", "place", "location",
"animal", "creature", "building", "process", "method", "concept",
"sport", "activity", "food", "dish", "language", "tool"
])
is_descriptive = (
len(best_clue.split()) >= 3 and
len(best_clue) >= 10 and
not contains_word and
not is_generic and
not is_nonsensical
)
# Quality scoring
if contains_word:
print("❌ Quality: POOR (contains target word)")
poor_clues += 1
elif is_nonsensical:
print("❌ Quality: POOR (nonsensical)")
poor_clues += 1
elif is_generic:
print("⚠️ Quality: GENERIC (template response)")
generic_clues += 1
elif has_definition and is_descriptive:
print("βœ… Quality: EXCELLENT (definitional & descriptive)")
excellent_clues += 1
elif is_descriptive:
print("βœ… Quality: GOOD (descriptive)")
good_clues += 1
elif has_definition:
print("πŸ”„ Quality: ACCEPTABLE (basic definition)")
good_clues += 1
else:
print("⚠️ Quality: GENERIC (basic)")
generic_clues += 1
else:
print("❌ No valid clue generated")
poor_clues += 1
except Exception as e:
print(f"❌ Error generating clue: {e}")
poor_clues += 1
total_tests = len(test_cases)
print(f"\n" + "=" * 60)
print(f"πŸ“Š FLAN-T5-LARGE RESULTS (NO FALLBACKS)")
print(f"=" * 60)
print(f"Total tests: {total_tests}")
print(f"Excellent clues: {excellent_clues}")
print(f"Good clues: {good_clues}")
print(f"Generic clues: {generic_clues}")
print(f"Poor clues: {poor_clues}")
print(f"Success rate: {((excellent_clues + good_clues)/total_tests)*100:.1f}%")
print(f"Excellence rate: {(excellent_clues/total_tests)*100:.1f}%")
print(f"Generic rate: {(generic_clues/total_tests)*100:.1f}%")
# Final evaluation - high standards for large model
if excellent_clues >= total_tests * 0.6: # 60% excellent
print("πŸŽ‰ SUCCESS! flan-t5-large produces excellent crossword clues!")
print("πŸš€ Ready for production - no fallbacks needed!")
elif excellent_clues >= total_tests * 0.4 and generic_clues <= total_tests * 0.2: # 40% excellent, <20% generic
print("πŸ”„ Very good! flan-t5-large is suitable for production")
print("βœ… Significant improvement over smaller models")
elif (excellent_clues + good_clues) >= total_tests * 0.7: # 70% good+excellent
print("⚠️ Good results, but some generic responses remain")
print("πŸ’‘ Consider prompt engineering refinements")
else:
print("❌ Still not meeting quality standards")
print("πŸ’‘ May need flan-t5-xl (~11GB) or different approach")
def main():
    """Script entry point: run the flan-t5-large quality test."""
    test_flan_t5_large()


if __name__ == "__main__":
    main()