File size: 7,381 Bytes
486eff6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
#!/usr/bin/env python3
"""
Test: flan-t5-large Model for Superior Crossword Clue Generation
Test the most capable model to eliminate generic responses and achieve excellence.
"""
import logging
import re
import sys
from pathlib import Path
# Put this script's own directory (the hack directory) at the front of the
# import path so the sibling module below resolves regardless of the current
# working directory.
sys.path.insert(0, str(Path(__file__).parent))

# The generator import is allowed to fail: the outcome is recorded in
# GENERATOR_AVAILABLE so the test can bail out with a message instead of
# crashing at import time.
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False
# Configure root logging (INFO and above, timestamped records) and create the
# logger for this module.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
# NOTE(review): the status glyphs in the messages below were restored from a
# mis-decoded (mojibake) copy of this file; a few could only be inferred from
# context - confirm them against the upstream source.

# Prompt/template fragments: a clue containing one of these is treated as a
# generic, template-style response rather than a real clue.
_GENERIC_PHRASES = (
    "make it moderately challenging", "make it challenging",
    "make it difficult", "make it easier", "moderately challenging",
    "difficult", "easy",
)

# Phrases whose presence marks a clue as nonsensical.
_NONSENSE_PHRASES = (
    "a) a) a)", "trick and treating", "gritting your teeth",
    "jack nixt", "fender", "tryon",
)

# Category nouns whose presence suggests the clue is phrased as a definition.
_DEFINITION_WORDS = (
    "player", "instrument", "device", "system", "place", "location",
    "animal", "creature", "building", "process", "method", "concept",
    "sport", "activity", "food", "dish", "language", "tool",
)

# (word, topic) pairs fed to the generator.
_TEST_CASES = (
    # Basic cases that failed with smaller models
    ("CAT", "animals"),
    ("BATSMAN", "cricket"),
    ("SWIMMING", "sports"),
    ("AIRPORT", "transportation"),
    ("DATABASE", "technology"),
    # More challenging cases requiring world knowledge
    ("VIOLIN", "music"),
    ("SCIENTIST", "science"),
    ("PIZZA", "food"),
    ("MOUNTAIN", "geography"),
    ("HELICOPTER", "transportation"),
    ("DEMOCRACY", "politics"),
    ("PHOTOSYNTHESIS", "science"),
    # Abstract concepts
    ("HAPPINESS", "emotions"),
    ("ALGORITHM", "technology"),
    ("METAPHOR", "literature"),
)


def _has_term(text, term):
    """Return True if *term* occurs in *text* as a whole word or phrase.

    A trailing plural ``s``/``es`` is tolerated, but the term must not be
    embedded inside a longer word: "cat" does not match "domesticated" and
    "sport" does not match "transport".  Lookarounds are used instead of
    ``\\b`` so phrases that end in punctuation (e.g. "a) a) a)") still match.
    Both arguments are expected to be lowercase already.
    """
    pattern = r"(?<!\w)" + re.escape(term) + r"(?:e?s)?(?!\w)"
    return re.search(pattern, text) is not None


def _has_any_term(text, terms):
    """Return True if any of *terms* occurs in *text* (see ``_has_term``)."""
    return any(_has_term(text, term) for term in terms)


def _rate_clue(word, clue):
    """Classify *clue* for the answer *word*.

    Returns a ``(bucket, message)`` pair where *bucket* is one of
    ``"excellent"``, ``"good"``, ``"generic"`` or ``"poor"`` and *message* is
    the line to print for that verdict.  Disqualifying checks run first, so
    the positive indicators are only consulted for clues that passed them.
    """
    clue_lower = clue.lower()

    # Critical quality checks
    if _has_term(clue_lower, word.lower()):
        return "poor", "❌ Quality: POOR (contains target word)"
    if _has_any_term(clue_lower, _NONSENSE_PHRASES):
        return "poor", "❌ Quality: POOR (nonsensical)"
    if _has_any_term(clue_lower, _GENERIC_PHRASES):
        return "generic", "⚠️ Quality: GENERIC (template response)"

    # Positive quality indicators
    has_definition = _has_any_term(clue_lower, _DEFINITION_WORDS)
    is_descriptive = len(clue.split()) >= 3 and len(clue) >= 10

    if has_definition and is_descriptive:
        return "excellent", "✅ Quality: EXCELLENT (definitional & descriptive)"
    if is_descriptive:
        return "good", "✅ Quality: GOOD (descriptive)"
    if has_definition:
        # A short but definitional clue still counts towards the "good" total.
        return "good", "🔄 Quality: ACCEPTABLE (basic definition)"
    return "generic", "⚠️ Quality: GENERIC (basic)"


def _run_case(generator, word, topic):
    """Generate and rate one clue, printing the outcome; return its bucket."""
    # Test the best-performing clue style
    best_clue = generator.generate_clue(
        word=word,
        topic=topic,
        clue_style="definition",  # Usually produces the best results
        difficulty="medium",
    )
    if not best_clue or len(best_clue) <= 3:
        print("❌ No valid clue generated")
        return "poor"

    print(f"🏆 Generated clue: {best_clue}")
    bucket, message = _rate_clue(word, best_clue)
    print(message)
    return bucket


def _print_summary(counts, total):
    """Print the per-bucket totals, the derived rates and the final verdict."""
    excellent = counts["excellent"]
    good = counts["good"]
    generic = counts["generic"]

    print("\n" + "=" * 60)
    print("📊 FLAN-T5-LARGE RESULTS (NO FALLBACKS)")
    print("=" * 60)
    print(f"Total tests: {total}")
    print(f"Excellent clues: {excellent}")
    print(f"Good clues: {good}")
    print(f"Generic clues: {generic}")
    print(f"Poor clues: {counts['poor']}")
    print(f"Success rate: {(excellent + good) / total * 100:.1f}%")
    print(f"Excellence rate: {excellent / total * 100:.1f}%")
    print(f"Generic rate: {generic / total * 100:.1f}%")

    # Final evaluation - high standards for large model
    if excellent >= total * 0.6:  # 60% excellent
        print("🎉 SUCCESS! flan-t5-large produces excellent crossword clues!")
        print("🚀 Ready for production - no fallbacks needed!")
    elif excellent >= total * 0.4 and generic <= total * 0.2:
        # 40% excellent and at most 20% generic
        print("🔄 Very good! flan-t5-large is suitable for production")
        print("✅ Significant improvement over smaller models")
    elif excellent + good >= total * 0.7:  # 70% good+excellent
        print("⚠️ Good results, but some generic responses remain")
        print("💡 Consider prompt engineering refinements")
    else:
        print("❌ Still not meeting quality standards")
        print("💡 May need flan-t5-xl (~11GB) or different approach")


def test_flan_t5_large():
    """Test flan-t5-large model for superior crossword clue quality.

    Initializes an ``LLMClueGenerator``, requests one "definition"-style,
    "medium"-difficulty clue for every (word, topic) pair in ``_TEST_CASES``,
    rates each clue with keyword heuristics and prints a summary.  All output
    goes to stdout; the function returns ``None`` and returns early when the
    generator could not be imported or fails to initialize.
    """
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing flan-t5-large Model (No Fallbacks)")
    print("=" * 60)

    # Initialize generator with large model
    print("🔄 Initializing flan-t5-large clue generator...")
    generator = LLMClueGenerator()
    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~3GB (3x larger than base, 37x larger than small)")
    except Exception as e:
        # The generator's failure modes are not visible here, so anything it
        # raises is reported and ends the run.
        print(f"❌ Failed to initialize generator: {e}")
        print("💡 Note: flan-t5-large requires ~3GB RAM and longer initialization time")
        return

    # Challenging test cases that should be handled well by a large model
    print(f"\n🎯 Testing {len(_TEST_CASES)} challenging word-topic combinations")
    print("=" * 60)

    counts = {"excellent": 0, "good": 0, "generic": 0, "poor": 0}
    for word, topic in _TEST_CASES:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)
        try:
            bucket = _run_case(generator, word, topic)
        except Exception as e:
            # One failing case must not abort the run; it is scored as poor.
            print(f"❌ Error generating clue: {e}")
            bucket = "poor"
        counts[bucket] += 1

    _print_summary(counts, len(_TEST_CASES))
def main():
    """Script entry point: execute the flan-t5-large clue-quality check."""
    test_flan_t5_large()


# Only run the check when this file is executed directly, not on import.
if __name__ == "__main__":
    main()
|