File size: 7,625 Bytes
486eff6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
#!/usr/bin/env python3
"""
Test: Upgraded flan-t5-base Model for Crossword Clue Generation
Compare flan-t5-base performance against the previous flan-t5-small results.
"""
import sys
import logging
from pathlib import Path
# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
# Optional dependency: the clue generator module sits next to this script.
# When the import fails, the reason is printed and the availability flag is
# cleared so the test can bail out instead of crashing at import time.
try:
    from llm_clue_generator import LLMClueGenerator
except ImportError as import_error:
    print(f"β Import error: {import_error}")
    GENERATOR_AVAILABLE = False
else:
    GENERATOR_AVAILABLE = True

# Configure root logging once and create this module's logger.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
# Fragments of the nonsensical output previously seen from flan-t5-small.
_OLD_NONSENSE_FRAGMENTS = (
    "trick and treating", "gritting your teeth", "jack nixt",
    "fender", "tryon", "nicolas", "occurrence", "sludge",
)

# Keywords whose presence in a clue marks it as definitional.
_DEFINITIONAL_KEYWORDS = (
    "player", "sport", "instrument", "device", "system", "food",
    "language", "place", "animal", "creature", "location",
)

# grade -> (message printed, counter bucket, counts as a major improvement)
_GRADE_OUTCOMES = {
    "contains_word": ("β Quality: POOR (contains target word)", "poor", False),
    "nonsense": ("β Quality: POOR (nonsensical)", "poor", False),
    "definitional": ("β Quality: EXCELLENT (definitional)", "excellent", True),
    "descriptive": ("β Quality: GOOD (descriptive)", "good", True),
    "acceptable": ("π Quality: ACCEPTABLE", "good", False),
}


def _grade_clue(word, clue):
    """Classify *clue* for the target *word* and return the grade name.

    Checks run in priority order and the first match wins:
    "contains_word" (the answer appears in the clue, case-insensitive
    substring match), "nonsense" (a known flan-t5-small garbage fragment
    appears), "definitional" (a definitional keyword appears), "descriptive"
    (at least two words and at least eight characters), else "acceptable".
    """
    clue_lower = clue.lower()
    if word.lower() in clue_lower:
        return "contains_word"
    if any(fragment in clue_lower for fragment in _OLD_NONSENSE_FRAGMENTS):
        return "nonsense"
    if any(keyword in clue_lower for keyword in _DEFINITIONAL_KEYWORDS):
        return "definitional"
    if len(clue.split()) >= 2 and len(clue) >= 8:
        return "descriptive"
    return "acceptable"


def test_flan_t5_base():
    """Test flan-t5-base with word/topic pairs that failed with flan-t5-small.

    For each (word, topic) pair, one clue is requested per clue style; the
    first candidate longer than five characters is graded with heuristic
    checks, and a summary with success/excellence rates and an overall verdict
    is printed. Everything is reported via ``print``; the function returns
    ``None`` and returns early if the generator is unavailable or fails to
    initialize.

    NOTE(review): the status glyphs in the messages look like mis-decoded
    emoji (mojibake). They are kept exactly as found, except that literals
    which had been split across two physical lines are rejoined on one line
    with a single space - confirm the intended characters and restore them.
    """
    if not GENERATOR_AVAILABLE:
        print("β Cannot run test - LLM generator not available")
        return

    print("π§ͺ Testing Upgraded flan-t5-base Model")
    print("=" * 60)

    # Initialize generator with base model
    print("π Initializing flan-t5-base clue generator...")
    generator = LLMClueGenerator()
    try:
        generator.initialize()
        print(f"β Generator initialized successfully with {generator.model_name}")
        print("π Model size: ~1GB (vs ~80MB for flan-t5-small)")
    except Exception as e:
        # Best-effort script: report the failure and stop instead of raising.
        print(f"β Failed to initialize generator: {e}")
        return

    # Test cases that produced terrible results with flan-t5-small
    test_cases = [
        # Previous failures with flan-t5-small:
        #   CAT + animals -> "Tryon", "Trick and treating"
        #   MEAL + food -> "Jack nixt", "fender"
        #   SONG + music -> "Gritting your teeth"
        ("CAT", "animals"),
        ("KITTY", "animals"),
        ("MEAL", "food"),
        ("HUNGER", "food"),
        ("SONG", "music"),
        ("GUITAR", "music"),
        # Your specific problematic examples
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        # Additional challenging cases
        ("DATABASE", "technology"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
    ]

    print(f"\nπ― Testing {len(test_cases)} word-topic combinations with flan-t5-base")
    print("=" * 60)

    tally = {"excellent": 0, "good": 0, "poor": 0, "failed": 0}
    # Clues graded definitional or descriptive, shown in the summary.
    major_improvements = []
    # Every style is tried for every word.
    styles = ("definition", "description", "category", "function", "context")

    for word, topic in test_cases:
        print(f"\nπ Testing: '{word}' + '{topic}'")
        print("-" * 40)

        # The try deliberately spans generation and grading so that any error
        # for one pair is counted as a failure without aborting the run.
        try:
            candidates = []
            for style in styles:
                clue = generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=style,
                    difficulty="medium",
                )
                if clue and len(clue) > 5:
                    candidates.append((style, clue))

            if not candidates:
                print("β No valid clues generated")
                tally["failed"] += 1
                continue

            print(f"Generated {len(candidates)} candidates:")
            for i, (style, clue) in enumerate(candidates, 1):
                print(f" {i}. [{style}] {clue}")

            # Use the first valid clue as best
            best_style, best_clue = candidates[0]
            print(f"\nπ Best clue [{best_style}]: {best_clue}")

            message, bucket, is_improvement = _GRADE_OUTCOMES[
                _grade_clue(word, best_clue)
            ]
            print(message)
            tally[bucket] += 1
            if is_improvement:
                major_improvements.append((word, topic, best_clue))
        except Exception as e:
            print(f"β Error generating clue: {e}")
            tally["failed"] += 1

    total_tests = len(test_cases)
    excellent_clues = tally["excellent"]
    good_clues = tally["good"]
    poor_clues = tally["poor"]
    failed_clues = tally["failed"]
    successful_clues = excellent_clues + good_clues

    print("\n" + "=" * 60)
    print("π FLAN-T5-BASE RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_clues}")
    print(f"Good clues: {good_clues}")
    print(f"Poor clues: {poor_clues}")
    print(f"Failed clues: {failed_clues}")
    print(f"Success rate: {(successful_clues / total_tests) * 100:.1f}%")
    print(f"Excellence rate: {(excellent_clues / total_tests) * 100:.1f}%")

    # Show major improvements
    if major_improvements:
        print("\nπ MAJOR IMPROVEMENTS OVER FLAN-T5-SMALL:")
        print("-" * 60)
        for word, topic, clue in major_improvements[:5]:  # Show at most 5
            print(f" {word} + {topic}: \"{clue}\"")

    # Evaluation compared to flan-t5-small (which had ~0% success)
    if excellent_clues >= total_tests * 0.4:  # 40% excellent
        print("π MAJOR SUCCESS! flan-t5-base produces excellent clues!")
        print("π Ready for production use - significant upgrade from flan-t5-small")
    elif successful_clues >= total_tests * 0.6:  # 60% good+excellent
        print("π Good improvement! Much better than flan-t5-small")
        print("β Suitable for production with semantic fallback")
    elif successful_clues >= total_tests * 0.3:  # 30% success
        print("β οΈ Some improvement over flan-t5-small, but still limited")
    else:
        print("β Still struggling - may need even larger model or external knowledge")
def main():
    """Script entry point: execute the flan-t5-base upgrade check once."""
    # Single delegation; the test function prints its own report.
    test_flan_t5_base()
# Run the test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()