|
|
|
""" |
|
Test: flan-t5-large Model for Superior Crossword Clue Generation |
|
Test the most capable model to eliminate generic responses and achieve excellence. |
|
""" |
|
|
|
import sys |
|
import logging |
|
from pathlib import Path |
|
|
|
|
|
# Put this script's own directory first on the import path so the sibling
# module ``llm_clue_generator`` can be imported.
sys.path.insert(0, str(Path(__file__).parent))

# The generator is optional at import time: record whether it could be loaded
# so the test can bail out with a message instead of raising.
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# INFO-level logging with timestamp, logger name and level on every record.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
|
|
|
|
|
# Markers that cause a clue to be reported as a "template response".
_GENERIC_MARKERS = (
    "make it moderately challenging", "make it challenging",
    "make it difficult", "make it easier", "moderately challenging",
    "difficult", "easy",
)

# Markers that cause a clue to be reported as nonsensical.
_NONSENSE_MARKERS = (
    "a) a) a)", "trick and treating", "gritting your teeth",
    "jack nixt", "fender", "tryon",
)

# Words whose presence counts a clue as definitional.
_DEFINITION_WORDS = (
    "player", "instrument", "device", "system", "place", "location",
    "animal", "creature", "building", "process", "method", "concept",
    "sport", "activity", "food", "dish", "language", "tool",
)


def _has_marker(clue_lower, tokens, markers):
    """Return True if any of ``markers`` occurs in the clue.

    Multi-word markers are matched as phrases inside ``clue_lower``.
    Single-word markers must equal a whole token, so that "easy" does not
    fire on "greasy" and "fender" does not fire on "defender".
    """
    return any(
        marker in clue_lower if " " in marker else marker in tokens
        for marker in markers
    )


def _classify_clue(word, clue):
    """Grade ``clue`` for ``word``.

    Returns a ``(bucket, report_line)`` pair where ``bucket`` is one of
    "excellent", "good", "generic" or "poor" and ``report_line`` is the
    message to print for that grade. Checks run in priority order: target
    word present, nonsensical, template response, then the positive grades.
    """
    word_lower = word.lower()
    clue_lower = clue.lower()
    # Split on anything non-alphanumeric so punctuation never hides a token.
    tokens = set(
        "".join(ch if ch.isalnum() else " " for ch in clue_lower).split()
    )

    if word_lower.isalnum():
        # Whole-token match (plus simple plurals) so "cat" is not found
        # inside "domesticated".
        contains_word = any(
            word_lower + suffix in tokens for suffix in ("", "s", "es")
        )
    else:
        # A target with spaces/punctuation cannot be a single token; fall
        # back to plain substring search.
        contains_word = word_lower in clue_lower

    if contains_word:
        return "poor", "❌ Quality: POOR (contains target word)"
    if _has_marker(clue_lower, tokens, _NONSENSE_MARKERS):
        return "poor", "❌ Quality: POOR (nonsensical)"
    if _has_marker(clue_lower, tokens, _GENERIC_MARKERS):
        return "generic", "⚠️ Quality: GENERIC (template response)"

    has_definition = any(term in clue_lower for term in _DEFINITION_WORDS)
    # The disqualifying conditions were all ruled out above, so only the
    # length requirements remain.
    is_descriptive = len(clue.split()) >= 3 and len(clue) >= 10

    if has_definition and is_descriptive:
        return "excellent", "✅ Quality: EXCELLENT (definitional & descriptive)"
    if is_descriptive:
        return "good", "✅ Quality: GOOD (descriptive)"
    if has_definition:
        # An acceptable clue is tallied together with the good ones.
        return "good", "🔄 Quality: ACCEPTABLE (basic definition)"
    return "generic", "⚠️ Quality: GENERIC (basic)"


def test_flan_t5_large():
    """Run the flan-t5-large clue-quality check and print a report.

    Instantiates ``LLMClueGenerator``, requests one "definition"-style,
    "medium"-difficulty clue for each hard-coded (word, topic) pair, grades
    each clue heuristically and prints the per-clue grade followed by
    aggregate rates and an overall verdict.

    Returns early, after printing a message, when the generator could not be
    imported or when its initialization raises. Nothing is returned or
    asserted; all results are written with ``print``.
    """
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing flan-t5-large Model (No Fallbacks)")
    print("=" * 60)

    print("🔄 Initializing flan-t5-large clue generator...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~3GB (3x larger than base, 37x larger than small)")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        print("💡 Note: flan-t5-large requires ~3GB RAM and longer initialization time")
        return

    test_cases = [
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),

        ("VIOLIN", "music"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
        ("HELICOPTER", "transportation"),
        ("DEMOCRACY", "politics"),
        ("PHOTOSYNTHESIS", "science"),

        ("HAPPINESS", "emotions"),
        ("ALGORITHM", "technology"),
        ("METAPHOR", "literature"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} challenging word-topic combinations")
    print("=" * 60)

    counts = {"excellent": 0, "good": 0, "generic": 0, "poor": 0}

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        # Generation and grading share one guard: any failure for a single
        # word is reported and counted as poor without aborting the run.
        try:
            best_clue = generator.generate_clue(
                word=word,
                topic=topic,
                clue_style="definition",
                difficulty="medium"
            )

            if best_clue and len(best_clue) > 3:
                print(f"🏆 Generated clue: {best_clue}")
                bucket, report_line = _classify_clue(word, best_clue)
                print(report_line)
                counts[bucket] += 1
            else:
                print("❌ No valid clue generated")
                counts["poor"] += 1
        except Exception as e:
            print(f"❌ Error generating clue: {e}")
            counts["poor"] += 1

    total_tests = len(test_cases)
    excellent_clues = counts["excellent"]
    good_clues = counts["good"]
    generic_clues = counts["generic"]
    poor_clues = counts["poor"]

    print("\n" + "=" * 60)
    print("📊 FLAN-T5-LARGE RESULTS (NO FALLBACKS)")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_clues}")
    print(f"Good clues: {good_clues}")
    print(f"Generic clues: {generic_clues}")
    print(f"Poor clues: {poor_clues}")
    print(f"Success rate: {((excellent_clues + good_clues)/total_tests)*100:.1f}%")
    print(f"Excellence rate: {(excellent_clues/total_tests)*100:.1f}%")
    print(f"Generic rate: {(generic_clues/total_tests)*100:.1f}%")

    # Verdict tiers, checked from strictest to most lenient.
    if excellent_clues >= total_tests * 0.6:
        print("🎉 SUCCESS! flan-t5-large produces excellent crossword clues!")
        print("🚀 Ready for production - no fallbacks needed!")
    elif excellent_clues >= total_tests * 0.4 and generic_clues <= total_tests * 0.2:
        print("👍 Very good! flan-t5-large is suitable for production")
        print("✅ Significant improvement over smaller models")
    elif (excellent_clues + good_clues) >= total_tests * 0.7:
        print("⚠️ Good results, but some generic responses remain")
        print("💡 Consider prompt engineering refinements")
    else:
        print("❌ Still not meeting quality standards")
        print("💡 May need flan-t5-xl (~11GB) or different approach")
|
|
|
|
|
def main():
    """Script entry point: execute the flan-t5-large clue-quality check."""
    test_flan_t5_large()
|
|
|
|
|
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|