# Source: hack/test_flan_t5_base.py (Hugging Face repo page, commit 486eff6)
# Author: vimalk78 — feat(crossword): generated crosswords with clues
# (page chrome "raw / history / blame", file size 7.63 kB, removed so the file parses)
#!/usr/bin/env python3
"""
Test: Upgraded flan-t5-base Model for Crossword Clue Generation
Compare flan-t5-base performance against the previous flan-t5-small results.
"""
import sys
import logging
from pathlib import Path

# Make sibling modules in the hack/ directory importable when this file is
# run directly as a script (rather than as part of an installed package).
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    # Degrade gracefully: remember that the generator is missing so the
    # test function can bail out with a clear message instead of crashing.
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
def _generate_candidates(generator, word, topic):
    """Generate one clue per prompt style and return the usable ones.

    Tries each of the five clue styles at medium difficulty and keeps any
    result that is non-empty and longer than five characters.

    Returns:
        list[tuple[str, str]]: (style, clue) pairs in style order.
    """
    styles = ("definition", "description", "category", "function", "context")
    candidates = []
    for style in styles:
        clue = generator.generate_clue(
            word=word,
            topic=topic,
            clue_style=style,
            difficulty="medium"
        )
        # Minimal sanity check: drop empty / trivially short outputs.
        if clue and len(clue) > 5:
            candidates.append((style, clue))
    return candidates


def _classify_clue(word, clue):
    """Classify a generated clue's quality for the summary tallies.

    Ordering matters and mirrors the original elif chain: hard failures
    (answer leakage, known flan-t5-small nonsense) are checked first, then
    definitional quality, then generic descriptiveness.

    Returns:
        str: one of 'contains_word', 'nonsense', 'excellent', 'good',
        'acceptable'.
    """
    word_lower = word.lower()
    clue_lower = clue.lower()

    # A crossword clue must never contain its own answer.
    if word_lower in clue_lower:
        return "contains_word"

    # Garbage phrases previously emitted by flan-t5-small.
    nonsense_markers = (
        "trick and treating", "gritting your teeth", "jack nixt",
        "fender", "tryon", "nicolas", "occurrence", "sludge"
    )
    if any(bad in clue_lower for bad in nonsense_markers):
        return "nonsense"

    # Category nouns that indicate a proper definitional clue.
    definitional_words = (
        "player", "sport", "instrument", "device", "system", "food",
        "language", "place", "animal", "creature", "location"
    )
    if any(def_word in clue_lower for def_word in definitional_words):
        return "excellent"

    # Descriptive: multi-word and reasonably long.
    if len(clue.split()) >= 2 and len(clue) >= 8:
        return "good"

    return "acceptable"


def test_flan_t5_base():
    """Test flan-t5-base model with problematic examples that failed with flan-t5-small.

    Initializes the LLM clue generator, generates candidate clues for a
    fixed list of word/topic pairs, grades the best clue of each pair, and
    prints a summary comparing quality against the earlier flan-t5-small
    results. Prints and returns None; all output goes to stdout.
    """
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("πŸ§ͺ Testing Upgraded flan-t5-base Model")
    print("=" * 60)

    # Initialize generator with base model
    print("πŸ”„ Initializing flan-t5-base clue generator...")
    generator = LLMClueGenerator()
    try:
        generator.initialize()
        print(f"βœ… Generator initialized successfully with {generator.model_name}")
        print("πŸ“Š Model size: ~1GB (vs ~80MB for flan-t5-small)")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that produced terrible results with flan-t5-small, e.g.:
    #   CAT + animals β†’ "Tryon", "Trick and treating"
    #   MEAL + food   β†’ "Jack nixt", "fender"
    #   SONG + music  β†’ "Gritting your teeth"
    test_cases = [
        ("CAT", "animals"),
        ("KITTY", "animals"),
        ("MEAL", "food"),
        ("HUNGER", "food"),
        ("SONG", "music"),
        ("GUITAR", "music"),
        # Specific problematic examples reported by the user
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        # Additional challenging cases
        ("DATABASE", "technology"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations with flan-t5-base")
    print("=" * 60)

    excellent_clues = 0
    good_clues = 0
    poor_clues = 0
    failed_clues = 0
    # (word, topic, clue) triples that clearly beat flan-t5-small.
    major_improvements = []

    for word, topic in test_cases:
        print(f"\nπŸ“ Testing: '{word}' + '{topic}'")
        print("-" * 40)
        try:
            candidates = _generate_candidates(generator, word, topic)
            if not candidates:
                print("❌ No valid clues generated")
                failed_clues += 1
                continue

            print(f"Generated {len(candidates)} candidates:")
            for i, (style, clue) in enumerate(candidates, 1):
                print(f" {i}. [{style}] {clue}")

            # Use the first valid clue as best.
            best_style, best_clue = candidates[0]
            print(f"\nπŸ† Best clue [{best_style}]: {best_clue}")

            quality = _classify_clue(word, best_clue)
            if quality == "contains_word":
                print("❌ Quality: POOR (contains target word)")
                poor_clues += 1
            elif quality == "nonsense":
                print("❌ Quality: POOR (nonsensical)")
                poor_clues += 1
            elif quality == "excellent":
                print("βœ… Quality: EXCELLENT (definitional)")
                excellent_clues += 1
                major_improvements.append((word, topic, best_clue))
            elif quality == "good":
                print("βœ… Quality: GOOD (descriptive)")
                good_clues += 1
                major_improvements.append((word, topic, best_clue))
            else:
                print("πŸ”„ Quality: ACCEPTABLE")
                good_clues += 1
        except Exception as e:
            print(f"❌ Error generating clue: {e}")
            failed_clues += 1

    total_tests = len(test_cases)
    print("\n" + "=" * 60)
    print("πŸ“Š FLAN-T5-BASE RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_clues}")
    print(f"Good clues: {good_clues}")
    print(f"Poor clues: {poor_clues}")
    print(f"Failed clues: {failed_clues}")
    print(f"Success rate: {((excellent_clues + good_clues)/total_tests)*100:.1f}%")
    print(f"Excellence rate: {(excellent_clues/total_tests)*100:.1f}%")

    # Show major improvements
    if major_improvements:
        print("\nπŸŽ‰ MAJOR IMPROVEMENTS OVER FLAN-T5-SMALL:")
        print("-" * 60)
        for word, topic, clue in major_improvements[:5]:  # Show top 5
            print(f" {word} + {topic}: \"{clue}\"")

    # Overall verdict relative to flan-t5-small (which had ~0% success).
    if excellent_clues >= total_tests * 0.4:  # 40% excellent
        print("πŸŽ‰ MAJOR SUCCESS! flan-t5-base produces excellent clues!")
        print("πŸš€ Ready for production use - significant upgrade from flan-t5-small")
    elif (excellent_clues + good_clues) >= total_tests * 0.6:  # 60% good+excellent
        print("πŸ”„ Good improvement! Much better than flan-t5-small")
        print("βœ… Suitable for production with semantic fallback")
    elif (excellent_clues + good_clues) >= total_tests * 0.3:  # 30% success
        print("⚠️ Some improvement over flan-t5-small, but still limited")
    else:
        print("❌ Still struggling - may need even larger model or external knowledge")
def main():
    """Entry point: execute the flan-t5-base upgrade comparison run."""
    return test_flan_t5_base()


if __name__ == "__main__":
    main()