# Provenance: hack/test_flan_t5_large.py — commit 486eff6,
# "feat(crossword): generated crosswords with clues" (vimalk78).
#!/usr/bin/env python3
"""
Test: flan-t5-large Model for Superior Crossword Clue Generation
Test the most capable model to eliminate generic responses and achieve excellence.
"""
import sys
import logging
from pathlib import Path

# Make sibling modules in the hack/ directory importable when run as a script.
sys.path.insert(0, str(Path(__file__).parent))

# The generator lives in the project tree; degrade gracefully when it is
# missing so the test can report the problem instead of crashing on import.
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Module-wide logging configuration for the test run.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
def test_flan_t5_large():
"""Test flan-t5-large model for superior crossword clue quality."""
if not GENERATOR_AVAILABLE:
print("❌ Cannot run test - LLM generator not available")
return
print("πŸ§ͺ Testing flan-t5-large Model (No Fallbacks)")
print("=" * 60)
# Initialize generator with large model
print("πŸ”„ Initializing flan-t5-large clue generator...")
generator = LLMClueGenerator()
try:
generator.initialize()
print(f"βœ… Generator initialized successfully with {generator.model_name}")
print(f"πŸ“Š Model size: ~3GB (3x larger than base, 37x larger than small)")
except Exception as e:
print(f"❌ Failed to initialize generator: {e}")
print("πŸ’‘ Note: flan-t5-large requires ~3GB RAM and longer initialization time")
return
# Challenging test cases that should be handled well by a large model
test_cases = [
# Basic cases that failed with smaller models
("CAT", "animals"),
("BATSMAN", "cricket"),
("SWIMMING", "sports"),
("AIRPORT", "transportation"),
("DATABASE", "technology"),
# More challenging cases requiring world knowledge
("VIOLIN", "music"),
("SCIENTIST", "science"),
("PIZZA", "food"),
("MOUNTAIN", "geography"),
("HELICOPTER", "transportation"),
("DEMOCRACY", "politics"),
("PHOTOSYNTHESIS", "science"),
# Abstract concepts
("HAPPINESS", "emotions"),
("ALGORITHM", "technology"),
("METAPHOR", "literature"),
]
print(f"\n🎯 Testing {len(test_cases)} challenging word-topic combinations")
print("=" * 60)
excellent_clues = 0
good_clues = 0
generic_clues = 0
poor_clues = 0
for word, topic in test_cases:
print(f"\nπŸ“ Testing: '{word}' + '{topic}'")
print("-" * 40)
try:
# Test the best-performing clue style
best_clue = generator.generate_clue(
word=word,
topic=topic,
clue_style="definition", # Usually produces the best results
difficulty="medium"
)
if best_clue and len(best_clue) > 3:
print(f"πŸ† Generated clue: {best_clue}")
# Comprehensive quality evaluation
word_lower = word.lower()
clue_lower = best_clue.lower()
# Critical quality checks
contains_word = word_lower in clue_lower
is_generic = any(generic in clue_lower for generic in [
"make it moderately challenging", "make it challenging",
"make it difficult", "make it easier", "moderately challenging",
"difficult", "easy"
])
is_nonsensical = any(nonsense in clue_lower for nonsense in [
"a) a) a)", "trick and treating", "gritting your teeth",
"jack nixt", "fender", "tryon"
])
# Positive quality indicators
has_definition = any(def_word in clue_lower for def_word in [
"player", "instrument", "device", "system", "place", "location",
"animal", "creature", "building", "process", "method", "concept",
"sport", "activity", "food", "dish", "language", "tool"
])
is_descriptive = (
len(best_clue.split()) >= 3 and
len(best_clue) >= 10 and
not contains_word and
not is_generic and
not is_nonsensical
)
# Quality scoring
if contains_word:
print("❌ Quality: POOR (contains target word)")
poor_clues += 1
elif is_nonsensical:
print("❌ Quality: POOR (nonsensical)")
poor_clues += 1
elif is_generic:
print("⚠️ Quality: GENERIC (template response)")
generic_clues += 1
elif has_definition and is_descriptive:
print("βœ… Quality: EXCELLENT (definitional & descriptive)")
excellent_clues += 1
elif is_descriptive:
print("βœ… Quality: GOOD (descriptive)")
good_clues += 1
elif has_definition:
print("πŸ”„ Quality: ACCEPTABLE (basic definition)")
good_clues += 1
else:
print("⚠️ Quality: GENERIC (basic)")
generic_clues += 1
else:
print("❌ No valid clue generated")
poor_clues += 1
except Exception as e:
print(f"❌ Error generating clue: {e}")
poor_clues += 1
total_tests = len(test_cases)
print(f"\n" + "=" * 60)
print(f"πŸ“Š FLAN-T5-LARGE RESULTS (NO FALLBACKS)")
print(f"=" * 60)
print(f"Total tests: {total_tests}")
print(f"Excellent clues: {excellent_clues}")
print(f"Good clues: {good_clues}")
print(f"Generic clues: {generic_clues}")
print(f"Poor clues: {poor_clues}")
print(f"Success rate: {((excellent_clues + good_clues)/total_tests)*100:.1f}%")
print(f"Excellence rate: {(excellent_clues/total_tests)*100:.1f}%")
print(f"Generic rate: {(generic_clues/total_tests)*100:.1f}%")
# Final evaluation - high standards for large model
if excellent_clues >= total_tests * 0.6: # 60% excellent
print("πŸŽ‰ SUCCESS! flan-t5-large produces excellent crossword clues!")
print("πŸš€ Ready for production - no fallbacks needed!")
elif excellent_clues >= total_tests * 0.4 and generic_clues <= total_tests * 0.2: # 40% excellent, <20% generic
print("πŸ”„ Very good! flan-t5-large is suitable for production")
print("βœ… Significant improvement over smaller models")
elif (excellent_clues + good_clues) >= total_tests * 0.7: # 70% good+excellent
print("⚠️ Good results, but some generic responses remain")
print("πŸ’‘ Consider prompt engineering refinements")
else:
print("❌ Still not meeting quality standards")
print("πŸ’‘ May need flan-t5-xl (~11GB) or different approach")
def main():
    """Script entry point: run the flan-t5-large quality test."""
    test_flan_t5_large()


if __name__ == "__main__":
    main()