#!/usr/bin/env python3
"""
Test: Superior Prompt Engineering with flan-t5-base
Test whether better prompts with examples can produce excellent clues without
switching to a larger model.
"""
import sys
import logging
from pathlib import Path
# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_superior_prompts():
    """Test superior prompt engineering with flan-t5-base."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing Superior Prompt Engineering")
    print("=" * 60)
    print("🎯 Strategy: Better prompts with examples vs larger models")

    # Initialize generator
    print("🔄 Initializing flan-t5-base with superior prompts...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~1GB with enhanced example-based prompts")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that should work well with good examples
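    # Each entry is a (word, topic) pair passed to generator.generate_clue() below.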
    test_cases = [
        # Cases that failed with poor prompting
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),
        # Additional challenging cases
        ("VIOLIN", "music"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
        ("ELEPHANT", "animals"),
    ]
print(f"\n🎯 Testing {len(test_cases)} cases with superior prompts")
print("=" * 60)
excellent_results = []
good_results = []
poor_results = []
for word, topic in test_cases:
print(f"\nπŸ“ Testing: '{word}' + '{topic}'")
print("-" * 40)
try:
# Test different prompt styles
results = {}
for style in ["definition", "description", "simple"]:
clue = generator.generate_clue(
word=word,
topic=topic,
clue_style=style,
difficulty="medium"
)
if clue and len(clue) > 3:
results[style] = clue
if results:
print("Generated clues:")
for style, clue in results.items():
print(f" [{style}] {clue}")
# Use the best result
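                # (prefer the "definition" style when available, otherwise fall back
                # to the first style that produced a clue)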
                best_style = "definition" if "definition" in results else list(results.keys())[0]
                best_clue = results[best_style]
                print(f"\n🏆 Best clue [{best_style}]: {best_clue}")

                # Quality evaluation
                word_lower = word.lower()
                clue_lower = best_clue.lower()

                # Quality checks
                contains_word = word_lower in clue_lower
                is_generic = any(bad in clue_lower for bad in [
                    "make it", "moderately challenging", "difficult", "easy"
                ])
                is_descriptive = len(best_clue.split()) >= 2 and len(best_clue) >= 6
                has_quality_words = any(good in clue_lower for good in [
                    "instrument", "player", "animal", "device", "system", "terminal",
                    "companion", "professional", "activity", "dish", "creature"
                ])
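                # Note: these are simple keyword/length heuristics rather than a full
                # quality metric; a clue only scores "excellent" if it is descriptive
                # and also contains one of the quality keywords above.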

                # Scoring
                if contains_word:
                    print("❌ Quality: POOR (contains target word)")
                    poor_results.append((word, topic, best_clue, "contains word"))
                elif is_generic:
                    print("⚠️ Quality: GENERIC (template response)")
                    poor_results.append((word, topic, best_clue, "generic"))
                elif has_quality_words and is_descriptive:
                    print("✅ Quality: EXCELLENT (specific & descriptive)")
                    excellent_results.append((word, topic, best_clue))
                elif is_descriptive:
                    print("✅ Quality: GOOD (descriptive)")
                    good_results.append((word, topic, best_clue))
                else:
                    print("🔄 Quality: ACCEPTABLE")
                    good_results.append((word, topic, best_clue))
            else:
                print("❌ No valid clues generated")
                poor_results.append((word, topic, "No clue", "failed"))

        except Exception as e:
            print(f"❌ Error: {e}")
            poor_results.append((word, topic, "Error", str(e)))

    # Results analysis
    total_tests = len(test_cases)
    excellent_count = len(excellent_results)
    good_count = len(good_results)
    poor_count = len(poor_results)

    print("\n" + "=" * 60)
    print("📊 SUPERIOR PROMPTS RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_count}")
    print(f"Good clues: {good_count}")
    print(f"Poor/Failed clues: {poor_count}")
    print(f"Success rate: {((excellent_count + good_count)/total_tests)*100:.1f}%")
    print(f"Excellence rate: {(excellent_count/total_tests)*100:.1f}%")

    # Show best results
    if excellent_results:
        print("\n🎉 EXCELLENT CLUES:")
        print("-" * 40)
        for word, topic, clue in excellent_results:
            print(f" {word} + {topic}: \"{clue}\"")

    if good_results and len(good_results) <= 5:
        print("\n✅ GOOD CLUES:")
        print("-" * 40)
        for word, topic, clue in good_results:
            print(f" {word} + {topic}: \"{clue}\"")

    # Final evaluation
    if excellent_count >= total_tests * 0.6:  # 60% excellent
        print("\n🎉 SUCCESS! Superior prompts achieve excellent results!")
        print("🚀 Ready for production - proof that better prompts > bigger models!")
    elif excellent_count >= total_tests * 0.4:  # 40% excellent
        print("\n🔄 Very promising! Superior prompts show major improvement")
        print("✅ Much better than previous attempts")
    elif (excellent_count + good_count) >= total_tests * 0.7:  # 70% success
        print("\n⚠️ Good results with superior prompts")
        print("💡 Demonstrates prompt engineering is key to success")
    else:
        print("\n❌ Still struggling even with better prompts")
        print("💡 May need combination of larger model + superior prompts")


def main():
    """Run the superior prompts test."""
    test_superior_prompts()


if __name__ == "__main__":
    main()