#!/usr/bin/env python3
"""
Quick Test: Improved Prompt Engineering
Exercise the improved prompts and validation on a handful of examples to check whether clue quality has improved.
"""
import sys
import logging
from pathlib import Path
# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False
# Set up logging to see debug output
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
def test_improved_prompts():
"""Test improved prompt engineering with problematic examples."""
if not GENERATOR_AVAILABLE:
print("❌ Cannot run test - LLM generator not available")
return
print("πŸ§ͺ Testing Improved Prompt Engineering")
print("=" * 60)
# Initialize generator
print("πŸ”„ Initializing LLM clue generator...")
generator = LLMClueGenerator()
try:
generator.initialize()
print("βœ… Generator initialized successfully")
except Exception as e:
print(f"❌ Failed to initialize generator: {e}")
return
# Test cases that previously produced bad clues
test_cases = [
# Previously bad examples
("CAT", "animals", "definition"),
("KITTY", "animals", "description"),
("MEAL", "food", "category"),
("HUNGER", "food", "simple"),
("TECH", "technology", "category"),
("SCIENTIST", "science", "trivia"),
# Additional test cases
("DOG", "animals", "definition"),
("PYTHON", "technology", "description"),
("GUITAR", "music", "category"),
]
print(f"\n🎯 Testing {len(test_cases)} word-topic combinations")
print("=" * 60)
successful_clues = 0
total_tests = len(test_cases)
for word, topic, style in test_cases:
print(f"\nπŸ“ Testing: '{word}' + '{topic}' (style: {style})")
print("-" * 40)
try:
# Generate clue candidates to see the process
candidates = generator.generate_clue_candidates(
word=word,
topic=topic,
clue_style=style,
difficulty="medium",
num_candidates=3
)
print(f"Generated {len(candidates)} candidates:")
for i, candidate in enumerate(candidates, 1):
print(f" {i}. {candidate}")
# Get best clue
best_clue = generator.generate_clue(
word=word,
topic=topic,
clue_style=style,
difficulty="medium"
)
print(f"\nπŸ† Best clue: {best_clue}")
# Evaluate quality
if best_clue and len(best_clue) > 5 and word.lower() not in best_clue.lower():
successful_clues += 1
print("βœ… Quality: GOOD")
else:
print("❌ Quality: POOR")
except Exception as e:
print(f"❌ Error generating clue: {e}")
print(f"\n" + "=" * 60)
print(f"πŸ“Š RESULTS SUMMARY")
print(f"=" * 60)
print(f"Total tests: {total_tests}")
print(f"Successful clues: {successful_clues}")
print(f"Success rate: {(successful_clues/total_tests)*100:.1f}%")
if successful_clues >= total_tests * 0.7: # 70% success rate
print("πŸŽ‰ Improved prompts show significant improvement!")
elif successful_clues >= total_tests * 0.4: # 40% success rate
print("πŸ”„ Some improvement, but may need model upgrade")
else:
print("❌ Prompts still not effective, recommend semantic template approach")
def main():
"""Run the prompt improvement test."""
test_improved_prompts()
if __name__ == "__main__":
    main()