#!/usr/bin/env python3
"""
Quick Test: Improved Prompt Engineering
Exercise the improved prompts and validation on a handful of examples to check whether clue quality has improved.
"""
import sys
import logging
from pathlib import Path
# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False
# Set up logging to see debug output
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
def test_improved_prompts():
"""Test improved prompt engineering with problematic examples."""
if not GENERATOR_AVAILABLE:
print("❌ Cannot run test - LLM generator not available")
return
print("πŸ§ͺ Testing Improved Prompt Engineering")
print("=" * 60)
# Initialize generator
print("πŸ”„ Initializing LLM clue generator...")
generator = LLMClueGenerator()
try:
generator.initialize()
print("βœ… Generator initialized successfully")
except Exception as e:
print(f"❌ Failed to initialize generator: {e}")
return
# Test cases that previously produced bad clues
test_cases = [
# Previously bad examples
("CAT", "animals", "definition"),
("KITTY", "animals", "description"),
("MEAL", "food", "category"),
("HUNGER", "food", "simple"),
("TECH", "technology", "category"),
("SCIENTIST", "science", "trivia"),
# Additional test cases
("DOG", "animals", "definition"),
("PYTHON", "technology", "description"),
("GUITAR", "music", "category"),
]
print(f"\n🎯 Testing {len(test_cases)} word-topic combinations")
print("=" * 60)
successful_clues = 0
total_tests = len(test_cases)
for word, topic, style in test_cases:
print(f"\nπŸ“ Testing: '{word}' + '{topic}' (style: {style})")
print("-" * 40)
try:
# Generate clue candidates to see the process
candidates = generator.generate_clue_candidates(
word=word,
topic=topic,
clue_style=style,
difficulty="medium",
num_candidates=3
)
print(f"Generated {len(candidates)} candidates:")
for i, candidate in enumerate(candidates, 1):
print(f" {i}. {candidate}")
# Get best clue
best_clue = generator.generate_clue(
word=word,
topic=topic,
clue_style=style,
difficulty="medium"
)
print(f"\nπŸ† Best clue: {best_clue}")
# Evaluate quality
if best_clue and len(best_clue) > 5 and word.lower() not in best_clue.lower():
successful_clues += 1
print("βœ… Quality: GOOD")
else:
print("❌ Quality: POOR")
except Exception as e:
print(f"❌ Error generating clue: {e}")
print(f"\n" + "=" * 60)
print(f"πŸ“Š RESULTS SUMMARY")
print(f"=" * 60)
print(f"Total tests: {total_tests}")
print(f"Successful clues: {successful_clues}")
print(f"Success rate: {(successful_clues/total_tests)*100:.1f}%")
if successful_clues >= total_tests * 0.7: # 70% success rate
print("πŸŽ‰ Improved prompts show significant improvement!")
elif successful_clues >= total_tests * 0.4: # 40% success rate
print("πŸ”„ Some improvement, but may need model upgrade")
else:
print("❌ Prompts still not effective, recommend semantic template approach")
def main():
"""Run the prompt improvement test."""
test_improved_prompts()
if __name__ == "__main__":
    main()