#!/usr/bin/env python3
"""
Quick Test: Improved Prompt Engineering
Test the improved prompts and validation on a few examples to see if clue quality improved.
"""
import sys
import logging
from pathlib import Path
# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False
# Set up logging to see debug output
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


def test_improved_prompts():
"""Test improved prompt engineering with problematic examples."""
if not GENERATOR_AVAILABLE:
print("β Cannot run test - LLM generator not available")
return
print("π§ͺ Testing Improved Prompt Engineering")
print("=" * 60)
# Initialize generator
print("π Initializing LLM clue generator...")
generator = LLMClueGenerator()
try:
generator.initialize()
print("β
Generator initialized successfully")
except Exception as e:
print(f"β Failed to initialize generator: {e}")
return
    # Test cases that previously produced bad clues
    test_cases = [
        # Previously bad examples
        ("CAT", "animals", "definition"),
        ("KITTY", "animals", "description"),
        ("MEAL", "food", "category"),
        ("HUNGER", "food", "simple"),
        ("TECH", "technology", "category"),
        ("SCIENTIST", "science", "trivia"),
        # Additional test cases
        ("DOG", "animals", "definition"),
        ("PYTHON", "technology", "description"),
        ("GUITAR", "music", "category"),
    ]
print(f"\nπ― Testing {len(test_cases)} word-topic combinations")
print("=" * 60)
successful_clues = 0
total_tests = len(test_cases)
    for word, topic, style in test_cases:
        print(f"\nTesting: '{word}' + '{topic}' (style: {style})")
        print("-" * 40)
        try:
            # Generate clue candidates to see the process
            candidates = generator.generate_clue_candidates(
                word=word,
                topic=topic,
                clue_style=style,
                difficulty="medium",
                num_candidates=3
            )
            print(f"Generated {len(candidates)} candidates:")
            for i, candidate in enumerate(candidates, 1):
                print(f"  {i}. {candidate}")

            # Get best clue
            best_clue = generator.generate_clue(
                word=word,
                topic=topic,
                clue_style=style,
                difficulty="medium"
            )
            print(f"\nBest clue: {best_clue}")

            # Evaluate quality: non-trivial length and no leakage of the answer word
            if best_clue and len(best_clue) > 5 and word.lower() not in best_clue.lower():
                successful_clues += 1
                print("✅ Quality: GOOD")
            else:
                print("❌ Quality: POOR")
        except Exception as e:
            print(f"❌ Error generating clue: {e}")
print(f"\n" + "=" * 60)
print(f"π RESULTS SUMMARY")
print(f"=" * 60)
print(f"Total tests: {total_tests}")
print(f"Successful clues: {successful_clues}")
print(f"Success rate: {(successful_clues/total_tests)*100:.1f}%")
if successful_clues >= total_tests * 0.7: # 70% success rate
print("π Improved prompts show significant improvement!")
elif successful_clues >= total_tests * 0.4: # 40% success rate
print("π Some improvement, but may need model upgrade")
else:
print("β Prompts still not effective, recommend semantic template approach")


def main():
    """Run the prompt improvement test."""
    test_improved_prompts()


if __name__ == "__main__":
    main()
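
# Usage sketch (assumed invocation; this script's actual filename/path is not shown here):
#   python3 <this_script>.py
# Logging is configured at DEBUG on the root logger, so any `logging` output emitted by
# llm_clue_generator will appear interleaved with the per-word candidates and the final
# RESULTS SUMMARY success-rate report.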