|
|
|
""" |
|
Quick Test: Improved Prompt Engineering |
|
Test the improved prompts and validation on a few examples to see if clue quality improved. |
|
""" |
|
|
|
import logging
import sys
from pathlib import Path

# Put this script's own directory first on the module search path so that
# modules sitting next to it can be imported from any working directory.
sys.path.insert(0, str(Path(__file__).parent))

# Record whether the generator could be imported instead of failing at import
# time; GENERATOR_AVAILABLE is checked before the test does any work.
try:
    from llm_clue_generator import LLMClueGenerator
except ImportError as e:
    print(f"β Import error: {e}")
    GENERATOR_AVAILABLE = False
else:
    GENERATOR_AVAILABLE = True
|
|
|
|
|
# Configure root logging once at import: DEBUG and above, with a timestamp
# and the emitting logger's name and line number on every record.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
|
def test_improved_prompts():
    """Run the clue generator over a fixed set of cases and print a report.

    For every ``(word, topic, style)`` case this prints the candidates
    returned by ``generate_clue_candidates`` and the clue returned by
    ``generate_clue``, rates that clue with a simple heuristic, and finally
    prints the totals, the success rate and a verdict.

    Returns:
        None. Everything is reported on stdout. The function returns early
        when the generator could not be imported or fails to initialize.
    """
    # NOTE(review): several status glyphs in the messages below look
    # mis-decoded (e.g. "β", "π"). Only the two that split a string literal
    # across lines were repaired - TODO confirm the intended characters.
    if not GENERATOR_AVAILABLE:
        print("β Cannot run test - LLM generator not available")
        return

    print("π§ͺ Testing Improved Prompt Engineering")
    print("=" * 60)

    print("π Initializing LLM clue generator...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print("✅ Generator initialized successfully")
    except Exception as e:
        print(f"β Failed to initialize generator: {e}")
        return

    # Each case is (word, topic, clue style).
    test_cases = [
        ("CAT", "animals", "definition"),
        ("KITTY", "animals", "description"),
        ("MEAL", "food", "category"),
        ("HUNGER", "food", "simple"),
        ("TECH", "technology", "category"),
        ("SCIENTIST", "science", "trivia"),
        ("DOG", "animals", "definition"),
        ("PYTHON", "technology", "description"),
        ("GUITAR", "music", "category"),
    ]

    print(f"\nπ― Testing {len(test_cases)} word-topic combinations")
    print("=" * 60)

    successful_clues = 0
    total_tests = len(test_cases)

    for word, topic, style in test_cases:
        print(f"\nπ Testing: '{word}' + '{topic}' (style: {style})")
        print("-" * 40)

        # A failure in one case is reported and counted as unsuccessful; the
        # remaining cases still run.
        try:
            candidates = generator.generate_clue_candidates(
                word=word,
                topic=topic,
                clue_style=style,
                difficulty="medium",
                num_candidates=3,
            )

            print(f"Generated {len(candidates)} candidates:")
            for i, candidate in enumerate(candidates, 1):
                print(f" {i}. {candidate}")

            best_clue = generator.generate_clue(
                word=word,
                topic=topic,
                clue_style=style,
                difficulty="medium",
            )

            print(f"\nπ Best clue: {best_clue}")

            # Heuristic: a clue counts as good when it is non-empty, longer
            # than five characters, and does not contain the answer word
            # (compared case-insensitively).
            if (
                best_clue
                and len(best_clue) > 5
                and word.lower() not in best_clue.lower()
            ):
                successful_clues += 1
                print("✅ Quality: GOOD")
            else:
                print("β Quality: POOR")

        except Exception as e:
            print(f"β Error generating clue: {e}")

    print("\n" + "=" * 60)
    print("π RESULTS SUMMARY")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Successful clues: {successful_clues}")
    print(f"Success rate: {(successful_clues / total_tests) * 100:.1f}%")

    # Verdict thresholds: at least 70% good clues, at least 40%, or below.
    if successful_clues >= total_tests * 0.7:
        print("π Improved prompts show significant improvement!")
    elif successful_clues >= total_tests * 0.4:
        print("π Some improvement, but may need model upgrade")
    else:
        print("β Prompts still not effective, recommend semantic template approach")
|
|
|
|
|
def main():
    """Run the prompt improvement test.

    Thin entry point that calls ``test_improved_prompts()``; it takes no
    arguments and returns None.
    """
    test_improved_prompts()
|
|
|
|
|
if __name__ == "__main__": |
|
main() |