#!/usr/bin/env python3
"""
Quick Test: Improved Prompt Engineering
Test the improved prompts and validation on a few examples to see if clue quality improved.
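
Run directly with no arguments; logging is configured at DEBUG level so any
debug output from the clue generator is visible while the tests run.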
"""

import sys
import logging
from pathlib import Path

# Add the hack directory (where this script lives) to sys.path so local imports resolve
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging to see debug output
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


def test_improved_prompts():
    """Test improved prompt engineering with problematic examples."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return
    
    print("πŸ§ͺ Testing Improved Prompt Engineering")
    print("=" * 60)
    
    # Initialize generator
    print("πŸ”„ Initializing LLM clue generator...")
    generator = LLMClueGenerator()
    
    try:
        generator.initialize()
        print("βœ… Generator initialized successfully")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return
    
    # Test cases that previously produced bad clues
    test_cases = [
        # Previously bad examples
        ("CAT", "animals", "definition"),
        ("KITTY", "animals", "description"), 
        ("MEAL", "food", "category"),
        ("HUNGER", "food", "simple"),
        ("TECH", "technology", "category"),
        ("SCIENTIST", "science", "trivia"),
        
        # Additional test cases
        ("DOG", "animals", "definition"),
        ("PYTHON", "technology", "description"),
        ("GUITAR", "music", "category"),
    ]
    
    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations")
    print("=" * 60)
    
    successful_clues = 0
    total_tests = len(test_cases)
    
    for word, topic, style in test_cases:
        print(f"\nπŸ“ Testing: '{word}' + '{topic}' (style: {style})")
        print("-" * 40)
        
        try:
            # Generate clue candidates to see the process
            candidates = generator.generate_clue_candidates(
                word=word,
                topic=topic,
                clue_style=style,
                difficulty="medium",
                num_candidates=3
            )
            
            print(f"Generated {len(candidates)} candidates:")
            for i, candidate in enumerate(candidates, 1):
                print(f"  {i}. {candidate}")
            
            # Get best clue
            best_clue = generator.generate_clue(
                word=word,
                topic=topic,
                clue_style=style,
                difficulty="medium"
            )
            
            print(f"\nπŸ† Best clue: {best_clue}")
            
            # Evaluate quality
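            # (pass = non-empty, more than 5 characters, and the answer word
            # does not appear anywhere in the clue)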
            if best_clue and len(best_clue) > 5 and word.lower() not in best_clue.lower():
                successful_clues += 1
                print("βœ… Quality: GOOD")
            else:
                print("❌ Quality: POOR")
                
        except Exception as e:
            print(f"❌ Error generating clue: {e}")
    
    print(f"\n" + "=" * 60)
    print(f"πŸ“Š RESULTS SUMMARY")
    print(f"=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Successful clues: {successful_clues}")
    print(f"Success rate: {(successful_clues/total_tests)*100:.1f}%")
    
    if successful_clues >= total_tests * 0.7:  # 70% success rate
        print("πŸŽ‰ Improved prompts show significant improvement!")
    elif successful_clues >= total_tests * 0.4:  # 40% success rate  
        print("πŸ”„ Some improvement, but may need model upgrade")
    else:
        print("❌ Prompts still not effective, recommend semantic template approach")


def main():
    """Run the prompt improvement test."""
    test_improved_prompts()


if __name__ == "__main__":
    main()