#!/usr/bin/env python3
"""
Test: Superior Prompt Engineering with flan-t5-base
Checks whether example-rich prompts can reach excellent clue quality with flan-t5-base alone, without moving to a larger model.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False
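
# Assumed LLMClueGenerator interface (inferred from how it is used below):
#   - initialize(): loads the model
#   - model_name: attribute naming the loaded checkpoint
#   - generate_clue(word=..., topic=..., clue_style=..., difficulty=...):
#       returns a clue string, or None/empty on failure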

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_superior_prompts():
    """Test superior prompt engineering with flan-t5-base."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return
    
    print("πŸ§ͺ Testing Superior Prompt Engineering")
    print("=" * 60)
    print("🎯 Strategy: Better prompts with examples vs larger models")
    
    # Initialize generator
    print("πŸ”„ Initializing flan-t5-base with superior prompts...")
    generator = LLMClueGenerator()
    
    try:
        generator.initialize()
        print(f"βœ… Generator initialized successfully with {generator.model_name}")
        print(f"πŸ“Š Model size: ~1GB with enhanced example-based prompts")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return
    
    # Test cases that should work well with good examples
    test_cases = [
        # Cases that failed with poor prompting
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),
        
        # Additional challenging cases
        ("VIOLIN", "music"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
        ("ELEPHANT", "animals"),
    ]
    
    print(f"\n🎯 Testing {len(test_cases)} cases with superior prompts")
    print("=" * 60)
    
    excellent_results = []
    good_results = []
    poor_results = []
    
    for word, topic in test_cases:
        print(f"\nπŸ“ Testing: '{word}' + '{topic}'")
        print("-" * 40)
        
        try:
            # Test different prompt styles
            results = {}
            for style in ["definition", "description", "simple"]:
                clue = generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=style,
                    difficulty="medium"
                )
                if clue and len(clue) > 3:
                    results[style] = clue
            
            if results:
                print("Generated clues:")
                for style, clue in results.items():
                    print(f"  [{style}] {clue}")
                
                # Use the best result
                best_style = "definition" if "definition" in results else list(results.keys())[0]
                best_clue = results[best_style]
                
                print(f"\nπŸ† Best clue [{best_style}]: {best_clue}")
                
                # Quality evaluation
                word_lower = word.lower()
                clue_lower = best_clue.lower()
                
                # Quality checks
                contains_word = word_lower in clue_lower
                is_generic = any(bad in clue_lower for bad in [
                    "make it", "moderately challenging", "difficult", "easy"
                ])
                is_descriptive = len(best_clue.split()) >= 2 and len(best_clue) >= 6
                has_quality_words = any(good in clue_lower for good in [
                    "instrument", "player", "animal", "device", "system", "terminal", 
                    "companion", "professional", "activity", "dish", "creature"
                ])
                
                # Scoring: reject clues that leak the target word, flag
                # template-style responses as generic, then rank the rest
                # EXCELLENT > GOOD > ACCEPTABLE using the descriptiveness
                # and quality-word heuristics above.
                if contains_word:
                    print("❌ Quality: POOR (contains target word)")
                    poor_results.append((word, topic, best_clue, "contains word"))
                elif is_generic:
                    print("⚠️  Quality: GENERIC (template response)")
                    poor_results.append((word, topic, best_clue, "generic"))
                elif has_quality_words and is_descriptive:
                    print("βœ… Quality: EXCELLENT (specific & descriptive)")
                    excellent_results.append((word, topic, best_clue))
                elif is_descriptive:
                    print("βœ… Quality: GOOD (descriptive)")
                    good_results.append((word, topic, best_clue))
                else:
                    print("πŸ”„ Quality: ACCEPTABLE")
                    good_results.append((word, topic, best_clue))
            else:
                print("❌ No valid clues generated")
                poor_results.append((word, topic, "No clue", "failed"))
                
        except Exception as e:
            print(f"❌ Error: {e}")
            poor_results.append((word, topic, "Error", str(e)))
    
    # Results analysis
    total_tests = len(test_cases)
    excellent_count = len(excellent_results)
    good_count = len(good_results)
    poor_count = len(poor_results)
    
    print(f"\n" + "=" * 60)
    print(f"πŸ“Š SUPERIOR PROMPTS RESULTS")
    print(f"=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_count}")
    print(f"Good clues: {good_count}")
    print(f"Poor/Failed clues: {poor_count}")
    print(f"Success rate: {((excellent_count + good_count)/total_tests)*100:.1f}%")
    print(f"Excellence rate: {(excellent_count/total_tests)*100:.1f}%")
    
    # Show best results
    if excellent_results:
        print(f"\nπŸŽ‰ EXCELLENT CLUES:")
        print("-" * 40)
        for word, topic, clue in excellent_results:
            print(f"  {word} + {topic}: \"{clue}\"")
    
    if good_results and len(good_results) <= 5:
        print(f"\nβœ… GOOD CLUES:")
        print("-" * 40)
        for word, topic, clue in good_results:
            print(f"  {word} + {topic}: \"{clue}\"")
    
    # Final evaluation
    if excellent_count >= total_tests * 0.6:  # 60% excellent
        print("\nπŸŽ‰ SUCCESS! Superior prompts achieve excellent results!")
        print("πŸš€ Ready for production - proof that better prompts > bigger models!")
    elif excellent_count >= total_tests * 0.4:  # 40% excellent
        print("\nπŸ”„ Very promising! Superior prompts show major improvement")
        print("βœ… Much better than previous attempts")
    elif (excellent_count + good_count) >= total_tests * 0.7:  # 70% success
        print("\n⚠️  Good results with superior prompts")
        print("πŸ’‘ Demonstrates prompt engineering is key to success")
    else:
        print("\n❌ Still struggling even with better prompts")
        print("πŸ’‘ May need combination of larger model + superior prompts")


def main():
    """Run the superior prompts test."""
    test_superior_prompts()


if __name__ == "__main__":
    main()