#!/usr/bin/env python3
"""
Test: Superior Prompt Engineering with flan-t5-base
Test if better prompts with examples can achieve excellence without larger models.
"""
import sys
import logging
from pathlib import Path
# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
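
# Illustrative sketch only (not part of the test): an assumed example of the
# few-shot, example-based prompt style this test relies on. The template text
# and example pairs below are hypothetical; the real prompt is built inside
# LLMClueGenerator.
_EXAMPLE_PROMPT_SKETCH = (
    "Write a crossword clue for the word. Do not use the word itself.\n"
    "Word: GUITAR (topic: music) -> Clue: Six-stringed instrument\n"
    "Word: DOCTOR (topic: medicine) -> Clue: Medical professional who treats patients\n"
    "Word: {word} (topic: {topic}) -> Clue:"
)
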
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

def test_superior_prompts():
    """Test superior prompt engineering with flan-t5-base."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing Superior Prompt Engineering")
    print("=" * 60)
    print("🎯 Strategy: Better prompts with examples vs larger models")

    # Initialize generator
    print("🚀 Initializing flan-t5-base with superior prompts...")
    generator = LLMClueGenerator()
    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~1GB with enhanced example-based prompts")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that should work well with good examples
    test_cases = [
        # Cases that failed with poor prompting
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),
        # Additional challenging cases
        ("VIOLIN", "music"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
        ("ELEPHANT", "animals"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} cases with superior prompts")
    print("=" * 60)

    excellent_results = []
    good_results = []
    poor_results = []

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)
        try:
            # Test the different prompt styles and keep every non-trivial clue
            results = {}
            for style in ["definition", "description", "simple"]:
                clue = generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=style,
                    difficulty="medium"
                )
                if clue and len(clue) > 3:
                    results[style] = clue

            if results:
                print("Generated clues:")
                for style, clue in results.items():
                    print(f" [{style}] {clue}")

                # Use the best result: prefer "definition", else the first style that produced a clue
                best_style = "definition" if "definition" in results else list(results.keys())[0]
                best_clue = results[best_style]
                print(f"\n🏆 Best clue [{best_style}]: {best_clue}")

                # Quality evaluation
                word_lower = word.lower()
                clue_lower = best_clue.lower()

                # Quality checks: the clue must not leak the answer, must not echo
                # the prompt template, and should be reasonably descriptive
                contains_word = word_lower in clue_lower
                is_generic = any(bad in clue_lower for bad in [
                    "make it", "moderately challenging", "difficult", "easy"
                ])
                is_descriptive = len(best_clue.split()) >= 2 and len(best_clue) >= 6
                has_quality_words = any(good in clue_lower for good in [
                    "instrument", "player", "animal", "device", "system", "terminal",
                    "companion", "professional", "activity", "dish", "creature"
                ])

                # Scoring
                if contains_word:
                    print("❌ Quality: POOR (contains target word)")
                    poor_results.append((word, topic, best_clue, "contains word"))
                elif is_generic:
                    print("⚠️ Quality: GENERIC (template response)")
                    poor_results.append((word, topic, best_clue, "generic"))
                elif has_quality_words and is_descriptive:
                    print("✅ Quality: EXCELLENT (specific & descriptive)")
                    excellent_results.append((word, topic, best_clue))
                elif is_descriptive:
                    print("✅ Quality: GOOD (descriptive)")
                    good_results.append((word, topic, best_clue))
                else:
                    print("👍 Quality: ACCEPTABLE")
                    good_results.append((word, topic, best_clue))
            else:
                print("❌ No valid clues generated")
                poor_results.append((word, topic, "No clue", "failed"))
        except Exception as e:
            print(f"❌ Error: {e}")
            poor_results.append((word, topic, "Error", str(e)))

    # Results analysis
    total_tests = len(test_cases)
    excellent_count = len(excellent_results)
    good_count = len(good_results)
    poor_count = len(poor_results)

    print("\n" + "=" * 60)
    print("📊 SUPERIOR PROMPTS RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_count}")
    print(f"Good clues: {good_count}")
    print(f"Poor/Failed clues: {poor_count}")
    print(f"Success rate: {((excellent_count + good_count) / total_tests) * 100:.1f}%")
    print(f"Excellence rate: {(excellent_count / total_tests) * 100:.1f}%")

    # Show best results
    if excellent_results:
        print("\n🏆 EXCELLENT CLUES:")
        print("-" * 40)
        for word, topic, clue in excellent_results:
            print(f" {word} + {topic}: \"{clue}\"")

    if good_results and len(good_results) <= 5:
        print("\n✅ GOOD CLUES:")
        print("-" * 40)
        for word, topic, clue in good_results:
            print(f" {word} + {topic}: \"{clue}\"")

    # Final evaluation
    if excellent_count >= total_tests * 0.6:  # 60% excellent
        print("\n🎉 SUCCESS! Superior prompts achieve excellent results!")
        print("🚀 Ready for production - proof that better prompts > bigger models!")
    elif excellent_count >= total_tests * 0.4:  # 40% excellent
        print("\n👍 Very promising! Superior prompts show major improvement")
        print("✅ Much better than previous attempts")
    elif (excellent_count + good_count) >= total_tests * 0.7:  # 70% success
        print("\n⚠️ Good results with superior prompts")
        print("💡 Demonstrates prompt engineering is key to success")
    else:
        print("\n❌ Still struggling even with better prompts")
        print("💡 May need combination of larger model + superior prompts")


def main():
    """Run the superior prompts test."""
    test_superior_prompts()


if __name__ == "__main__":
    main()
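
# Usage note (assumption: llm_clue_generator.py and the flan-t5-base model
# dependencies are available alongside this script):
#   run this file directly with Python 3, e.g. `python3 <path-to-this-script>`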