#!/usr/bin/env python3
"""
Test: Superior Prompt Engineering with flan-t5-base
Test whether better prompts with examples can produce excellent clues without
switching to a larger model.
"""
import sys
import logging
from pathlib import Path
# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_superior_prompts():
    """Test superior prompt engineering with flan-t5-base."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing Superior Prompt Engineering")
    print("=" * 60)
    print("🎯 Strategy: Better prompts with examples vs larger models")

    # Initialize generator
    print("🔄 Initializing flan-t5-base with superior prompts...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~1GB with enhanced example-based prompts")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that should work well with good examples
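    # Each entry is a (word, topic) pair passed to generator.generate_clue() below.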
    test_cases = [
        # Cases that failed with poor prompting
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),
        # Additional challenging cases
        ("VIOLIN", "music"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
        ("ELEPHANT", "animals"),
    ]
print(f"\n🎯 Testing {len(test_cases)} cases with superior prompts")
print("=" * 60)
excellent_results = []
good_results = []
poor_results = []
for word, topic in test_cases:
print(f"\nπŸ“ Testing: '{word}' + '{topic}'")
print("-" * 40)
try:
# Test different prompt styles
results = {}
for style in ["definition", "description", "simple"]:
clue = generator.generate_clue(
word=word,
topic=topic,
clue_style=style,
difficulty="medium"
)
if clue and len(clue) > 3:
results[style] = clue
if results:
print("Generated clues:")
for style, clue in results.items():
print(f" [{style}] {clue}")
# Use the best result
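                # (prefer the "definition" style when available, otherwise fall back
                # to the first style that produced a clue)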
                best_style = "definition" if "definition" in results else list(results.keys())[0]
                best_clue = results[best_style]
                print(f"\n🏆 Best clue [{best_style}]: {best_clue}")

                # Quality evaluation
                word_lower = word.lower()
                clue_lower = best_clue.lower()

                # Quality checks
                contains_word = word_lower in clue_lower
                is_generic = any(bad in clue_lower for bad in [
                    "make it", "moderately challenging", "difficult", "easy"
                ])
                is_descriptive = len(best_clue.split()) >= 2 and len(best_clue) >= 6
                has_quality_words = any(good in clue_lower for good in [
                    "instrument", "player", "animal", "device", "system", "terminal",
                    "companion", "professional", "activity", "dish", "creature"
                ])
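                # Note: these are simple keyword/length heuristics rather than a full
                # quality metric; a clue only scores "excellent" if it is descriptive
                # and also contains one of the quality keywords above.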

                # Scoring
                if contains_word:
                    print("❌ Quality: POOR (contains target word)")
                    poor_results.append((word, topic, best_clue, "contains word"))
                elif is_generic:
                    print("⚠️ Quality: GENERIC (template response)")
                    poor_results.append((word, topic, best_clue, "generic"))
                elif has_quality_words and is_descriptive:
                    print("✅ Quality: EXCELLENT (specific & descriptive)")
                    excellent_results.append((word, topic, best_clue))
                elif is_descriptive:
                    print("✅ Quality: GOOD (descriptive)")
                    good_results.append((word, topic, best_clue))
                else:
                    print("🔄 Quality: ACCEPTABLE")
                    good_results.append((word, topic, best_clue))
            else:
                print("❌ No valid clues generated")
                poor_results.append((word, topic, "No clue", "failed"))

        except Exception as e:
            print(f"❌ Error: {e}")
            poor_results.append((word, topic, "Error", str(e)))

    # Results analysis
    total_tests = len(test_cases)
    excellent_count = len(excellent_results)
    good_count = len(good_results)
    poor_count = len(poor_results)

    print("\n" + "=" * 60)
    print("📊 SUPERIOR PROMPTS RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_count}")
    print(f"Good clues: {good_count}")
    print(f"Poor/Failed clues: {poor_count}")
    print(f"Success rate: {((excellent_count + good_count)/total_tests)*100:.1f}%")
    print(f"Excellence rate: {(excellent_count/total_tests)*100:.1f}%")

    # Show best results
    if excellent_results:
        print("\n🎉 EXCELLENT CLUES:")
        print("-" * 40)
        for word, topic, clue in excellent_results:
            print(f" {word} + {topic}: \"{clue}\"")

    if good_results and len(good_results) <= 5:
        print("\n✅ GOOD CLUES:")
        print("-" * 40)
        for word, topic, clue in good_results:
            print(f" {word} + {topic}: \"{clue}\"")

    # Final evaluation
    if excellent_count >= total_tests * 0.6:  # 60% excellent
        print("\n🎉 SUCCESS! Superior prompts achieve excellent results!")
        print("🚀 Ready for production - proof that better prompts > bigger models!")
    elif excellent_count >= total_tests * 0.4:  # 40% excellent
        print("\n🔄 Very promising! Superior prompts show major improvement")
        print("✅ Much better than previous attempts")
    elif (excellent_count + good_count) >= total_tests * 0.7:  # 70% success
        print("\n⚠️ Good results with superior prompts")
        print("💡 Demonstrates prompt engineering is key to success")
    else:
        print("\n❌ Still struggling even with better prompts")
        print("💡 May need combination of larger model + superior prompts")


def main():
    """Run the superior prompts test."""
    test_superior_prompts()


if __name__ == "__main__":
    main()