#!/usr/bin/env python3
"""
Test: Superior Prompt Engineering with flan-t5-base

Test if better prompts with examples can achieve excellence without larger models.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_superior_prompts():
    """Test superior prompt engineering with flan-t5-base."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing Superior Prompt Engineering")
    print("=" * 60)
    print("🎯 Strategy: Better prompts with examples vs larger models")

    # Initialize generator
    print("🔄 Initializing flan-t5-base with superior prompts...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~1GB with enhanced example-based prompts")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that should work well with good examples
    test_cases = [
        # Cases that failed with poor prompting
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),
        # Additional challenging cases
        ("VIOLIN", "music"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
        ("ELEPHANT", "animals"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} cases with superior prompts")
    print("=" * 60)

    excellent_results = []
    good_results = []
    poor_results = []

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        try:
            # Test different prompt styles
            results = {}
            for style in ["definition", "description", "simple"]:
                clue = generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=style,
                    difficulty="medium"
                )
                if clue and len(clue) > 3:
                    results[style] = clue

            if results:
                print("Generated clues:")
                for style, clue in results.items():
                    print(f"  [{style}] {clue}")

                # Use the best result
                best_style = "definition" if "definition" in results else list(results.keys())[0]
                best_clue = results[best_style]
                print(f"\n🏆 Best clue [{best_style}]: {best_clue}")

                # Quality evaluation
                word_lower = word.lower()
                clue_lower = best_clue.lower()

                # Quality checks
                contains_word = word_lower in clue_lower
                is_generic = any(bad in clue_lower for bad in [
                    "make it", "moderately challenging", "difficult", "easy"
                ])
                is_descriptive = len(best_clue.split()) >= 2 and len(best_clue) >= 6
                has_quality_words = any(good in clue_lower for good in [
                    "instrument", "player", "animal", "device", "system",
                    "terminal", "companion", "professional", "activity",
                    "dish", "creature"
                ])

                # Scoring
                if contains_word:
                    print("❌ Quality: POOR (contains target word)")
                    poor_results.append((word, topic, best_clue, "contains word"))
                elif is_generic:
                    print("⚠️ Quality: GENERIC (template response)")
                    poor_results.append((word, topic, best_clue, "generic"))
                elif has_quality_words and is_descriptive:
                    print("✅ Quality: EXCELLENT (specific & descriptive)")
                    excellent_results.append((word, topic, best_clue))
                elif is_descriptive:
                    print("✅ Quality: GOOD (descriptive)")
                    good_results.append((word, topic, best_clue))
                else:
                    print("🔄 Quality: ACCEPTABLE")
                    good_results.append((word, topic, best_clue))
            else:
                print("❌ No valid clues generated")
                poor_results.append((word, topic, "No clue", "failed"))

        except Exception as e:
            print(f"❌ Error: {e}")
            poor_results.append((word, topic, "Error", str(e)))

    # Results analysis
    total_tests = len(test_cases)
    excellent_count = len(excellent_results)
    good_count = len(good_results)
    poor_count = len(poor_results)

    print("\n" + "=" * 60)
    print("📊 SUPERIOR PROMPTS RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_count}")
    print(f"Good clues: {good_count}")
    print(f"Poor/Failed clues: {poor_count}")
    print(f"Success rate: {((excellent_count + good_count) / total_tests) * 100:.1f}%")
    print(f"Excellence rate: {(excellent_count / total_tests) * 100:.1f}%")

    # Show best results
    if excellent_results:
        print("\n🎉 EXCELLENT CLUES:")
        print("-" * 40)
        for word, topic, clue in excellent_results:
            print(f"  {word} + {topic}: \"{clue}\"")

    if good_results and len(good_results) <= 5:
        print("\n✅ GOOD CLUES:")
        print("-" * 40)
        for word, topic, clue in good_results:
            print(f"  {word} + {topic}: \"{clue}\"")

    # Final evaluation
    if excellent_count >= total_tests * 0.6:  # 60% excellent
        print("\n🎉 SUCCESS! Superior prompts achieve excellent results!")
        print("🚀 Ready for production - proof that better prompts > bigger models!")
    elif excellent_count >= total_tests * 0.4:  # 40% excellent
        print("\n🔄 Very promising! Superior prompts show major improvement")
        print("✅ Much better than previous attempts")
    elif (excellent_count + good_count) >= total_tests * 0.7:  # 70% success
        print("\n⚠️ Good results with superior prompts")
        print("💡 Demonstrates prompt engineering is key to success")
    else:
        print("\n❌ Still struggling even with better prompts")
        print("💡 May need combination of larger model + superior prompts")


def main():
    """Run the superior prompts test."""
    test_superior_prompts()


if __name__ == "__main__":
    main()