"""Test: Superior Prompt Engineering with flan-t5-base.

Tests whether better prompts with worked examples can reach excellent clue
quality without switching to a larger model.
"""

import logging
import sys
from pathlib import Path

# Make sibling modules (llm_clue_generator) importable when this script is
# run directly from its own directory.
sys.path.insert(0, str(Path(__file__).parent))

try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False
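
# Assumed LLMClueGenerator interface (defined in llm_clue_generator.py, which
# is not shown here): initialize() loads the model, model_name identifies the
# checkpoint, and generate_clue(word=..., topic=..., clue_style=...,
# difficulty=...) returns a clue string (or None/empty on failure). A minimal
# hypothetical stub for dry-running this harness without the real module:
#
#     class LLMClueGenerator:
#         model_name = "google/flan-t5-base"
#
#         def initialize(self):
#             pass  # the real implementation would load the model here
#
#         def generate_clue(self, word, topic, clue_style, difficulty):
#             return f"placeholder {clue_style} clue about {topic}"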

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_superior_prompts():
    """Test superior prompt engineering with flan-t5-base."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing Superior Prompt Engineering")
    print("=" * 60)
    print("🎯 Strategy: better prompts with examples vs. larger models")

    print("🚀 Initializing flan-t5-base with superior prompts...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~1GB, with enhanced example-based prompts")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    test_cases = [
        # Familiar, concrete targets across common topics.
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),
        # A second batch widening the domain coverage.
        ("VIOLIN", "music"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
        ("ELEPHANT", "animals"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} cases with superior prompts")
    print("=" * 60)

    excellent_results = []
    good_results = []
    poor_results = []

    for word, topic in test_cases:
        print(f"\n🔍 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        try:
            # Generate one candidate clue per prompt style, keeping only
            # non-trivial outputs.
            results = {}
            for style in ["definition", "description", "simple"]:
                clue = generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=style,
                    difficulty="medium"
                )
                if clue and len(clue) > 3:
                    results[style] = clue

            if results:
                print("Generated clues:")
                for style, clue in results.items():
                    print(f"  [{style}] {clue}")

                # Prefer the definition style; otherwise take the first clue.
                best_style = "definition" if "definition" in results else list(results.keys())[0]
                best_clue = results[best_style]

                print(f"\n🏆 Best clue [{best_style}]: {best_clue}")

                word_lower = word.lower()
                clue_lower = best_clue.lower()

                # Heuristic quality signals for the best clue.
                contains_word = word_lower in clue_lower
                is_generic = any(bad in clue_lower for bad in [
                    "make it", "moderately challenging", "difficult", "easy"
                ])
                is_descriptive = len(best_clue.split()) >= 2 and len(best_clue) >= 6
                has_quality_words = any(good in clue_lower for good in [
                    "instrument", "player", "animal", "device", "system", "terminal",
                    "companion", "professional", "activity", "dish", "creature"
                ])
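
                # Classification priority (a simple heuristic, not a validated
                # metric): echoing the target word or matching a generic
                # template marks the clue poor; a whitelist keyword plus
                # sufficient length marks it excellent; length alone marks it
                # good; anything else passes as acceptable.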
                if contains_word:
                    print("❌ Quality: POOR (contains target word)")
                    poor_results.append((word, topic, best_clue, "contains word"))
                elif is_generic:
                    print("⚠️ Quality: GENERIC (template response)")
                    poor_results.append((word, topic, best_clue, "generic"))
                elif has_quality_words and is_descriptive:
                    print("✅ Quality: EXCELLENT (specific & descriptive)")
                    excellent_results.append((word, topic, best_clue))
                elif is_descriptive:
                    print("✅ Quality: GOOD (descriptive)")
                    good_results.append((word, topic, best_clue))
                else:
                    print("👍 Quality: ACCEPTABLE")
                    good_results.append((word, topic, best_clue))
            else:
                print("❌ No valid clues generated")
                poor_results.append((word, topic, "No clue", "failed"))

        except Exception as e:
            print(f"❌ Error: {e}")
            poor_results.append((word, topic, "Error", str(e)))

    # Summarize results across all test cases.
    total_tests = len(test_cases)
    excellent_count = len(excellent_results)
    good_count = len(good_results)
    poor_count = len(poor_results)

    print("\n" + "=" * 60)
    print("📊 SUPERIOR PROMPTS RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_count}")
    print(f"Good clues: {good_count}")
    print(f"Poor/Failed clues: {poor_count}")
    print(f"Success rate: {((excellent_count + good_count) / total_tests) * 100:.1f}%")
    print(f"Excellence rate: {(excellent_count / total_tests) * 100:.1f}%")

    if excellent_results:
        print("\n🏆 EXCELLENT CLUES:")
        print("-" * 40)
        for word, topic, clue in excellent_results:
            print(f"  {word} + {topic}: \"{clue}\"")

    if good_results and len(good_results) <= 5:
        print("\n✅ GOOD CLUES:")
        print("-" * 40)
        for word, topic, clue in good_results:
            print(f"  {word} + {topic}: \"{clue}\"")

    # Verdict thresholds: 60% excellent is a clear win, 40% is promising,
    # 70% combined success is acceptable, and anything less needs rework.
    if excellent_count >= total_tests * 0.6:
        print("\n🎉 SUCCESS! Superior prompts achieve excellent results!")
        print("🚀 Ready for production - proof that better prompts > bigger models!")
    elif excellent_count >= total_tests * 0.4:
        print("\n📈 Very promising! Superior prompts show major improvement")
        print("✅ Much better than previous attempts")
    elif (excellent_count + good_count) >= total_tests * 0.7:
        print("\n⚠️ Good results with superior prompts")
        print("💡 Demonstrates prompt engineering is key to success")
    else:
        print("\n❌ Still struggling even with better prompts")
        print("💡 May need a combination of a larger model and superior prompts")


def main():
    """Run the superior prompts test."""
    test_superior_prompts()


if __name__ == "__main__":
    main()
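
# To run (hypothetical filename - the script only assumes it sits next to
# llm_clue_generator.py):
#
#     python test_superior_prompts.py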