#!/usr/bin/env python3
"""
Test Multiple Models via API for Crossword Clue Generation
Compare various models and find the best performer.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from api_clue_generator import APIClueGenerator
    API_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    API_AVAILABLE = False
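
# Assumed APIClueGenerator interface, inferred from how it is used below
# (the authoritative definition lives in api_clue_generator.py):
#   generator.models                             -> {model_key: model_id}
#   generator.generate_clue(word, topic)         -> {model_key: clue or None}
#   generator.evaluate_clue_quality(word, clue)  -> (quality_label, score)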

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_multiple_models():
    """Test multiple models via API and compare results."""
    if not API_AVAILABLE:
        print("❌ Cannot run test - API generator not available")
        return

    print("🧪 Testing Multiple Models via Hugging Face API")
    print("=" * 60)

    # Initialize API generator
    generator = APIClueGenerator()
print(f"🎯 Testing {len(generator.models)} models:")
for i, (key, model) in enumerate(generator.models.items(), 1):
print(f" {i}. {key} ({model})")
# Test cases for comprehensive evaluation
test_cases = [
# Cases that failed with local models
("CAT", "animals"),
("BATSMAN", "cricket"),
("SWIMMING", "sports"),
("AIRPORT", "transportation"),
("DATABASE", "technology"),
# Additional challenging cases
("VIOLIN", "music"),
("PIZZA", "food"),
("SCIENTIST", "science"),
("MOUNTAIN", "geography"),
("ELEPHANT", "animals"),
]
print(f"\nπŸ“Š Testing {len(test_cases)} word-topic combinations")
print("=" * 60)
# Track results for each model
model_scores = {model_key: {"total": 0, "excellent": 0, "good": 0, "poor": 0, "failed": 0}
for model_key in generator.models.keys()}
all_results = []
for i, (word, topic) in enumerate(test_cases, 1):
print(f"\nπŸ“ Test {i}/{len(test_cases)}: '{word}' + '{topic}'")
print("-" * 50)

        try:
            # Generate clues with all models
            results = generator.generate_clue(word, topic)
            test_result = {"word": word, "topic": topic, "results": {}}

            # Evaluate each model's response
            for model_key, clue in results.items():
                if clue:
                    quality, score = generator.evaluate_clue_quality(word, clue)
                    test_result["results"][model_key] = {"clue": clue, "quality": quality, "score": score}

                    # Update model statistics
                    model_scores[model_key]["total"] += 1
                    if quality == "EXCELLENT":
                        model_scores[model_key]["excellent"] += 1
                    elif quality == "GOOD":
                        model_scores[model_key]["good"] += 1
                    elif quality == "ACCEPTABLE":
                        model_scores[model_key]["good"] += 1  # Count as good
                    else:
                        model_scores[model_key]["poor"] += 1

                    print(f" {model_key:20} | {quality:10} | {clue}")
                else:
                    model_scores[model_key]["failed"] += 1
                    test_result["results"][model_key] = {"clue": None, "quality": "FAILED", "score": 0.0}
                    print(f" {model_key:20} | FAILED | No response")

            all_results.append(test_result)

        except Exception as e:
            print(f"❌ Error in test {i}: {e}")

    # Calculate final scores and rankings
    print("\n" + "=" * 60)
    print("📊 FINAL MODEL COMPARISON RESULTS")
    print("=" * 60)

    model_rankings = []
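    # Note: the rates below are normalised by len(test_cases) rather than by the
    # number of clues a model actually returned, so failed responses lower a
    # model's success rate instead of being skipped.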
    for model_key, stats in model_scores.items():
        if stats["total"] > 0:
            success_rate = ((stats["excellent"] + stats["good"]) / len(test_cases)) * 100
            excellence_rate = (stats["excellent"] / len(test_cases)) * 100
            failure_rate = (stats["failed"] / len(test_cases)) * 100
        else:
            success_rate = excellence_rate = failure_rate = 0

        model_rankings.append({
            "model": model_key,
            "success_rate": success_rate,
            "excellence_rate": excellence_rate,
            "failure_rate": failure_rate,
            "stats": stats
        })

    # Sort by success rate, then by excellence rate
    model_rankings.sort(key=lambda x: (x["success_rate"], x["excellence_rate"]), reverse=True)
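
    # Table columns: E / G / P / F are per-model counts of clues rated
    # Excellent, Good (Acceptable is folded into Good above), Poor, and Failed.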
print(f"{'Rank':4} {'Model':25} {'Success%':8} {'Excel%':7} {'Fail%':6} {'E':2} {'G':2} {'P':2} {'F':2}")
print("-" * 75)
for i, ranking in enumerate(model_rankings, 1):
model = ranking["model"]
success = ranking["success_rate"]
excel = ranking["excellence_rate"]
fail = ranking["failure_rate"]
stats = ranking["stats"]
print(f"{i:4} {model:25} {success:7.1f} {excel:6.1f} {fail:5.1f} "
f"{stats['excellent']:2} {stats['good']:2} {stats['poor']:2} {stats['failed']:2}")

    # Show best results
    if model_rankings:
        best_model = model_rankings[0]
        print(f"\n🏆 BEST PERFORMING MODEL: {best_model['model']}")
        print(f" Success Rate: {best_model['success_rate']:.1f}%")
        print(f" Excellence Rate: {best_model['excellence_rate']:.1f}%")

        if best_model['success_rate'] >= 70:
            print("🎉 EXCELLENT! This model is ready for production use!")
        elif best_model['success_rate'] >= 50:
            print("🔄 Good results! This model shows promise for crossword generation")
        else:
            print("⚠️ Moderate results. May need prompt refinement or a different approach")

    # Show some example excellent clues
    print("\n🌟 BEST CLUE EXAMPLES:")
    print("-" * 40)
    excellent_examples = []
    for result in all_results:
        for model_key, res in result["results"].items():
            if res["quality"] == "EXCELLENT":
                excellent_examples.append((result["word"], result["topic"], res["clue"], model_key))

    for word, topic, clue, model in excellent_examples[:5]:  # Show top 5
        print(f" {word} + {topic}: \"{clue}\" ({model})")

    return model_rankings


def main():
    """Run the multiple model comparison test."""
    rankings = test_multiple_models()

    if rankings:
        print("\n💡 RECOMMENDATION:")
        best = rankings[0]
        print(f"Use '{best['model']}' as your primary clue generation model.")
        print(f"It achieved a {best['success_rate']:.1f}% success rate with {best['excellence_rate']:.1f}% excellent clues.")


if __name__ == "__main__":
    main()