#!/usr/bin/env python3
"""
Test Multiple Models via API for Crossword Clue Generation
Compare various models and find the best performer.
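Requires the api_clue_generator module (APIClueGenerator) to be importable from the same directory.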
"""
import sys
import logging
from pathlib import Path
# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
try:
    from api_clue_generator import APIClueGenerator
    API_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    API_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
def test_multiple_models():
    """Test multiple models via API and compare results."""
    if not API_AVAILABLE:
        print("❌ Cannot run test - API generator not available")
        return

    print("🧪 Testing Multiple Models via Hugging Face API")
    print("=" * 60)

    # Initialize API generator
    generator = APIClueGenerator()

    print(f"🎯 Testing {len(generator.models)} models:")
    for i, (key, model) in enumerate(generator.models.items(), 1):
        print(f" {i}. {key} ({model})")

    # Test cases for comprehensive evaluation
    test_cases = [
        # Cases that failed with local models
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),
        # Additional challenging cases
        ("VIOLIN", "music"),
        ("PIZZA", "food"),
        ("SCIENTIST", "science"),
        ("MOUNTAIN", "geography"),
        ("ELEPHANT", "animals"),
    ]

    print(f"\n📊 Testing {len(test_cases)} word-topic combinations")
    print("=" * 60)

    # Track results for each model
    model_scores = {model_key: {"total": 0, "excellent": 0, "good": 0, "poor": 0, "failed": 0}
                    for model_key in generator.models.keys()}
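    # Note: "total" only counts calls that returned a clue; failures are
    # tracked separately under "failed" and never increment "total".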
    all_results = []

    for i, (word, topic) in enumerate(test_cases, 1):
        print(f"\n🔍 Test {i}/{len(test_cases)}: '{word}' + '{topic}'")
        print("-" * 50)

        try:
            # Generate clues with all models
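            # Assumption: generate_clue() returns a dict mapping each configured
            # model key to the generated clue string, or None when the call fails.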
            results = generator.generate_clue(word, topic)
            test_result = {"word": word, "topic": topic, "results": {}}

            # Evaluate each model's response
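            # Assumption: evaluate_clue_quality() returns a (quality_label, score)
            # pair; the labels checked below are "EXCELLENT", "GOOD", "ACCEPTABLE", etc.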
            for model_key, clue in results.items():
                if clue:
                    quality, score = generator.evaluate_clue_quality(word, clue)
                    test_result["results"][model_key] = {"clue": clue, "quality": quality, "score": score}

                    # Update model statistics
                    model_scores[model_key]["total"] += 1
                    if quality == "EXCELLENT":
                        model_scores[model_key]["excellent"] += 1
                    elif quality == "GOOD":
                        model_scores[model_key]["good"] += 1
                    elif quality == "ACCEPTABLE":
                        model_scores[model_key]["good"] += 1  # Count as good
                    else:
                        model_scores[model_key]["poor"] += 1

                    print(f" {model_key:20} | {quality:10} | {clue}")
                else:
                    model_scores[model_key]["failed"] += 1
                    test_result["results"][model_key] = {"clue": None, "quality": "FAILED", "score": 0.0}
                    print(f" {model_key:20} | FAILED | No response")

            all_results.append(test_result)

        except Exception as e:
            print(f"❌ Error in test {i}: {e}")

    # Calculate final scores and rankings
    print("\n" + "=" * 60)
    print("📊 FINAL MODEL COMPARISON RESULTS")
    print("=" * 60)

    model_rankings = []
    for model_key, stats in model_scores.items():
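        # Rates use len(test_cases) as the denominator, so failed calls count
        # against a model's success rate rather than being excluded.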
        if stats["total"] > 0:
            success_rate = ((stats["excellent"] + stats["good"]) / len(test_cases)) * 100
            excellence_rate = (stats["excellent"] / len(test_cases)) * 100
            failure_rate = (stats["failed"] / len(test_cases)) * 100
        else:
            success_rate = excellence_rate = failure_rate = 0

        model_rankings.append({
            "model": model_key,
            "success_rate": success_rate,
            "excellence_rate": excellence_rate,
            "failure_rate": failure_rate,
            "stats": stats
        })

    # Sort by success rate, then by excellence rate
    model_rankings.sort(key=lambda x: (x["success_rate"], x["excellence_rate"]), reverse=True)

    print(f"{'Rank':4} {'Model':25} {'Success%':8} {'Excel%':7} {'Fail%':6} {'E':2} {'G':2} {'P':2} {'F':2}")
    print("-" * 75)

    for i, ranking in enumerate(model_rankings, 1):
        model = ranking["model"]
        success = ranking["success_rate"]
        excel = ranking["excellence_rate"]
        fail = ranking["failure_rate"]
        stats = ranking["stats"]
        print(f"{i:4} {model:25} {success:7.1f} {excel:6.1f} {fail:5.1f} "
              f"{stats['excellent']:2} {stats['good']:2} {stats['poor']:2} {stats['failed']:2}")

    # Show best results
    if model_rankings:
        best_model = model_rankings[0]
        print(f"\n🏆 BEST PERFORMING MODEL: {best_model['model']}")
        print(f" Success Rate: {best_model['success_rate']:.1f}%")
        print(f" Excellence Rate: {best_model['excellence_rate']:.1f}%")

        if best_model['success_rate'] >= 70:
            print("🎉 EXCELLENT! This model is ready for production use!")
        elif best_model['success_rate'] >= 50:
            print("👍 Good results! This model shows promise for crossword generation.")
        else:
            print("⚠️ Moderate results. May need prompt refinement or a different approach.")

    # Show some example excellent clues
    print("\n📝 BEST CLUE EXAMPLES:")
    print("-" * 40)
    excellent_examples = []
    for result in all_results:
        for model_key, res in result["results"].items():
            if res["quality"] == "EXCELLENT":
                excellent_examples.append((result["word"], result["topic"], res["clue"], model_key))

    for word, topic, clue, model in excellent_examples[:5]:  # Show top 5
        print(f" {word} + {topic}: \"{clue}\" ({model})")

    return model_rankings
def main():
    """Run the multiple model comparison test."""
    rankings = test_multiple_models()

    if rankings:
        print("\n💡 RECOMMENDATION:")
        best = rankings[0]
        print(f"Use '{best['model']}' as your primary clue generation model.")
        print(f"It achieved a {best['success_rate']:.1f}% success rate with {best['excellence_rate']:.1f}% excellent clues.")


if __name__ == "__main__":
    main()