"""
Test Multiple Models via API for Crossword Clue Generation

Compare various models and find the best performer.
"""

import sys
import logging
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))

try:
    from api_clue_generator import APIClueGenerator
    API_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    API_AVAILABLE = False
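
# Interface assumed for APIClueGenerator, inferred from how it is used below
# (check api_clue_generator.py for the authoritative definitions):
#   .models                            -> dict mapping a short model key to a model identifier
#   .generate_clue(word, topic)        -> dict mapping each model key to a clue string,
#                                         or None when that model produced nothing
#   .evaluate_clue_quality(word, clue) -> (quality_label, score), where quality_label is
#                                         "EXCELLENT", "GOOD", "ACCEPTABLE", or a lower grade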

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_multiple_models():
    """Test multiple models via API and compare results."""
    if not API_AVAILABLE:
        print("❌ Cannot run test - API generator not available")
        return

    print("🧪 Testing Multiple Models via Hugging Face API")
    print("=" * 60)

    generator = APIClueGenerator()

    print(f"🎯 Testing {len(generator.models)} models:")
    for i, (key, model) in enumerate(generator.models.items(), 1):
        print(f"  {i}. {key} ({model})")

    test_cases = [
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),

        ("VIOLIN", "music"),
        ("PIZZA", "food"),
        ("SCIENTIST", "science"),
        ("MOUNTAIN", "geography"),
        ("ELEPHANT", "animals"),
    ]

    print(f"\n📝 Testing {len(test_cases)} word-topic combinations")
    print("=" * 60)

    model_scores = {
        model_key: {"total": 0, "excellent": 0, "good": 0, "poor": 0, "failed": 0}
        for model_key in generator.models.keys()
    }
    all_results = []
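    # model_scores tallies per-model quality buckets; all_results keeps every clue
    # so the best examples can be printed at the end of the run.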

    for i, (word, topic) in enumerate(test_cases, 1):
        print(f"\n🔍 Test {i}/{len(test_cases)}: '{word}' + '{topic}'")
        print("-" * 50)

        try:
            results = generator.generate_clue(word, topic)
            test_result = {"word": word, "topic": topic, "results": {}}

            for model_key, clue in results.items():
                if clue:
                    quality, score = generator.evaluate_clue_quality(word, clue)
                    test_result["results"][model_key] = {"clue": clue, "quality": quality, "score": score}

                    model_scores[model_key]["total"] += 1
                    if quality == "EXCELLENT":
                        model_scores[model_key]["excellent"] += 1
                    elif quality in ("GOOD", "ACCEPTABLE"):
                        # ACCEPTABLE counts toward the "good" bucket in the summary.
                        model_scores[model_key]["good"] += 1
                    else:
                        model_scores[model_key]["poor"] += 1

                    print(f"  {model_key:20} | {quality:10} | {clue}")
                else:
                    model_scores[model_key]["failed"] += 1
                    test_result["results"][model_key] = {"clue": None, "quality": "FAILED", "score": 0.0}
                    print(f"  {model_key:20} | {'FAILED':10} | No response")

            all_results.append(test_result)

        except Exception as e:
            print(f"❌ Error in test {i}: {e}")

    print("\n" + "=" * 60)
    print("📊 FINAL MODEL COMPARISON RESULTS")
    print("=" * 60)

    model_rankings = []
    for model_key, stats in model_scores.items():
        # Rates are measured against the full set of test cases, so a model that
        # never returned a clue still shows a meaningful failure rate.
        success_rate = ((stats["excellent"] + stats["good"]) / len(test_cases)) * 100
        excellence_rate = (stats["excellent"] / len(test_cases)) * 100
        failure_rate = (stats["failed"] / len(test_cases)) * 100

        model_rankings.append({
            "model": model_key,
            "success_rate": success_rate,
            "excellence_rate": excellence_rate,
            "failure_rate": failure_rate,
            "stats": stats
        })
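
    # Rank by success rate first, then use excellence rate to break ties.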
    model_rankings.sort(key=lambda x: (x["success_rate"], x["excellence_rate"]), reverse=True)

    print(f"{'Rank':4} {'Model':25} {'Success%':8} {'Excel%':7} {'Fail%':6} {'E':2} {'G':2} {'P':2} {'F':2}")
    print("-" * 75)

    for i, ranking in enumerate(model_rankings, 1):
        model = ranking["model"]
        success = ranking["success_rate"]
        excel = ranking["excellence_rate"]
        fail = ranking["failure_rate"]
        stats = ranking["stats"]

        print(f"{i:4} {model:25} {success:7.1f} {excel:6.1f} {fail:5.1f} "
              f"{stats['excellent']:2} {stats['good']:2} {stats['poor']:2} {stats['failed']:2}")

    if model_rankings:
        best_model = model_rankings[0]
        print(f"\n🏆 BEST PERFORMING MODEL: {best_model['model']}")
        print(f"   Success Rate: {best_model['success_rate']:.1f}%")
        print(f"   Excellence Rate: {best_model['excellence_rate']:.1f}%")

        if best_model['success_rate'] >= 70:
            print("🎉 EXCELLENT! This model is ready for production use!")
        elif best_model['success_rate'] >= 50:
            print("👍 Good results! This model shows promise for crossword generation")
        else:
            print("⚠️ Moderate results. May need prompt refinement or different approach")

    print("\n📋 BEST CLUE EXAMPLES:")
    print("-" * 40)
    excellent_examples = []
    for result in all_results:
        for model_key, res in result["results"].items():
            if res["quality"] == "EXCELLENT":
                excellent_examples.append((result["word"], result["topic"], res["clue"], model_key))

    for word, topic, clue, model in excellent_examples[:5]:
        print(f"  {word} + {topic}: \"{clue}\" ({model})")

    return model_rankings


def main():
    """Run the multiple model comparison test."""
    rankings = test_multiple_models()

    if rankings:
        print("\n💡 RECOMMENDATION:")
        best = rankings[0]
        print(f"Use '{best['model']}' as your primary clue generation model.")
        print(f"It achieved a {best['success_rate']:.1f}% success rate with {best['excellence_rate']:.1f}% excellent clues.")


if __name__ == "__main__":
    main()