#!/usr/bin/env python3
"""
Test Multiple Models via API for Crossword Clue Generation
Compare various models and find the best performer.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from api_clue_generator import APIClueGenerator
    API_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    API_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_multiple_models():
    """Test multiple models via API and compare results."""
    if not API_AVAILABLE:
        print("❌ Cannot run test - API generator not available")
        return

    print("🧪 Testing Multiple Models via Hugging Face API")
    print("=" * 60)

    # Initialize API generator
    generator = APIClueGenerator()

    print(f"🎯 Testing {len(generator.models)} models:")
    for i, (key, model) in enumerate(generator.models.items(), 1):
        print(f"  {i}. {key} ({model})")

    # Test cases for comprehensive evaluation
    test_cases = [
        # Cases that failed with local models
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),
        # Additional challenging cases
        ("VIOLIN", "music"),
        ("PIZZA", "food"),
        ("SCIENTIST", "science"),
        ("MOUNTAIN", "geography"),
        ("ELEPHANT", "animals"),
    ]

    print(f"\n📊 Testing {len(test_cases)} word-topic combinations")
    print("=" * 60)

    # Track results for each model
    model_scores = {
        model_key: {"total": 0, "excellent": 0, "good": 0, "poor": 0, "failed": 0}
        for model_key in generator.models.keys()
    }
    all_results = []

    for i, (word, topic) in enumerate(test_cases, 1):
        print(f"\n📝 Test {i}/{len(test_cases)}: '{word}' + '{topic}'")
        print("-" * 50)

        try:
            # Generate clues with all models
            results = generator.generate_clue(word, topic)

            test_result = {"word": word, "topic": topic, "results": {}}

            # Evaluate each model's response
            for model_key, clue in results.items():
                if clue:
                    quality, score = generator.evaluate_clue_quality(word, clue)
                    test_result["results"][model_key] = {"clue": clue, "quality": quality, "score": score}

                    # Update model statistics
                    model_scores[model_key]["total"] += 1
                    if quality == "EXCELLENT":
                        model_scores[model_key]["excellent"] += 1
                    elif quality in ("GOOD", "ACCEPTABLE"):  # ACCEPTABLE counts as good
                        model_scores[model_key]["good"] += 1
                    else:
                        model_scores[model_key]["poor"] += 1

                    print(f"  {model_key:20} | {quality:10} | {clue}")
                else:
                    model_scores[model_key]["failed"] += 1
                    test_result["results"][model_key] = {"clue": None, "quality": "FAILED", "score": 0.0}
                    print(f"  {model_key:20} | {'FAILED':10} | No response")

            all_results.append(test_result)

        except Exception as e:
            print(f"❌ Error in test {i}: {e}")

    # Calculate final scores and rankings
    print("\n" + "=" * 60)
    print("📊 FINAL MODEL COMPARISON RESULTS")
    print("=" * 60)

    model_rankings = []
    for model_key, stats in model_scores.items():
        # Rates are relative to the full test set; len(test_cases) is never zero
        # here, so no guard is needed, and a model that failed every request
        # still reports an accurate failure rate instead of all-zero stats.
        success_rate = ((stats["excellent"] + stats["good"]) / len(test_cases)) * 100
        excellence_rate = (stats["excellent"] / len(test_cases)) * 100
        failure_rate = (stats["failed"] / len(test_cases)) * 100

        model_rankings.append({
            "model": model_key,
            "success_rate": success_rate,
            "excellence_rate": excellence_rate,
            "failure_rate": failure_rate,
            "stats": stats
        })

    # Sort by success rate, then by excellence rate (descending)
    model_rankings.sort(key=lambda x: (x["success_rate"], x["excellence_rate"]), reverse=True)
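    # Example (illustrative only; values are hypothetical): given
    #   {"model": "A", "success_rate": 80.0, "excellence_rate": 20.0, ...}
    #   {"model": "B", "success_rate": 80.0, "excellence_rate": 40.0, ...}
    # the tuple key breaks the success-rate tie on excellence_rate, so "B"
    # ranks ahead of "A" after the reverse (descending) sort.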
(x["success_rate"], x["excellence_rate"]), reverse=True) print(f"{'Rank':4} {'Model':25} {'Success%':8} {'Excel%':7} {'Fail%':6} {'E':2} {'G':2} {'P':2} {'F':2}") print("-" * 75) for i, ranking in enumerate(model_rankings, 1): model = ranking["model"] success = ranking["success_rate"] excel = ranking["excellence_rate"] fail = ranking["failure_rate"] stats = ranking["stats"] print(f"{i:4} {model:25} {success:7.1f} {excel:6.1f} {fail:5.1f} " f"{stats['excellent']:2} {stats['good']:2} {stats['poor']:2} {stats['failed']:2}") # Show best results if model_rankings: best_model = model_rankings[0] print(f"\n๐Ÿ† BEST PERFORMING MODEL: {best_model['model']}") print(f" Success Rate: {best_model['success_rate']:.1f}%") print(f" Excellence Rate: {best_model['excellence_rate']:.1f}%") if best_model['success_rate'] >= 70: print("๐ŸŽ‰ EXCELLENT! This model is ready for production use!") elif best_model['success_rate'] >= 50: print("๐Ÿ”„ Good results! This model shows promise for crossword generation") else: print("โš ๏ธ Moderate results. May need prompt refinement or different approach") # Show some example excellent clues print(f"\n๐ŸŒŸ BEST CLUE EXAMPLES:") print("-" * 40) excellent_examples = [] for result in all_results: for model_key, res in result["results"].items(): if res["quality"] == "EXCELLENT": excellent_examples.append((result["word"], result["topic"], res["clue"], model_key)) for word, topic, clue, model in excellent_examples[:5]: # Show top 5 print(f" {word} + {topic}: \"{clue}\" ({model})") return model_rankings def main(): """Run the multiple model comparison test.""" rankings = test_multiple_models() if rankings: print(f"\n๐Ÿ’ก RECOMMENDATION:") best = rankings[0] print(f"Use '{best['model']}' as your primary clue generation model.") print(f"It achieved {best['success_rate']:.1f}% success rate with {best['excellence_rate']:.1f}% excellent clues.") if __name__ == "__main__": main()