#!/usr/bin/env python3
"""
Test Multiple Models via API for Crossword Clue Generation
Compare various models and find the best performer.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from api_clue_generator import APIClueGenerator
    API_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    API_AVAILABLE = False
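
# Assumed interface of APIClueGenerator, inferred from how it is used below
# (not documented in this script):
#   generator.models                            -> dict of {model_key: model_id}
#   generator.generate_clue(word, topic)        -> dict of {model_key: clue string or None}
#   generator.evaluate_clue_quality(word, clue) -> (quality_label, score), where quality_label
#                                                  is a string such as "EXCELLENT", "GOOD", or
#                                                  "ACCEPTABLE", and score is a float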

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_multiple_models():
    """Test multiple models via API and compare results."""
    if not API_AVAILABLE:
        print("❌ Cannot run test - API generator not available")
        return
    
    print("πŸ§ͺ Testing Multiple Models via Hugging Face API")
    print("=" * 60)
    
    # Initialize API generator
    generator = APIClueGenerator()
    
    print(f"🎯 Testing {len(generator.models)} models:")
    for i, (key, model) in enumerate(generator.models.items(), 1):
        print(f"  {i}. {key} ({model})")
    
    # Test cases for comprehensive evaluation
    test_cases = [
        # Cases that failed with local models
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"), 
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),
        
        # Additional challenging cases
        ("VIOLIN", "music"),
        ("PIZZA", "food"),
        ("SCIENTIST", "science"),
        ("MOUNTAIN", "geography"),
        ("ELEPHANT", "animals"),
    ]
    
    print(f"\nπŸ“Š Testing {len(test_cases)} word-topic combinations")
    print("=" * 60)
    
    # Track results for each model
    model_scores = {model_key: {"total": 0, "excellent": 0, "good": 0, "poor": 0, "failed": 0} 
                   for model_key in generator.models.keys()}
    all_results = []
    
    for i, (word, topic) in enumerate(test_cases, 1):
        print(f"\nπŸ“ Test {i}/{len(test_cases)}: '{word}' + '{topic}'")
        print("-" * 50)
        
        try:
            # Generate clues with all models
            results = generator.generate_clue(word, topic)
            test_result = {"word": word, "topic": topic, "results": {}}
            
            # Evaluate each model's response
            for model_key, clue in results.items():
                if clue:
                    quality, score = generator.evaluate_clue_quality(word, clue)
                    test_result["results"][model_key] = {"clue": clue, "quality": quality, "score": score}
                    
                    # Update model statistics
                    model_scores[model_key]["total"] += 1
                    if quality == "EXCELLENT":
                        model_scores[model_key]["excellent"] += 1
                    elif quality == "GOOD":
                        model_scores[model_key]["good"] += 1
                    elif quality == "ACCEPTABLE":
                        model_scores[model_key]["good"] += 1  # Count as good
                    else:
                        model_scores[model_key]["poor"] += 1
                        
                    print(f"  {model_key:20} | {quality:10} | {clue}")
                else:
                    model_scores[model_key]["failed"] += 1
                    test_result["results"][model_key] = {"clue": None, "quality": "FAILED", "score": 0.0}
                    print(f"  {model_key:20} | FAILED     | No response")
            
            all_results.append(test_result)
            
        except Exception as e:
            print(f"❌ Error in test {i}: {e}")
    
    # Calculate final scores and rankings
    print(f"\n" + "=" * 60)
    print("πŸ“Š FINAL MODEL COMPARISON RESULTS")
    print("=" * 60)
    
    model_rankings = []
    for model_key, stats in model_scores.items():
        # Rates are relative to the full test set, so a model that failed every
        # request reports a 100% failure rate instead of all zeros.
        success_rate = ((stats["excellent"] + stats["good"]) / len(test_cases)) * 100
        excellence_rate = (stats["excellent"] / len(test_cases)) * 100
        failure_rate = (stats["failed"] / len(test_cases)) * 100
        
        model_rankings.append({
            "model": model_key,
            "success_rate": success_rate,
            "excellence_rate": excellence_rate,
            "failure_rate": failure_rate,
            "stats": stats
        })
    
    # Sort by success rate, then by excellence rate
    model_rankings.sort(key=lambda x: (x["success_rate"], x["excellence_rate"]), reverse=True)
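    # Table legend: E / G / P / F are per-model counts of excellent, good, poor,
    # and failed clues (ACCEPTABLE clues are folded into the G column above).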
    
    print(f"{'Rank':4} {'Model':25} {'Success%':8} {'Excel%':7} {'Fail%':6} {'E':2} {'G':2} {'P':2} {'F':2}")
    print("-" * 75)
    
    for i, ranking in enumerate(model_rankings, 1):
        model = ranking["model"]
        success = ranking["success_rate"]
        excel = ranking["excellence_rate"]
        fail = ranking["failure_rate"]
        stats = ranking["stats"]
        
        print(f"{i:4} {model:25} {success:7.1f} {excel:6.1f} {fail:5.1f} "
              f"{stats['excellent']:2} {stats['good']:2} {stats['poor']:2} {stats['failed']:2}")
    
    # Show best results
    if model_rankings:
        best_model = model_rankings[0]
        print(f"\nπŸ† BEST PERFORMING MODEL: {best_model['model']}")
        print(f"   Success Rate: {best_model['success_rate']:.1f}%")
        print(f"   Excellence Rate: {best_model['excellence_rate']:.1f}%")
        
        if best_model['success_rate'] >= 70:
            print("πŸŽ‰ EXCELLENT! This model is ready for production use!")
        elif best_model['success_rate'] >= 50:
            print("πŸ”„ Good results! This model shows promise for crossword generation")
        else:
            print("⚠️  Moderate results. May need prompt refinement or different approach")
    
    # Show some example excellent clues
    print(f"\n🌟 BEST CLUE EXAMPLES:")
    print("-" * 40)
    excellent_examples = []
    for result in all_results:
        for model_key, res in result["results"].items():
            if res["quality"] == "EXCELLENT":
                excellent_examples.append((result["word"], result["topic"], res["clue"], model_key))
    
    for word, topic, clue, model in excellent_examples[:5]:  # Show top 5
        print(f"  {word} + {topic}: \"{clue}\" ({model})")
    
    return model_rankings


def main():
    """Run the multiple model comparison test."""
    rankings = test_multiple_models()
    
    if rankings:
        print(f"\nπŸ’‘ RECOMMENDATION:")
        best = rankings[0]
        print(f"Use '{best['model']}' as your primary clue generation model.")
        print(f"It achieved {best['success_rate']:.1f}% success rate with {best['excellence_rate']:.1f}% excellent clues.")


if __name__ == "__main__":
    main()