File size: 7,381 Bytes
486eff6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env python3
"""
Test: flan-t5-large Model for Superior Crossword Clue Generation
Test the most capable model to eliminate generic responses and achieve excellence.
"""

import sys
import logging
from pathlib import Path

# Put this script's own directory at the front of sys.path so that
# `llm_clue_generator` can be imported from it regardless of the current
# working directory.
sys.path.insert(0, str(Path(__file__).parent))

# Import the generator defensively: a missing module is reported and recorded
# in GENERATOR_AVAILABLE instead of aborting at import time.
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    # The status marker is the "cross mark" emoji (it had been stored as
    # mis-decoded UTF-8 bytes).
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Configure the root logger at import time: INFO level, each record prefixed
# with timestamp, logger name and level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-level logger named after this module.
# NOTE(review): not referenced in the visible part of this file.
logger = logging.getLogger(__name__)


# Phrases that indicate the model echoed its prompt/instructions back.
_GENERIC_MARKERS = (
    "make it moderately challenging", "make it challenging",
    "make it difficult", "make it easier", "moderately challenging",
    "difficult", "easy",
)

# Known garbage outputs that should never be accepted as a clue.
_NONSENSE_MARKERS = (
    "a) a) a)", "trick and treating", "gritting your teeth",
    "jack nixt", "fender", "tryon",
)

# Category nouns whose presence suggests the clue is a real definition.
_DEFINITION_WORDS = (
    "player", "instrument", "device", "system", "place", "location",
    "animal", "creature", "building", "process", "method", "concept",
    "sport", "activity", "food", "dish", "language", "tool",
)


def _mentions(text: str, phrase: str, allow_plural: bool = False) -> bool:
    """Return True when *phrase* occurs in *text* as a whole word or phrase.

    A plain ``phrase in text`` test also fires inside longer, unrelated words
    ("cat" in "domesticated", "easy" in "greasy", "fender" in "defender").
    Here the match must not be directly preceded or followed by a word
    character. Lookarounds are used instead of ``\\b`` so that phrases ending
    in punctuation (such as "a) a) a)") still match at the end of the text.

    Args:
        text: Lower-cased text to search.
        phrase: Lower-cased word or phrase to look for.
        allow_plural: Also accept the phrase followed by "s" or "es".
    """
    # Local import: the file-level import block lies outside this block.
    import re

    suffix = r"(?:s|es)?" if allow_plural else ""
    pattern = r"(?<!\w)" + re.escape(phrase) + suffix + r"(?!\w)"
    return re.search(pattern, text) is not None


def _classify_clue(word: str, clue: str) -> tuple:
    """Score one generated clue.

    Checks run in priority order: target word leaked into the clue, known
    nonsense output, echoed prompt text, then the positive indicators.

    Args:
        word: The crossword answer the clue was generated for.
        clue: The generated clue text.

    Returns:
        A ``(bucket, message)`` pair. ``bucket`` is one of ``"excellent"``,
        ``"good"``, ``"generic"`` or ``"poor"``; ``message`` is the line to
        print for this verdict.
    """
    word_lower = word.lower()
    clue_lower = clue.lower()

    if _mentions(clue_lower, word_lower, allow_plural=True):
        return "poor", "❌ Quality: POOR (contains target word)"
    if any(_mentions(clue_lower, marker) for marker in _NONSENSE_MARKERS):
        return "poor", "❌ Quality: POOR (nonsensical)"
    if any(_mentions(clue_lower, marker) for marker in _GENERIC_MARKERS):
        return "generic", "⚠️  Quality: GENERIC (template response)"

    # Substring match is kept for the positive indicators so that inflected
    # forms ("players", "instruments") still count.
    has_definition = any(noun in clue_lower for noun in _DEFINITION_WORDS)
    # The negative checks above have already returned, so "descriptive" only
    # needs the length criteria here.
    is_descriptive = len(clue.split()) >= 3 and len(clue) >= 10

    if has_definition and is_descriptive:
        return "excellent", "✅ Quality: EXCELLENT (definitional & descriptive)"
    if is_descriptive:
        return "good", "✅ Quality: GOOD (descriptive)"
    if has_definition:
        # Short but definitional clues are counted in the "good" bucket.
        return "good", "🔄 Quality: ACCEPTABLE (basic definition)"
    return "generic", "⚠️  Quality: GENERIC (basic)"


def test_flan_t5_large() -> None:
    """Generate clues for a fixed word list and print a quality report.

    Initializes an ``LLMClueGenerator``, requests one "definition"-style,
    "medium"-difficulty clue per (word, topic) pair, scores each clue with
    heuristic text checks, and prints per-clue verdicts followed by summary
    rates and an overall assessment. Nothing is returned or asserted; the
    function returns early (after printing a message) when the generator
    module could not be imported or fails to initialize.
    """
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing flan-t5-large Model (No Fallbacks)")
    print("=" * 60)

    # Initialize generator with large model
    print("🔄 Initializing flan-t5-large clue generator...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~3GB (3x larger than base, 37x larger than small)")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        print("💡 Note: flan-t5-large requires ~3GB RAM and longer initialization time")
        return

    # Challenging test cases that should be handled well by a large model
    test_cases = [
        # Basic cases that failed with smaller models
        ("CAT", "animals"),
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),
        ("DATABASE", "technology"),

        # More challenging cases requiring world knowledge
        ("VIOLIN", "music"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
        ("HELICOPTER", "transportation"),
        ("DEMOCRACY", "politics"),
        ("PHOTOSYNTHESIS", "science"),

        # Abstract concepts
        ("HAPPINESS", "emotions"),
        ("ALGORITHM", "technology"),
        ("METAPHOR", "literature"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} challenging word-topic combinations")
    print("=" * 60)

    counts = {"excellent": 0, "good": 0, "generic": 0, "poor": 0}

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        # Best effort per case: any failure while generating or scoring a clue
        # is reported and counted as "poor" so the remaining cases still run.
        try:
            best_clue = generator.generate_clue(
                word=word,
                topic=topic,
                clue_style="definition",  # Usually produces the best results
                difficulty="medium",
            )
            if best_clue and len(best_clue) > 3:
                print(f"🏆 Generated clue: {best_clue}")
                bucket, verdict = _classify_clue(word, best_clue)
            else:
                bucket, verdict = "poor", "❌ No valid clue generated"
        except Exception as e:
            bucket, verdict = "poor", f"❌ Error generating clue: {e}"

        print(verdict)
        counts[bucket] += 1

    excellent_clues = counts["excellent"]
    good_clues = counts["good"]
    generic_clues = counts["generic"]
    poor_clues = counts["poor"]
    total_tests = len(test_cases)

    print("\n" + "=" * 60)
    print("📊 FLAN-T5-LARGE RESULTS (NO FALLBACKS)")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent_clues}")
    print(f"Good clues: {good_clues}")
    print(f"Generic clues: {generic_clues}")
    print(f"Poor clues: {poor_clues}")
    print(f"Success rate: {((excellent_clues + good_clues)/total_tests)*100:.1f}%")
    print(f"Excellence rate: {(excellent_clues/total_tests)*100:.1f}%")
    print(f"Generic rate: {(generic_clues/total_tests)*100:.1f}%")

    # Final evaluation - high standards for large model
    if excellent_clues >= total_tests * 0.6:  # 60% excellent
        print("🎉 SUCCESS! flan-t5-large produces excellent crossword clues!")
        print("🚀 Ready for production - no fallbacks needed!")
    elif excellent_clues >= total_tests * 0.4 and generic_clues <= total_tests * 0.2:  # 40% excellent, <20% generic
        print("🔄 Very good! flan-t5-large is suitable for production")
        print("✅ Significant improvement over smaller models")
    elif (excellent_clues + good_clues) >= total_tests * 0.7:  # 70% good+excellent
        print("⚠️  Good results, but some generic responses remain")
        print("💡 Consider prompt engineering refinements")
    else:
        print("❌ Still not meeting quality standards")
        print("💡 May need flan-t5-xl (~11GB) or different approach")


def main():
    """Script entry point: execute the flan-t5-large clue quality check once."""
    test_flan_t5_large()


# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()