File size: 7,625 Bytes
486eff6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
#!/usr/bin/env python3
"""
Test: Upgraded flan-t5-base Model for Crossword Clue Generation
Compare flan-t5-base performance against the previous flan-t5-small results.
"""

import sys
import logging
from pathlib import Path

# Put this script's own directory at the front of sys.path so that
# `llm_clue_generator` can be imported from it (presumably the module lives
# next to this file — verify) no matter where the script is launched from.
sys.path.insert(0, str(Path(__file__).parent))

# Import the generator defensively: when the import fails, report the error
# and record the outcome in GENERATOR_AVAILABLE instead of aborting here.
try:
    from llm_clue_generator import LLMClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Configure logging at INFO level; each record shows timestamp, logger name,
# level and message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


# Clue styles requested for every word, in order. No ranking is done later:
# the first style that yields a usable clue is the one that gets rated.
_CLUE_STYLES = ("definition", "description", "category", "function", "context")

# Phrases treated as known nonsense output of flan-t5-small; a clue that
# repeats one of them is rated POOR.
_KNOWN_NONSENSE = (
    "trick and treating", "gritting your teeth", "jack nixt",
    "fender", "tryon", "nicolas", "occurrence", "sludge",
)

# Keywords whose presence marks a clue as "definitional". They are matched as
# plain substrings, so inflected forms such as "sports" also count.
_DEFINITION_KEYWORDS = (
    "player", "sport", "instrument", "device", "system", "food",
    "language", "place", "animal", "creature", "location",
)

# rating -> (message printed, tally bucket, listed as a major improvement?)
_RATINGS = {
    "contains_word": ("❌ Quality: POOR (contains target word)", "poor", False),
    "nonsense": ("❌ Quality: POOR (nonsensical)", "poor", False),
    "definitional": ("✅ Quality: EXCELLENT (definitional)", "excellent", True),
    "descriptive": ("✅ Quality: GOOD (descriptive)", "good", True),
    "acceptable": ("🔄 Quality: ACCEPTABLE", "good", False),
}


def _pad_words(text: str) -> str:
    """Return *text* lower-cased with punctuation removed, as ``" w1 w2 "``.

    The surrounding spaces let callers test for whole words with a plain
    substring check (``" cat " in padded``).
    """
    cleaned = "".join(ch if ch.isalnum() else " " for ch in text.lower())
    return f" {' '.join(cleaned.split())} "


def _mentions(padded_text: str, phrase: str) -> bool:
    """Return True if *phrase*, or its plain -s/-es plural, occurs as whole words.

    *padded_text* must come from ``_pad_words``. Matching whole words avoids
    false hits on embedded substrings ("cat" in "location", "fender" in
    "defender").
    """
    needle = _pad_words(phrase).rstrip()
    if not needle:
        return False
    return any(f"{needle}{suffix} " in padded_text for suffix in ("", "s", "es"))


def _rate_clue(word: str, clue: str) -> str:
    """Classify *clue* for *word*; return a key of ``_RATINGS``.

    Checks run in priority order: leaking the answer, repeating known
    nonsense, containing a definitional keyword, being at least two words and
    eight characters long, and finally the catch-all "acceptable".
    """
    padded = _pad_words(clue)
    if _mentions(padded, word):
        return "contains_word"
    if any(_mentions(padded, bad) for bad in _KNOWN_NONSENSE):
        return "nonsense"

    clue_lower = clue.lower()
    if any(keyword in clue_lower for keyword in _DEFINITION_KEYWORDS):
        return "definitional"
    if len(clue.split()) >= 2 and len(clue) >= 8:
        return "descriptive"
    return "acceptable"


def _evaluate_word(generator, word: str, topic: str):
    """Generate clues for one word, print them, and rate the first usable one.

    Returns:
        ``(bucket, improvement)`` where *bucket* is the tally key to increment
        ("excellent", "good", "poor" or "failed") and *improvement* is a
        ``(word, topic, clue)`` tuple for clues rated definitional or
        descriptive, otherwise ``None``.

    Raises:
        Whatever ``generator.generate_clue`` raises; the caller handles it.
    """
    candidates = []
    for style in _CLUE_STYLES:
        clue = generator.generate_clue(
            word=word,
            topic=topic,
            clue_style=style,
            difficulty="medium",
        )
        # Discard empty or very short outputs.
        if clue and len(clue) > 5:
            candidates.append((style, clue))

    if not candidates:
        print("❌ No valid clues generated")
        return "failed", None

    print(f"Generated {len(candidates)} candidates:")
    for i, (style, clue) in enumerate(candidates, 1):
        print(f"  {i}. [{style}] {clue}")

    # Use the first valid clue as best.
    best_style, best_clue = candidates[0]
    print(f"\n🏆 Best clue [{best_style}]: {best_clue}")

    message, bucket, is_improvement = _RATINGS[_rate_clue(word, best_clue)]
    print(message)
    improvement = (word, topic, best_clue) if is_improvement else None
    return bucket, improvement


def _print_summary(tally, total_tests, major_improvements):
    """Print the counters, up to five improved clues, and the overall verdict."""
    excellent = tally["excellent"]
    successful = excellent + tally["good"]

    print("\n" + "=" * 60)
    print("📊 FLAN-T5-BASE RESULTS")
    print("=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Excellent clues: {excellent}")
    print(f"Good clues: {tally['good']}")
    print(f"Poor clues: {tally['poor']}")
    print(f"Failed clues: {tally['failed']}")
    print(f"Success rate: {(successful / total_tests) * 100:.1f}%")
    print(f"Excellence rate: {(excellent / total_tests) * 100:.1f}%")

    # Show major improvements
    if major_improvements:
        print("\n🎉 MAJOR IMPROVEMENTS OVER FLAN-T5-SMALL:")
        print("-" * 60)
        for word, topic, clue in major_improvements[:5]:  # Show top 5
            print(f"  {word} + {topic}: \"{clue}\"")

    # Evaluation compared to flan-t5-small (which had ~0% success)
    if excellent >= total_tests * 0.4:  # 40% excellent
        print("🎉 MAJOR SUCCESS! flan-t5-base produces excellent clues!")
        print("🚀 Ready for production use - significant upgrade from flan-t5-small")
    elif successful >= total_tests * 0.6:  # 60% good+excellent
        print("🔄 Good improvement! Much better than flan-t5-small")
        print("✅ Suitable for production with semantic fallback")
    elif successful >= total_tests * 0.3:  # 30% success
        print("⚠️  Some improvement over flan-t5-small, but still limited")
    else:
        print("❌ Still struggling - may need even larger model or external knowledge")


def test_flan_t5_base():
    """Run flan-t5-base on word/topic pairs that failed with flan-t5-small.

    For every pair a clue is requested in each style of ``_CLUE_STYLES``; the
    first usable clue is rated heuristically and tallied. Progress, the
    per-word rating and a final summary are printed to standard output.

    Returns:
        None. The function returns early, after printing the reason, when the
        generator could not be imported or fails to initialize.
    """
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - LLM generator not available")
        return

    print("🧪 Testing Upgraded flan-t5-base Model")
    print("=" * 60)

    # Initialize generator with base model
    print("🔄 Initializing flan-t5-base clue generator...")
    generator = LLMClueGenerator()

    try:
        generator.initialize()
        print(f"✅ Generator initialized successfully with {generator.model_name}")
        print("📊 Model size: ~1GB (vs ~80MB for flan-t5-small)")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that produced terrible results with flan-t5-small
    test_cases = [
        # Previous failures with flan-t5-small:
        # CAT + animals → "Tryon", "Trick and treating"
        # MEAL + food → "Jack nixt", "fender"
        # SONG + music → "Gritting your teeth"
        ("CAT", "animals"),
        ("KITTY", "animals"),
        ("MEAL", "food"),
        ("HUNGER", "food"),
        ("SONG", "music"),
        ("GUITAR", "music"),

        # Your specific problematic examples
        ("BATSMAN", "cricket"),
        ("SWIMMING", "sports"),
        ("AIRPORT", "transportation"),

        # Additional challenging cases
        ("DATABASE", "technology"),
        ("SCIENTIST", "science"),
        ("PIZZA", "food"),
        ("MOUNTAIN", "geography"),
    ]

    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations with flan-t5-base")
    print("=" * 60)

    tally = {"excellent": 0, "good": 0, "poor": 0, "failed": 0}

    # Track specific improvements over flan-t5-small
    major_improvements = []

    for word, topic in test_cases:
        print(f"\n📝 Testing: '{word}' + '{topic}'")
        print("-" * 40)

        # Best-effort per word: a failure is reported and counted, and the
        # run moves on to the next word.
        try:
            bucket, improvement = _evaluate_word(generator, word, topic)
        except Exception as e:
            print(f"❌ Error generating clue: {e}")
            bucket, improvement = "failed", None

        tally[bucket] += 1
        if improvement is not None:
            major_improvements.append(improvement)

    _print_summary(tally, len(test_cases), major_improvements)


def main():
    """Script entry point.

    Delegates to ``test_flan_t5_base``, which performs the whole comparison
    run and reports its findings on standard output.
    """
    test_flan_t5_base()


if __name__ == "__main__":
    main()