#!/usr/bin/env python3
"""
Quick Test: Semantic Template Clue Generation
Tests the semantic template approach against the same problematic examples that failed with the LLM-based generator.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from semantic_clue_generator import SemanticClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
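# logger.exception() in the test loop below uses this logger to record the
# full traceback whenever clue generation fails for a test case.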


def test_semantic_clues():
    """Test semantic template clue generation with problematic examples."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - Semantic generator not available")
        return
    
    print("πŸ§ͺ Testing Semantic Template Clue Generation")
    print("=" * 60)
    
    # Initialize generator
    print("πŸ”„ Initializing semantic clue generator...")
    generator = SemanticClueGenerator()
    
    try:
        generator.initialize()
        print("βœ… Generator initialized successfully")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return
    
    # Test cases that failed with the LLM - the same examples the user reported as junk
    test_cases = [
        # Previously bad LLM examples
        ("CAT", "animals"),
        ("KITTY", "animals"), 
        ("MEAL", "food"),
        ("HUNGER", "food"),
        ("TECH", "technology"),
        ("SCIENTIST", "science"),
        
        # Additional test cases
        ("DOG", "animals"),
        ("PYTHON", "technology"),
        ("GUITAR", "music"),
        ("OCEAN", "geography"),
        ("ATOM", "science"),
        ("PIZZA", "food"),
    ]
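    # Each entry is a (word, topic) pair passed to generator.generate_clue()
    # in the loop below.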
    
    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations")
    print("=" * 60)
    
    successful_clues = 0
    total_tests = len(test_cases)
    
    for word, topic in test_cases:
        print(f"\nπŸ“ Testing: '{word}' + '{topic}'")
        print("-" * 40)
        
        try:
            # Generate multiple clues with different styles for variety
            styles = ["category", "definition", "description"]
            candidates = []
            
            for style in styles:
                clue = generator.generate_clue(
                    word=word,
                    topic=topic,
                    clue_style=style
                )
                if clue and clue not in candidates:
                    candidates.append(clue)
            
            print(f"Generated {len(candidates)} candidates:")
            for i, candidate in enumerate(candidates, 1):
                print(f"  {i}. {candidate}")
            
            # Use the first candidate as the clue to evaluate
            best_clue = candidates[0] if candidates else None
            
            print(f"\nπŸ† Best clue: {best_clue}")
            
            # Quality evaluation - more comprehensive than LLM test
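            # Criteria: the clue is non-empty, longer than 3 characters, does not
            # leak the answer word itself, and contains none of the junk phrases
            # seen in earlier LLM output.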
            if (best_clue and 
                len(best_clue) > 3 and 
                word.lower() not in best_clue.lower() and
                not any(junk in best_clue.lower() for junk in ['trick and treating', 'gritting your teeth', 'fender', 'occurrence'])):
                successful_clues += 1
                print("βœ… Quality: GOOD")
            else:
                print("❌ Quality: POOR")
                
        except Exception as e:
            print(f"❌ Error generating clue: {e}")
            logger.exception("Detailed error:")
    
    print(f"\n" + "=" * 60)
    print(f"πŸ“Š SEMANTIC TEMPLATE RESULTS")
    print(f"=" * 60)
    print(f"Total tests: {total_tests}")
    print(f"Successful clues: {successful_clues}")
    print(f"Success rate: {(successful_clues/total_tests)*100:.1f}%")
    
    # Compare with LLM performance (which was ~0% success)
    if successful_clues >= total_tests * 0.8:  # 80% success rate
        print("πŸŽ‰ Semantic templates show MAJOR improvement over LLM!")
    elif successful_clues >= total_tests * 0.6:  # 60% success rate  
        print("πŸ”„ Good improvement, semantic approach is viable")
    elif successful_clues >= total_tests * 0.3:  # 30% success rate
        print("⚠️  Some improvement, but templates need refinement")
    else:
        print("❌ Semantic approach also struggling, may need hybrid method")


def main():
    """Run the semantic template test."""
    test_semantic_clues()


if __name__ == "__main__":
    main()