#!/usr/bin/env python3
"""
Quick Test: Semantic Template Clue Generation
Test the semantic template approach against the same problematic examples that failed with LLM.
"""
import sys
import logging
from pathlib import Path

# Add the hack directory to the import path
sys.path.insert(0, str(Path(__file__).parent))

try:
    from semantic_clue_generator import SemanticClueGenerator
    GENERATOR_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    GENERATOR_AVAILABLE = False
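
# NOTE: the commented sketch below documents the SemanticClueGenerator
# interface this test assumes, inferred only from the calls made in this
# file; it is not the actual module definition and is kept as comments so
# it stays a no-op:
#
#     class SemanticClueGenerator:
#         def initialize(self) -> None:
#             """Load templates/embeddings; may raise on failure."""
#
#         def generate_clue(self, word: str, topic: str,
#                           clue_style: str = "category") -> str | None:
#             """Return a single clue string, or None if no clue fits."""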
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_semantic_clues():
    """Test semantic-template clue generation on the previously problematic examples."""
    if not GENERATOR_AVAILABLE:
        print("❌ Cannot run test - semantic generator not available")
        return

    print("πŸ§ͺ Testing Semantic Template Clue Generation")
    print("=" * 60)

    # Initialize the generator (may load models/templates, so failures are caught)
    print("πŸ”„ Initializing semantic clue generator...")
    generator = SemanticClueGenerator()
    try:
        generator.initialize()
        print("βœ… Generator initialized successfully")
    except Exception as e:
        print(f"❌ Failed to initialize generator: {e}")
        return

    # Test cases that failed with the LLM - the same examples the user reported as junk
    test_cases = [
        # Previously bad LLM examples
        ("CAT", "animals"),
        ("KITTY", "animals"),
        ("MEAL", "food"),
        ("HUNGER", "food"),
        ("TECH", "technology"),
        ("SCIENTIST", "science"),
        # Additional test cases
        ("DOG", "animals"),
        ("PYTHON", "technology"),
        ("GUITAR", "music"),
        ("OCEAN", "geography"),
        ("ATOM", "science"),
        ("PIZZA", "food"),
    ]
print(f"\n🎯 Testing {len(test_cases)} word-topic combinations")
print("=" * 60)
successful_clues = 0
total_tests = len(test_cases)
for word, topic in test_cases:
print(f"\nπŸ“ Testing: '{word}' + '{topic}'")
print("-" * 40)
try:
# Generate multiple clues with different styles for variety
styles = ["category", "definition", "description"]
candidates = []
for style in styles:
clue = generator.generate_clue(
word=word,
topic=topic,
clue_style=style
)
if clue and clue not in candidates:
candidates.append(clue)
print(f"Generated {len(candidates)} candidates:")
for i, candidate in enumerate(candidates, 1):
print(f" {i}. {candidate}")
# Use the first/best clue
best_clue = candidates[0] if candidates else None
print(f"\nπŸ† Best clue: {best_clue}")
# Quality evaluation - more comprehensive than LLM test
if (best_clue and
len(best_clue) > 3 and
word.lower() not in best_clue.lower() and
not any(junk in best_clue.lower() for junk in ['trick and treating', 'gritting your teeth', 'fender', 'occurrence'])):
successful_clues += 1
print("βœ… Quality: GOOD")
else:
print("❌ Quality: POOR")
except Exception as e:
print(f"❌ Error generating clue: {e}")
logger.exception("Detailed error:")
print(f"\n" + "=" * 60)
print(f"πŸ“Š SEMANTIC TEMPLATE RESULTS")
print(f"=" * 60)
print(f"Total tests: {total_tests}")
print(f"Successful clues: {successful_clues}")
print(f"Success rate: {(successful_clues/total_tests)*100:.1f}%")
# Compare with LLM performance (which was ~0% success)
if successful_clues >= total_tests * 0.8: # 80% success rate
print("πŸŽ‰ Semantic templates show MAJOR improvement over LLM!")
elif successful_clues >= total_tests * 0.6: # 60% success rate
print("πŸ”„ Good improvement, semantic approach is viable")
elif successful_clues >= total_tests * 0.3: # 30% success rate
print("⚠️ Some improvement, but templates need refinement")
else:
print("❌ Semantic approach also struggling, may need hybrid method")


def main():
    """Run the semantic-template test."""
    test_semantic_clues()


if __name__ == "__main__":
    main()
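
# Usage (assuming this file lives in the hack/ directory next to
# semantic_clue_generator.py, as the sys.path insertion above expects):
#
#     python test_semantic_clues.py
#
# or, using the shebang:
#
#     chmod +x test_semantic_clues.py && ./test_semantic_clues.py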