Spaces:

vimalk78
/

abc123

Running

App Files Files Community

abc123 / hack /test_clue_generation.py

vimalk78

feat(crossword): generated crosswords with clues

486eff6 24 days ago

raw

history blame

24.4 kB

	#!/usr/bin/env python3
	"""
	Test Script: Thematic Word Generation + LLM Clue Generation

	Integrates the existing thematic_word_generator.py with the new llm_clue_generator.py
	to create a complete word-to-clue pipeline for crossword puzzles.

	Tests various scenarios:
	- Single topics
	- Multiple topics
	- Custom sentences
	- Different difficulties
	- Performance analysis
	"""

	import os
	import sys
	import time
	import logging
	from typing import List, Dict, Tuple, Any
	from pathlib import Path

	# Add hack directory to path for imports
	sys.path.insert(0, str(Path(__file__).parent))

	# The two generator modules are optional at import time: when either one is
	# missing, the failure is reported on stdout and GENERATORS_AVAILABLE is set
	# to False so the rest of the module can refuse to run instead of crashing.
	try:
	    from thematic_word_generator import UnifiedThematicWordGenerator
	    from llm_clue_generator import LLMClueGenerator
	except ImportError as e:
	    print(f"❌ Import error: {e}")
	    print("Make sure thematic_word_generator.py and llm_clue_generator.py are in the same directory")
	    GENERATORS_AVAILABLE = False
	else:
	    # Only reached when both imports succeeded.
	    GENERATORS_AVAILABLE = True

	# Set up logging: INFO and above, with timestamp, logger name and line number
	# in every record so messages can be traced back to their source line.
	logging.basicConfig(
	    level=logging.INFO,
	    format='%(asctime)s - %(name)s:%(lineno)d - %(levelname)s - %(message)s',
	    datefmt='%Y-%m-%d %H:%M:%S'
	)
	# Module-level logger used by the test suite and by main().
	logger = logging.getLogger(__name__)


	class CrosswordClueTestSuite:
	    """Test suite for integrated thematic word generation + LLM clue generation.

	    Call ``initialize()`` before any ``run_*`` method: the two generators are
	    only constructed there. Every ``run_*`` method prints a human-readable
	    report to stdout and returns a result dictionary.
	    """

	    # Error text returned by every test when the word generator yields nothing.
	    _NO_WORDS_ERROR = "No thematic words generated"

	    def __init__(self, vocab_size_limit: int = 50000):
	        """Initialize the test suite.

	        Args:
	            vocab_size_limit: Vocabulary size for thematic generator
	                (smaller for faster testing)

	        Raises:
	            ImportError: If the generator modules could not be imported.
	        """
	        if not GENERATORS_AVAILABLE:
	            raise ImportError("Required generators not available")

	        self.vocab_size_limit = vocab_size_limit
	        # Both generators stay None until initialize() builds them.
	        self.word_generator = None
	        self.clue_generator = None

	        # Test results storage (test name -> result dict) and start-up timings.
	        self.test_results: Dict[str, Any] = {}
	        self.performance_stats: Dict[str, float] = {}

	    def initialize(self):
	        """Build and initialize both generators, recording how long each took.

	        The three timings are stored in ``performance_stats`` under
	        ``word_gen_init_time``, ``clue_gen_init_time`` and ``total_init_time``.
	        """
	        print("🚀 Initializing Crossword Clue Test Suite")
	        print("=" * 60)

	        # Initialize thematic word generator
	        print("\n🔄 Initializing thematic word generator...")
	        start_time = time.perf_counter()
	        self.word_generator = UnifiedThematicWordGenerator(
	            vocab_size_limit=self.vocab_size_limit
	        )
	        self.word_generator.initialize()
	        word_gen_time = time.perf_counter() - start_time
	        print(f"✅ Word generator ready in {word_gen_time:.2f}s")

	        # Initialize LLM clue generator
	        print("\n🔄 Initializing LLM clue generator...")
	        start_time = time.perf_counter()
	        self.clue_generator = LLMClueGenerator()
	        self.clue_generator.initialize()
	        clue_gen_time = time.perf_counter() - start_time
	        print(f"✅ Clue generator ready in {clue_gen_time:.2f}s")

	        # Store initialization stats
	        total_init_time = word_gen_time + clue_gen_time
	        self.performance_stats['word_gen_init_time'] = word_gen_time
	        self.performance_stats['clue_gen_init_time'] = clue_gen_time
	        self.performance_stats['total_init_time'] = total_init_time

	        print(f"\n✅ Test suite initialized in {total_init_time:.2f}s")

	    def _build_clue_pairs(self, thematic_results, *, topic, clue_style, difficulty,
	                          fallback_clue, include_length=True, log_label="clue"):
	        """Generate one clue record per ``(word, similarity, tier)`` tuple.

	        Args:
	            thematic_results: Iterable of ``(word, similarity, tier)`` tuples.
	            topic: Topic/context string handed to the clue generator.
	            clue_style: Clue style handed to the clue generator.
	            difficulty: Difficulty handed to the clue generator.
	            fallback_clue: Callable ``word -> str`` used when generation fails.
	            include_length: Whether each record carries a ``"length"`` key.
	            log_label: Noun phrase used in the failure log message.

	        Returns:
	            List of dicts with keys ``word`` (upper-cased), ``clue``,
	            ``similarity``, ``tier``, optionally ``length``, and ``error``
	            (only on records whose clue generation raised).
	        """
	        pairs = []
	        for word, similarity, tier in thematic_results:
	            record = {
	                "word": word.upper(),
	                "clue": None,  # filled in below; placed here to keep key order
	                "similarity": similarity,
	                "tier": tier,
	            }
	            if include_length:
	                record["length"] = len(word)

	            try:
	                record["clue"] = self.clue_generator.generate_clue(
	                    word=word,
	                    topic=topic,
	                    clue_style=clue_style,
	                    difficulty=difficulty
	                )
	            except Exception as e:
	                # Deliberately broad: one failing word must not abort the run.
	                # The failure is logged and flagged on the record instead.
	                logger.error("Failed to generate %s for '%s': %s", log_label, word, e)
	                record["clue"] = fallback_clue(word)
	                record["error"] = str(e)

	            pairs.append(record)
	        return pairs

	    def _print_word_clue_table(self, word_clue_pairs):
	        """Print the numbered word/clue listing with similarity and tier."""
	        tier_descriptions = self.word_generator.tier_descriptions
	        for i, item in enumerate(word_clue_pairs, 1):
	            # Fall back to the raw tier value when it has no description.
	            tier_desc = tier_descriptions.get(item['tier'], item['tier'])
	            error_marker = " ⚠️" if 'error' in item else ""
	            print(f"{i:2d}. {item['word']:<12} ({item['length']} letters) - {item['clue']}{error_marker}")
	            print(f" Similarity: {item['similarity']:.3f} | {tier_desc}")

	    def _run_word_clue_pipeline(self, *, inputs, num_words, word_gen_options,
	                                word_progress, clue_progress, clue_topic,
	                                clue_style, fallback_clue, results_header,
	                                results_rule_width):
	        """Run word generation then clue generation, print the report, return stats.

	        Shared by the single-topic, multi-topic and custom-sentence tests,
	        which differ only in their inputs, thresholds and message texts.

	        Args:
	            inputs: Value passed as ``inputs`` to the word generator.
	            num_words: Number of words requested.
	            word_gen_options: Extra keyword arguments for the word generator.
	            word_progress: Message printed before word generation.
	            clue_progress: Message printed before clue generation, or ``None``
	                for the default message that includes the word count.
	            clue_topic: Topic/context string for the clue generator.
	            clue_style: Clue style for the clue generator.
	            fallback_clue: Callable ``word -> str`` used when a clue fails.
	            results_header: Heading printed above the results listing.
	            results_rule_width: Width of the ``=`` rule under the heading.

	        Returns:
	            ``{"error": ...}`` when no words were generated, otherwise a dict
	            with ``num_words_requested``, ``num_words_generated``,
	            ``word_clue_pairs`` and ``performance``.
	        """
	        start_time = time.perf_counter()

	        # Step 1: Generate thematic words
	        print(word_progress)
	        word_start_time = time.perf_counter()
	        thematic_results = self.word_generator.generate_thematic_words(
	            inputs=inputs,
	            num_words=num_words,
	            **word_gen_options
	        )
	        word_gen_time = time.perf_counter() - word_start_time
	        print(f"✅ Generated {len(thematic_results)} words in {word_gen_time:.2f}s")

	        if not thematic_results:
	            return {"error": self._NO_WORDS_ERROR}

	        # Step 2: Generate clues for each word
	        if clue_progress is None:
	            clue_progress = f"🎭 Generating clues for {len(thematic_results)} words..."
	        print(clue_progress)
	        clue_start_time = time.perf_counter()
	        word_clue_pairs = self._build_clue_pairs(
	            thematic_results,
	            topic=clue_topic,
	            clue_style=clue_style,
	            difficulty="medium",
	            fallback_clue=fallback_clue,
	        )
	        clue_gen_time = time.perf_counter() - clue_start_time
	        total_time = time.perf_counter() - start_time

	        # Display results
	        print(f"✅ Generated {len(word_clue_pairs)} clues in {clue_gen_time:.2f}s")
	        print(results_header)
	        print("=" * results_rule_width)
	        self._print_word_clue_table(word_clue_pairs)

	        # Performance summary (average computed once, used for print and return)
	        avg_clue_time = clue_gen_time / len(word_clue_pairs) if word_clue_pairs else 0
	        print("\n⏱️ Performance:")
	        print(f" Word generation: {word_gen_time:.2f}s")
	        print(f" Clue generation: {clue_gen_time:.2f}s ({avg_clue_time:.2f}s per clue)")
	        print(f" Total time: {total_time:.2f}s")

	        return {
	            "num_words_requested": num_words,
	            "num_words_generated": len(word_clue_pairs),
	            "word_clue_pairs": word_clue_pairs,
	            "performance": {
	                "word_gen_time": word_gen_time,
	                "clue_gen_time": clue_gen_time,
	                "total_time": total_time,
	                "avg_clue_time": avg_clue_time
	            }
	        }

	    def run_single_topic_test(self, topic: str, num_words: int = 10) -> Dict[str, Any]:
	        """Test single topic word+clue generation.

	        Args:
	            topic: Single topic string
	            num_words: Number of words to generate

	        Returns:
	            Test results dictionary (``{"error": ...}`` when no words came back)
	        """
	        print(f"\n🎯 Single Topic Test: '{topic}'")
	        print("-" * 50)

	        summary = self._run_word_clue_pipeline(
	            inputs=topic,
	            num_words=num_words,
	            word_gen_options={"min_similarity": 0.3},
	            word_progress=f"📝 Generating {num_words} thematic words...",
	            clue_progress=None,
	            clue_topic=topic,
	            clue_style="category",
	            fallback_clue=lambda word: f"Related to {topic}: {word}",  # Fallback
	            results_header=f"\n📋 Results for topic '{topic}':",
	            results_rule_width=60,
	        )
	        if "error" in summary:
	            return summary
	        return {"topic": topic, **summary}

	    def run_multi_topic_test(self, topics: List[str], num_words: int = 12) -> Dict[str, Any]:
	        """Test multi-topic word+clue generation.

	        Args:
	            topics: List of topic strings
	            num_words: Number of words to generate

	        Returns:
	            Test results dictionary (``{"error": ...}`` when no words came back)
	        """
	        print(f"\n🎯 Multi-Topic Test: {topics}")
	        print("-" * 50)

	        # Create topic context string for clue generation
	        topic_context = " and ".join(topics)

	        summary = self._run_word_clue_pipeline(
	            inputs=topics,
	            num_words=num_words,
	            word_gen_options={
	                "min_similarity": 0.25,  # Lower threshold for multi-topic
	                "multi_theme": True,  # Enable multi-theme processing
	            },
	            word_progress=f"📝 Generating {num_words} multi-thematic words...",
	            clue_progress="🎭 Generating contextual clues...",
	            clue_topic=topic_context,
	            clue_style="description",  # Use descriptive style for multi-topic
	            fallback_clue=lambda word: f"Related to {topic_context}: {word}",
	            results_header=f"\n📋 Results for topics {topics}:",
	            results_rule_width=70,
	        )
	        if "error" in summary:
	            return summary
	        return {"topics": topics, **summary}

	    def run_custom_sentence_test(self, sentence: str, num_words: int = 10) -> Dict[str, Any]:
	        """Test custom sentence word+clue generation.

	        Args:
	            sentence: Custom sentence input
	            num_words: Number of words to generate

	        Returns:
	            Test results dictionary (``{"error": ...}`` when no words came back)
	        """
	        print(f"\n🎯 Custom Sentence Test: '{sentence}'")
	        print("-" * 60)

	        summary = self._run_word_clue_pipeline(
	            inputs=sentence,
	            num_words=num_words,
	            word_gen_options={"min_similarity": 0.2},  # Lower threshold for sentences
	            word_progress=f"📝 Generating {num_words} words from sentence...",
	            clue_progress="🎭 Generating personalized clues...",
	            # Use the original sentence as context for more personalized clues
	            clue_topic=f"theme: {sentence}",
	            clue_style="description",
	            fallback_clue=lambda word: f"From '{sentence[:30]}...': {word}",
	            results_header=f"\n📋 Results for sentence: '{sentence}'",
	            results_rule_width=70,
	        )
	        if "error" in summary:
	            return summary
	        return {"sentence": sentence, **summary}

	    def run_difficulty_comparison_test(self, topic: str, num_words: int = 6) -> Dict[str, Any]:
	        """Test different difficulty levels for the same topic.

	        The words are generated once and reused, so the three difficulty
	        levels are compared on identical input.

	        Args:
	            topic: Topic to test
	            num_words: Number of words to generate

	        Returns:
	            Comparison results (``{"error": ...}`` when no words came back)
	        """
	        print(f"\n🎯 Difficulty Comparison Test: '{topic}'")
	        print("-" * 50)

	        difficulties = ["easy", "medium", "hard"]

	        # Generate words once (reuse for all difficulties)
	        thematic_results = self.word_generator.generate_thematic_words(
	            inputs=topic,
	            num_words=num_words,
	            min_similarity=0.3
	        )[:num_words]  # Take only requested number

	        if not thematic_results:
	            return {"error": self._NO_WORDS_ERROR}

	        print(f"📝 Testing {len(thematic_results)} words at different difficulty levels...")

	        results = {}
	        for difficulty in difficulties:
	            print(f"\n--- {difficulty.upper()} Difficulty ---")

	            start_time = time.perf_counter()
	            clue_pairs = self._build_clue_pairs(
	                thematic_results,
	                topic=topic,
	                clue_style="category",
	                difficulty=difficulty,
	                # Bind the level as a default so the callable is self-contained.
	                fallback_clue=lambda word, level=difficulty: f"{level.title()} clue for {word}",
	                include_length=False,
	                log_label=f"{difficulty} clue",
	            )
	            generation_time = time.perf_counter() - start_time

	            results[difficulty] = {
	                "clue_pairs": clue_pairs,
	                "generation_time": generation_time
	            }

	            # Display this difficulty's results
	            for i, item in enumerate(clue_pairs, 1):
	                error_marker = " ⚠️" if 'error' in item else ""
	                print(f" {i}. {item['word']:<10} - {item['clue']}{error_marker}")

	        return {
	            "topic": topic,
	            "difficulties_tested": difficulties,
	            "results": results,
	            "base_words": [
	                {"word": w, "similarity": s, "tier": t}
	                for w, s, t in thematic_results
	            ]
	        }

	    @staticmethod
	    def _summarize_times(values):
	        """Return ``avg``/``min``/``max`` for a non-empty list of durations."""
	        return {
	            "avg": sum(values) / len(values),
	            "min": min(values),
	            "max": max(values)
	        }

	    def run_performance_analysis(self) -> Dict[str, Any]:
	        """Analyze overall performance characteristics.

	        Aggregates the ``performance`` entries of everything stored in
	        ``test_results``; results without that key are skipped.

	        Returns:
	            Dict with ``word_gen_stats``, ``clue_gen_stats`` and
	            ``total_stats`` (each ``avg``/``min``/``max``), or an empty dict
	            when no stored result carries timing data. Returning zeros there
	            would read as a perfect score, so that case is reported instead.
	        """
	        print("\n📊 Performance Analysis")
	        print("-" * 40)

	        # Collect performance stats from previous tests
	        timings = [
	            result['performance']
	            for result in self.test_results.values()
	            if 'performance' in result
	        ]
	        if not timings:
	            print("⚠️ No test results available for performance analysis")
	            return {}

	        word_stats = self._summarize_times([p.get('word_gen_time', 0) for p in timings])
	        clue_stats = self._summarize_times([p.get('clue_gen_time', 0) for p in timings])
	        total_stats = self._summarize_times([p.get('total_time', 0) for p in timings])

	        sections = (
	            ("📈 Word Generation Performance:", word_stats),
	            ("\n🎭 Clue Generation Performance:", clue_stats),
	            ("\n⏱️ Total Pipeline Performance:", total_stats),
	        )
	        for heading, stats in sections:
	            print(heading)
	            print(f" Average: {stats['avg']:.2f}s")
	            print(f" Min: {stats['min']:.2f}s")
	            print(f" Max: {stats['max']:.2f}s")

	        return {
	            "word_gen_stats": word_stats,
	            "clue_gen_stats": clue_stats,
	            "total_stats": total_stats
	        }

	    @staticmethod
	    def _print_banner(title):
	        """Print a section title framed by two 70-character rules."""
	        print("\n" + "=" * 70)
	        print(title)
	        print("=" * 70)

	    def run_full_test_suite(self):
	        """Run the complete test suite.

	        Initializes the generators, runs every scenario, stores each result
	        in ``test_results`` and prints a final summary.
	        """
	        print("🧪 CROSSWORD CLUE GENERATION TEST SUITE")
	        print("=" * 70)

	        if not GENERATORS_AVAILABLE:
	            print("❌ Cannot run tests - generators not available")
	            return

	        # Initialize
	        self.initialize()

	        # Test 1: Single topics
	        self._print_banner("TEST 1: SINGLE TOPIC TESTS")
	        for topic in ["animals", "technology", "music", "food"]:
	            self.test_results[f"single_{topic}"] = self.run_single_topic_test(topic, num_words=8)

	        # Test 2: Multi-topic
	        self._print_banner("TEST 2: MULTI-TOPIC TEST")
	        self.test_results["multi_science_tech"] = self.run_multi_topic_test(
	            ["science", "technology"], num_words=10
	        )

	        # Test 3: Custom sentence
	        self._print_banner("TEST 3: CUSTOM SENTENCE TEST")
	        self.test_results["sentence_cats_guitar"] = self.run_custom_sentence_test(
	            "I love cats and playing guitar", num_words=8
	        )

	        # Test 4: Difficulty comparison
	        self._print_banner("TEST 4: DIFFICULTY COMPARISON")
	        self.test_results["difficulty_sports"] = self.run_difficulty_comparison_test(
	            "sports", num_words=5
	        )

	        # Test 5: Performance analysis
	        self._print_banner("TEST 5: PERFORMANCE ANALYSIS")
	        perf_result = self.run_performance_analysis()
	        self.test_results["performance"] = perf_result

	        # Final summary
	        self._print_banner("📋 FINAL SUMMARY")
	        print("✅ Test suite completed!")
	        print(f"📊 Tests run: {len(self.test_results)}")

	        # Model info
	        word_info = {
	            "vocab_size": self.word_generator.get_vocabulary_size(),
	            "tier_distribution": len(self.word_generator.get_tier_distribution())
	        }
	        clue_info = self.clue_generator.get_model_info()

	        print("\n🔧 System Information:")
	        print(f" Word vocabulary: {word_info['vocab_size']:,} words")
	        # .get() on both lookups so a sparse model-info dict cannot abort the
	        # summary after all the tests have already run.
	        print(f" Clue model: {clue_info.get('model_name', 'unknown')}")
	        print(f" Model size: {clue_info.get('model_size_mb', 0):.1f} MB")

	        if perf_result:
	            avg_total = perf_result['total_stats']['avg']
	            print(f" Average pipeline time: {avg_total:.2f}s")

	        print("\n💡 Recommendations for HF Spaces:")
	        if perf_result and perf_result['total_stats']['avg'] < 15:
	            print(" ✅ Performance suitable for interactive use")
	        else:
	            print(" ⚠️ Consider optimizations for better user experience")

	        print("\n🎉 Test suite complete!")


	def main():
	    """Run the test suite.

	    Does nothing beyond printing a hint when the generator modules failed to
	    import. Otherwise builds the suite and runs every scenario; Ctrl-C and any
	    other failure are reported rather than propagated.
	    """
	    if GENERATORS_AVAILABLE:
	        # Create and run test suite
	        suite = CrosswordClueTestSuite(vocab_size_limit=50000)  # Use existing cached embeddings

	        try:
	            suite.run_full_test_suite()
	        except Exception as e:
	            # Top-level boundary: report on stdout and log the traceback.
	            print(f"\n❌ Test suite failed: {e}")
	            logger.error(f"Test suite error: {e}", exc_info=True)
	        except KeyboardInterrupt:
	            # Not an Exception subclass, so it never reaches the handler above.
	            print("\n\n⏹️ Test suite interrupted by user")
	    else:
	        print("❌ Cannot run tests - required generators not available")
	        print("Make sure thematic_word_generator.py and llm_clue_generator.py are working")


	# Run only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()