#!/usr/bin/env python3
"""
Test Multiple Models via API for Crossword Clue Generation
Compare various models and find the best performer.
"""

import sys
import logging
from pathlib import Path

# Add hack directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))

try:
    from api_clue_generator import APIClueGenerator
    API_AVAILABLE = True
except ImportError as e:
    print(f"❌ Import error: {e}")
    API_AVAILABLE = False
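
# Assumed APIClueGenerator interface, inferred from how it is used below
# (the authoritative definition lives in api_clue_generator.py):
#   generator.models                             -> {model_key: model_id}
#   generator.generate_clue(word, topic)         -> {model_key: clue or None}
#   generator.evaluate_clue_quality(word, clue)  -> (quality_label, score)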

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


def test_multiple_models():
    """Test multiple models via API and compare results."""
    if not API_AVAILABLE:
        print("❌ Cannot run test - API generator not available")
        return

    print("🧪 Testing Multiple Models via Hugging Face API")
    print("=" * 60)

    # Initialize API generator
    generator = APIClueGenerator()
print(f"🎯 Testing {len(generator.models)} models:")
for i, (key, model) in enumerate(generator.models.items(), 1):
print(f" {i}. {key} ({model})")
# Test cases for comprehensive evaluation
test_cases = [
# Cases that failed with local models
("CAT", "animals"),
("BATSMAN", "cricket"),
("SWIMMING", "sports"),
("AIRPORT", "transportation"),
("DATABASE", "technology"),
# Additional challenging cases
("VIOLIN", "music"),
("PIZZA", "food"),
("SCIENTIST", "science"),
("MOUNTAIN", "geography"),
("ELEPHANT", "animals"),
]
print(f"\nπŸ“Š Testing {len(test_cases)} word-topic combinations")
print("=" * 60)
# Track results for each model
model_scores = {model_key: {"total": 0, "excellent": 0, "good": 0, "poor": 0, "failed": 0}
for model_key in generator.models.keys()}
all_results = []
for i, (word, topic) in enumerate(test_cases, 1):
print(f"\nπŸ“ Test {i}/{len(test_cases)}: '{word}' + '{topic}'")
print("-" * 50)

        try:
            # Generate clues with all models
            results = generator.generate_clue(word, topic)
            test_result = {"word": word, "topic": topic, "results": {}}

            # Evaluate each model's response
            for model_key, clue in results.items():
                if clue:
                    quality, score = generator.evaluate_clue_quality(word, clue)
                    test_result["results"][model_key] = {"clue": clue, "quality": quality, "score": score}

                    # Update model statistics
                    model_scores[model_key]["total"] += 1
                    if quality == "EXCELLENT":
                        model_scores[model_key]["excellent"] += 1
                    elif quality == "GOOD":
                        model_scores[model_key]["good"] += 1
                    elif quality == "ACCEPTABLE":
                        model_scores[model_key]["good"] += 1  # Count as good
                    else:
                        model_scores[model_key]["poor"] += 1

                    print(f" {model_key:20} | {quality:10} | {clue}")
                else:
                    model_scores[model_key]["failed"] += 1
                    test_result["results"][model_key] = {"clue": None, "quality": "FAILED", "score": 0.0}
                    print(f" {model_key:20} | FAILED | No response")

            all_results.append(test_result)

        except Exception as e:
            print(f"❌ Error in test {i}: {e}")

    # Calculate final scores and rankings
    print("\n" + "=" * 60)
    print("📊 FINAL MODEL COMPARISON RESULTS")
    print("=" * 60)

    model_rankings = []
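    # Note: the rates below are normalised by len(test_cases) rather than by the
    # number of clues a model actually returned, so failed responses lower a
    # model's success rate instead of being skipped.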
    for model_key, stats in model_scores.items():
        if stats["total"] > 0:
            success_rate = ((stats["excellent"] + stats["good"]) / len(test_cases)) * 100
            excellence_rate = (stats["excellent"] / len(test_cases)) * 100
            failure_rate = (stats["failed"] / len(test_cases)) * 100
        else:
            success_rate = excellence_rate = failure_rate = 0

        model_rankings.append({
            "model": model_key,
            "success_rate": success_rate,
            "excellence_rate": excellence_rate,
            "failure_rate": failure_rate,
            "stats": stats
        })

    # Sort by success rate, then by excellence rate
    model_rankings.sort(key=lambda x: (x["success_rate"], x["excellence_rate"]), reverse=True)
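
    # Table columns: E / G / P / F are per-model counts of clues rated
    # Excellent, Good (Acceptable is folded into Good above), Poor, and Failed.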
print(f"{'Rank':4} {'Model':25} {'Success%':8} {'Excel%':7} {'Fail%':6} {'E':2} {'G':2} {'P':2} {'F':2}")
print("-" * 75)
for i, ranking in enumerate(model_rankings, 1):
model = ranking["model"]
success = ranking["success_rate"]
excel = ranking["excellence_rate"]
fail = ranking["failure_rate"]
stats = ranking["stats"]
print(f"{i:4} {model:25} {success:7.1f} {excel:6.1f} {fail:5.1f} "
f"{stats['excellent']:2} {stats['good']:2} {stats['poor']:2} {stats['failed']:2}")

    # Show best results
    if model_rankings:
        best_model = model_rankings[0]
        print(f"\n🏆 BEST PERFORMING MODEL: {best_model['model']}")
        print(f" Success Rate: {best_model['success_rate']:.1f}%")
        print(f" Excellence Rate: {best_model['excellence_rate']:.1f}%")

        if best_model['success_rate'] >= 70:
            print("🎉 EXCELLENT! This model is ready for production use!")
        elif best_model['success_rate'] >= 50:
            print("🔄 Good results! This model shows promise for crossword generation")
        else:
            print("⚠️ Moderate results. May need prompt refinement or a different approach")

    # Show some example excellent clues
    print("\n🌟 BEST CLUE EXAMPLES:")
    print("-" * 40)
    excellent_examples = []
    for result in all_results:
        for model_key, res in result["results"].items():
            if res["quality"] == "EXCELLENT":
                excellent_examples.append((result["word"], result["topic"], res["clue"], model_key))

    for word, topic, clue, model in excellent_examples[:5]:  # Show top 5
        print(f" {word} + {topic}: \"{clue}\" ({model})")

    return model_rankings


def main():
    """Run the multiple model comparison test."""
    rankings = test_multiple_models()

    if rankings:
        print("\n💡 RECOMMENDATION:")
        best = rankings[0]
        print(f"Use '{best['model']}' as your primary clue generation model.")
        print(f"It achieved a {best['success_rate']:.1f}% success rate with {best['excellence_rate']:.1f}% excellent clues.")


if __name__ == "__main__":
    main()