abc123 / hack /test_custom_nltk_path.py
vimalk78's picture
feat(crossword): generated crosswords with clues
486eff6
raw
history blame
4.48 kB
#!/usr/bin/env python3
"""
Test script to verify custom NLTK data path functionality.
This shows that the WordNet generator now downloads NLTK data to model_cache/nltk_data.
"""
import sys
from pathlib import Path
# Add hack directory to path
sys.path.insert(0, str(Path(__file__).parent))
def test_custom_nltk_path():
    """Test that NLTK data downloads to custom cache directory.

    Returns:
        bool: True when ``ensure_nltk_data`` succeeds, False when it fails
        or when any exception (including a missing ``wordnet_clue_generator``
        module) is raised.

    Bug fix: previously only the success-and-directory-exists path returned
    True; a failed ``ensure_nltk_data`` call (or a success where the cache
    directory check was skipped) fell through to an implicit ``None``, so the
    caller's truthiness check misreported results. The function now always
    returns an explicit boolean.
    """
    print("πŸ§ͺ Testing Custom NLTK Data Path")
    print("=" * 50)
    model_cache = Path('./model_cache')
    nltk_cache = model_cache / 'nltk_data'
    print(f"πŸ“‚ Model cache directory: {model_cache.absolute()}")
    print(f"πŸ“‚ Expected NLTK data directory: {nltk_cache.absolute()}")
    try:
        from wordnet_clue_generator import WordNetClueGenerator
        # Create generator with an explicit cache directory so NLTK data is
        # rooted under model_cache rather than the user's home directory.
        generator = WordNetClueGenerator(cache_dir=str(model_cache))
        print(f"\nπŸ”§ Generator Configuration:")
        print(f" Cache dir: {generator.cache_dir}")
        print(f" NLTK data dir: {generator.nltk_data_dir}")
        # Report whether the NLTK cache directory already exists on disk.
        print(f"\nπŸ“‹ NLTK Path Test:")
        if nltk_cache.exists():
            print(f" βœ… NLTK cache directory exists")
            contents = list(nltk_cache.iterdir())
            print(f" Contents: {len(contents)} items")
            for item in contents[:5]:  # Show first 5 items
                print(f" - {item.name}")
        else:
            print(f" πŸ“ NLTK cache directory will be created on initialization")
        # Exercise ensure_nltk_data directly: it should create the directory
        # and register it on nltk.data.path.
        print(f"\nπŸ” Testing ensure_nltk_data function:")
        from wordnet_clue_generator import ensure_nltk_data
        success = ensure_nltk_data(str(nltk_cache))
        print(f" Result: {'βœ… Success' if success else '❌ Failed'}")
        if success:
            # Confirm the custom path was added to NLTK's search paths.
            import nltk
            print(f" NLTK search paths (first 3):")
            for i, path in enumerate(nltk.data.path[:3], 1):
                print(f" {i}. {path}")
            # Confirm the directory was actually created.
            if nltk_cache.exists():
                print(f" βœ… NLTK data directory created")
                contents = list(nltk_cache.rglob('*'))
                print(f" πŸ“ Directory contents: {len(contents)} total items")
        # Always return an explicit boolean reflecting ensure_nltk_data.
        return bool(success)
    except Exception as e:
        print(f"❌ Test error: {e}")
        import traceback
        traceback.print_exc()
        return False
def show_cache_integration():
    """Print how the NLTK data directory fits into the model_cache layout."""
    print(f"\nπŸ“Š Cache Integration Overview")
    print("=" * 40)
    cache_root = Path('./model_cache')
    # Static diagram of the expected cache tree.
    tree_lines = (
        "Cache Structure:",
        "model_cache/",
        "β”œβ”€β”€ nltk_data/ # ← New NLTK data location",
        "β”‚ └── corpora/",
        "β”‚ β”œβ”€β”€ wordnet/",
        "β”‚ β”œβ”€β”€ punkt/",
        "β”‚ └── omw-1.4/",
        "β”œβ”€β”€ unified_vocabulary_*.pkl",
        "β”œβ”€β”€ unified_embeddings_*.npy",
        "└── models--sentence-transformers/",
    )
    for line in tree_lines:
        print(line)
    # Report on-disk sizes only when the cache actually exists.
    if cache_root.exists():
        mb = 1024 * 1024
        total_mb = sum(f.stat().st_size for f in cache_root.rglob('*') if f.is_file()) / mb
        print(f"\nπŸ“Š Current cache size: {total_mb:.1f} MB")
        nltk_root = cache_root / 'nltk_data'
        if nltk_root.exists():
            nltk_mb = sum(f.stat().st_size for f in nltk_root.rglob('*') if f.is_file()) / mb
            print(f"πŸ“Š NLTK data size: {nltk_mb:.1f} MB")
def main():
    """Run the custom NLTK path test, then print the cache overview."""
    print("πŸš€ Custom NLTK Path Test")
    print("=" * 60)
    print("Testing WordNet generator with model_cache/nltk_data location")
    passed = test_custom_nltk_path()
    show_cache_integration()
    # Guard clause: bail out early on failure instead of an if/else pyramid.
    if not passed:
        print(f"\n❌ Test failed - check configuration")
        return
    print(f"\nβœ… SUCCESS!")
    print(f"πŸ“‚ NLTK data will now download to: model_cache/nltk_data/")
    print(f"🎯 This keeps all AI/NLP data centralized in model_cache")
    print(f"⚑ WordNet clue generator ready for use!")
# Run the test only when executed as a script, not when imported.
if __name__ == "__main__":
    main()