abc123 / hack /test_custom_nltk_path.py
vimalk78's picture
feat(crossword): generated crosswords with clues
486eff6
raw
history blame
4.48 kB
#!/usr/bin/env python3
"""
Test script to verify custom NLTK data path functionality.
This shows that the WordNet generator now downloads NLTK data to model_cache/nltk_data.
"""
import sys
from pathlib import Path
# Add hack directory to path
sys.path.insert(0, str(Path(__file__).parent))
def test_custom_nltk_path():
    """Test that NLTK data downloads to custom cache directory.

    Returns:
        bool: True when ``ensure_nltk_data`` succeeds, False when it fails
        or when any exception (including a missing ``wordnet_clue_generator``
        module) is raised.

    Bug fix: previously only the success-and-directory-exists path returned
    True; a failed ``ensure_nltk_data`` call (or a success where the cache
    directory check was skipped) fell through to an implicit ``None``, so the
    caller's truthiness check misreported results. The function now always
    returns an explicit boolean.
    """
    print("πŸ§ͺ Testing Custom NLTK Data Path")
    print("=" * 50)
    model_cache = Path('./model_cache')
    nltk_cache = model_cache / 'nltk_data'
    print(f"πŸ“‚ Model cache directory: {model_cache.absolute()}")
    print(f"πŸ“‚ Expected NLTK data directory: {nltk_cache.absolute()}")
    try:
        from wordnet_clue_generator import WordNetClueGenerator
        # Create generator with an explicit cache directory so NLTK data is
        # rooted under model_cache rather than the user's home directory.
        generator = WordNetClueGenerator(cache_dir=str(model_cache))
        print(f"\nπŸ”§ Generator Configuration:")
        print(f" Cache dir: {generator.cache_dir}")
        print(f" NLTK data dir: {generator.nltk_data_dir}")
        # Report whether the NLTK cache directory already exists on disk.
        print(f"\nπŸ“‹ NLTK Path Test:")
        if nltk_cache.exists():
            print(f" βœ… NLTK cache directory exists")
            contents = list(nltk_cache.iterdir())
            print(f" Contents: {len(contents)} items")
            for item in contents[:5]:  # Show first 5 items
                print(f" - {item.name}")
        else:
            print(f" πŸ“ NLTK cache directory will be created on initialization")
        # Exercise ensure_nltk_data directly: it should create the directory
        # and register it on nltk.data.path.
        print(f"\nπŸ” Testing ensure_nltk_data function:")
        from wordnet_clue_generator import ensure_nltk_data
        success = ensure_nltk_data(str(nltk_cache))
        print(f" Result: {'βœ… Success' if success else '❌ Failed'}")
        if success:
            # Confirm the custom path was added to NLTK's search paths.
            import nltk
            print(f" NLTK search paths (first 3):")
            for i, path in enumerate(nltk.data.path[:3], 1):
                print(f" {i}. {path}")
            # Confirm the directory was actually created.
            if nltk_cache.exists():
                print(f" βœ… NLTK data directory created")
                contents = list(nltk_cache.rglob('*'))
                print(f" πŸ“ Directory contents: {len(contents)} total items")
        # Always return an explicit boolean reflecting ensure_nltk_data.
        return bool(success)
    except Exception as e:
        print(f"❌ Test error: {e}")
        import traceback
        traceback.print_exc()
        return False
def show_cache_integration():
    """Print how the NLTK data directory fits into the model_cache layout."""
    print(f"\nπŸ“Š Cache Integration Overview")
    print("=" * 40)
    cache_root = Path('./model_cache')
    # Static diagram of the expected cache tree.
    tree_lines = (
        "Cache Structure:",
        "model_cache/",
        "β”œβ”€β”€ nltk_data/ # ← New NLTK data location",
        "β”‚ └── corpora/",
        "β”‚ β”œβ”€β”€ wordnet/",
        "β”‚ β”œβ”€β”€ punkt/",
        "β”‚ └── omw-1.4/",
        "β”œβ”€β”€ unified_vocabulary_*.pkl",
        "β”œβ”€β”€ unified_embeddings_*.npy",
        "└── models--sentence-transformers/",
    )
    for line in tree_lines:
        print(line)
    # Report on-disk sizes only when the cache actually exists.
    if cache_root.exists():
        mb = 1024 * 1024
        total_mb = sum(f.stat().st_size for f in cache_root.rglob('*') if f.is_file()) / mb
        print(f"\nπŸ“Š Current cache size: {total_mb:.1f} MB")
        nltk_root = cache_root / 'nltk_data'
        if nltk_root.exists():
            nltk_mb = sum(f.stat().st_size for f in nltk_root.rglob('*') if f.is_file()) / mb
            print(f"πŸ“Š NLTK data size: {nltk_mb:.1f} MB")
def main():
    """Run the custom NLTK path test, then print the cache overview."""
    print("πŸš€ Custom NLTK Path Test")
    print("=" * 60)
    print("Testing WordNet generator with model_cache/nltk_data location")
    passed = test_custom_nltk_path()
    show_cache_integration()
    # Guard clause: bail out early on failure instead of an if/else pyramid.
    if not passed:
        print(f"\n❌ Test failed - check configuration")
        return
    print(f"\nβœ… SUCCESS!")
    print(f"πŸ“‚ NLTK data will now download to: model_cache/nltk_data/")
    print(f"🎯 This keeps all AI/NLP data centralized in model_cache")
    print(f"⚑ WordNet clue generator ready for use!")
# Run the test only when executed as a script, not when imported.
if __name__ == "__main__":
    main()