#!/usr/bin/env python3
"""
Test script to verify custom NLTK data path functionality.

This shows that the WordNet generator now downloads NLTK data to
model_cache/nltk_data.
"""
import sys
from pathlib import Path

# Add hack directory to path so sibling modules (wordnet_clue_generator)
# can be imported when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent))


def test_custom_nltk_path():
    """Test that NLTK data downloads to the custom cache directory.

    Returns:
        bool: True when the generator was constructed and
        ensure_nltk_data() reported success; False on any error or
        when ensure_nltk_data() reports failure.
    """
    print("🧪 Testing Custom NLTK Data Path")
    print("=" * 50)

    model_cache = Path('./model_cache')
    nltk_cache = model_cache / 'nltk_data'

    print(f"📂 Model cache directory: {model_cache.absolute()}")
    print(f"📂 Expected NLTK data directory: {nltk_cache.absolute()}")

    try:
        from wordnet_clue_generator import WordNetClueGenerator

        # Create generator with explicit cache directory
        generator = WordNetClueGenerator(cache_dir=str(model_cache))

        print("\n🔧 Generator Configuration:")
        print(f" Cache dir: {generator.cache_dir}")
        print(f" NLTK data dir: {generator.nltk_data_dir}")

        # Check NLTK path configuration
        print("\n📋 NLTK Path Test:")
        if nltk_cache.exists():
            print(" ✅ NLTK cache directory exists")
            contents = list(nltk_cache.iterdir())
            print(f" Contents: {len(contents)} items")
            for item in contents[:5]:  # Show first 5 items
                print(f" - {item.name}")
        else:
            print(" 📁 NLTK cache directory will be created on initialization")

        # Test ensure_nltk_data function directly
        print("\n🔍 Testing ensure_nltk_data function:")
        from wordnet_clue_generator import ensure_nltk_data

        # This should create the directory and set up paths
        success = ensure_nltk_data(str(nltk_cache))
        print(f" Result: {'✅ Success' if success else '❌ Failed'}")

        if success:
            # Check NLTK path was added
            import nltk
            print(" NLTK search paths (first 3):")
            for i, path in enumerate(nltk.data.path[:3], 1):
                print(f" {i}. {path}")

        # Check if directory was created
        if nltk_cache.exists():
            print(" ✅ NLTK data directory created")
            # List contents if any
            contents = list(nltk_cache.rglob('*'))
            print(f" 📁 Directory contents: {len(contents)} total items")

        # BUG FIX: previously returned True unconditionally, so a failed
        # ensure_nltk_data() still made main() report overall SUCCESS.
        return bool(success)

    except Exception as e:
        print(f"❌ Test error: {e}")
        import traceback
        traceback.print_exc()
        return False


def show_cache_integration():
    """Show how the NLTK cache integrates with the existing cache layout."""
    print("\n📊 Cache Integration Overview")
    print("=" * 40)

    model_cache = Path('./model_cache')

    print("Cache Structure:")
    print("model_cache/")
    print("├── nltk_data/ # ← New NLTK data location")
    print("│ └── corpora/")
    print("│ ├── wordnet/")
    print("│ ├── punkt/")
    print("│ └── omw-1.4/")
    print("├── unified_vocabulary_*.pkl")
    print("├── unified_embeddings_*.npy")
    print("└── models--sentence-transformers/")

    if model_cache.exists():
        # Total size of all regular files under the cache, reported in MB.
        actual_size = sum(
            f.stat().st_size for f in model_cache.rglob('*') if f.is_file()
        ) / (1024 * 1024)
        print(f"\n📊 Current cache size: {actual_size:.1f} MB")

        nltk_dir = model_cache / 'nltk_data'
        if nltk_dir.exists():
            nltk_size = sum(
                f.stat().st_size for f in nltk_dir.rglob('*') if f.is_file()
            ) / (1024 * 1024)
            print(f"📊 NLTK data size: {nltk_size:.1f} MB")


def main():
    """Run the custom NLTK path test."""
    print("🚀 Custom NLTK Path Test")
    print("=" * 60)
    print("Testing WordNet generator with model_cache/nltk_data location")

    success = test_custom_nltk_path()
    show_cache_integration()

    if success:
        print("\n✅ SUCCESS!")
        print("📂 NLTK data will now download to: model_cache/nltk_data/")
        print("🎯 This keeps all AI/NLP data centralized in model_cache")
        print("⚡ WordNet clue generator ready for use!")
    else:
        print("\n❌ Test failed - check configuration")


if __name__ == "__main__":
    main()