#!/usr/bin/env python3
"""
Test script to verify custom NLTK data path functionality.
This shows that the WordNet generator now downloads NLTK data to model_cache/nltk_data.
"""

import sys
from pathlib import Path

# Make sibling modules (e.g. wordnet_clue_generator) importable by adding
# this script's directory (hack/) to the import path
sys.path.insert(0, str(Path(__file__).parent))

def test_custom_nltk_path():
    """Test that NLTK data downloads to custom cache directory."""
    print("πŸ§ͺ Testing Custom NLTK Data Path")
    print("=" * 50)
    
    model_cache = Path('./model_cache')
    nltk_cache = model_cache / 'nltk_data'
    
    print(f"πŸ“‚ Model cache directory: {model_cache.absolute()}")
    print(f"πŸ“‚ Expected NLTK data directory: {nltk_cache.absolute()}")
    
    try:
        from wordnet_clue_generator import WordNetClueGenerator
        
        # Create generator with explicit cache directory
        generator = WordNetClueGenerator(cache_dir=str(model_cache))
        
        print(f"\nπŸ”§ Generator Configuration:")
        print(f"   Cache dir: {generator.cache_dir}")
        print(f"   NLTK data dir: {generator.nltk_data_dir}")
        
        # Check NLTK path configuration
        print(f"\nπŸ“‹ NLTK Path Test:")
        if nltk_cache.exists():
            print(f"   βœ… NLTK cache directory exists")
            contents = list(nltk_cache.iterdir())
            print(f"   Contents: {len(contents)} items")
            for item in contents[:5]:  # Show first 5 items
                print(f"     - {item.name}")
        else:
            print(f"   πŸ“ NLTK cache directory will be created on initialization")
        
        # Test ensure_nltk_data function directly
        print(f"\nπŸ” Testing ensure_nltk_data function:")
        from wordnet_clue_generator import ensure_nltk_data
        
        # This should create the directory and set up paths
        success = ensure_nltk_data(str(nltk_cache))
        print(f"   Result: {'βœ… Success' if success else '❌ Failed'}")
        
        if success:
            # Check NLTK path was added
            import nltk
            print(f"   NLTK search paths (first 3):")
            for i, path in enumerate(nltk.data.path[:3], 1):
                print(f"     {i}. {path}")
            
            # Check if directory was created
            if nltk_cache.exists():
                print(f"   βœ… NLTK data directory created")
                
                # List contents if any
                contents = list(nltk_cache.rglob('*'))
                print(f"   πŸ“ Directory contents: {len(contents)} total items")
        
        # Propagate the ensure_nltk_data result so main() reports real failures
        return success
        
    except Exception as e:
        print(f"❌ Test error: {e}")
        import traceback
        traceback.print_exc()
        return False

def show_cache_integration():
    """Show how the cache integrates with existing structure."""
    print(f"\nπŸ“Š Cache Integration Overview")
    print("=" * 40)
    
    model_cache = Path('./model_cache')
    
    print(f"Cache Structure:")
    print(f"model_cache/")
    print(f"β”œβ”€β”€ nltk_data/              # ← New NLTK data location")
    print(f"β”‚   └── corpora/")
    print(f"β”‚       β”œβ”€β”€ wordnet/")
    print(f"β”‚       β”œβ”€β”€ punkt/")
    print(f"β”‚       └── omw-1.4/")
    print(f"β”œβ”€β”€ unified_vocabulary_*.pkl")
    print(f"β”œβ”€β”€ unified_embeddings_*.npy")
    print(f"└── models--sentence-transformers/")
    
    if model_cache.exists():
        # Total size of all cached files, in MB
        actual_size = sum(
            f.stat().st_size for f in model_cache.rglob('*') if f.is_file()
        ) / (1024 * 1024)
        print(f"\nπŸ“Š Current cache size: {actual_size:.1f} MB")

        nltk_dir = model_cache / 'nltk_data'
        if nltk_dir.exists():
            nltk_size = sum(
                f.stat().st_size for f in nltk_dir.rglob('*') if f.is_file()
            ) / (1024 * 1024)
            print(f"πŸ“Š NLTK data size: {nltk_size:.1f} MB")

def main():
    """Run the custom NLTK path test."""
    print("πŸš€ Custom NLTK Path Test")
    print("=" * 60)
    print("Testing WordNet generator with model_cache/nltk_data location")
    
    success = test_custom_nltk_path()
    show_cache_integration()
    
    if success:
        print(f"\nβœ… SUCCESS!")
        print(f"πŸ“‚ NLTK data will now download to: model_cache/nltk_data/")
        print(f"🎯 This keeps all AI/NLP data centralized in model_cache")
        print(f"⚑ WordNet clue generator ready for use!")
    else:
        print(f"\n❌ Test failed - check configuration")

if __name__ == "__main__":
    main()