#!/usr/bin/env python3
"""
Test script to verify custom NLTK data path functionality.
This shows that the WordNet generator now downloads NLTK data to model_cache/nltk_data.
"""
import sys
from pathlib import Path
# Add hack directory to path
sys.path.insert(0, str(Path(__file__).parent))
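
# For context: a minimal sketch of the path setup this script exercises.
# The real implementation lives in wordnet_clue_generator; the body below is
# an assumption inferred from this test, not the actual code.
#
#     import nltk
#     nltk_dir = Path('./model_cache/nltk_data')
#     nltk_dir.mkdir(parents=True, exist_ok=True)
#     if str(nltk_dir) not in nltk.data.path:
#         nltk.data.path.insert(0, str(nltk_dir))  # search the custom dir first
#     for pkg in ('wordnet', 'punkt', 'omw-1.4'):
#         nltk.download(pkg, download_dir=str(nltk_dir), quiet=True)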


def test_custom_nltk_path():
    """Test that NLTK data downloads to the custom cache directory."""
    print("🧪 Testing Custom NLTK Data Path")
    print("=" * 50)

    model_cache = Path('./model_cache')
    nltk_cache = model_cache / 'nltk_data'

    print(f"📁 Model cache directory: {model_cache.absolute()}")
    print(f"📁 Expected NLTK data directory: {nltk_cache.absolute()}")
    try:
        from wordnet_clue_generator import WordNetClueGenerator

        # Create generator with explicit cache directory
        generator = WordNetClueGenerator(cache_dir=str(model_cache))

        print(f"\n🔧 Generator Configuration:")
        print(f"   Cache dir: {generator.cache_dir}")
        print(f"   NLTK data dir: {generator.nltk_data_dir}")

        # Check NLTK path configuration
        print(f"\n🔍 NLTK Path Test:")
        if nltk_cache.exists():
            print(f"   ✅ NLTK cache directory exists")
            contents = list(nltk_cache.iterdir())
            print(f"   Contents: {len(contents)} items")
            for item in contents[:5]:  # Show first 5 items
                print(f"      - {item.name}")
        else:
            print(f"   📁 NLTK cache directory will be created on initialization")
        # Test ensure_nltk_data function directly
        print(f"\n🔍 Testing ensure_nltk_data function:")
        from wordnet_clue_generator import ensure_nltk_data

        # This should create the directory and set up paths
        success = ensure_nltk_data(str(nltk_cache))
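        # (Assumption based on usage here: ensure_nltk_data creates the
        # directory, registers it on nltk.data.path, downloads any missing
        # corpora, and returns True on success / False on failure.)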
print(f" Result: {'β
Success' if success else 'β Failed'}")
if success:
# Check NLTK path was added
import nltk
print(f" NLTK search paths (first 3):")
for i, path in enumerate(nltk.data.path[:3], 1):
print(f" {i}. {path}")
# Check if directory was created
if nltk_cache.exists():
print(f" β
NLTK data directory created")
# List contents if any
contents = list(nltk_cache.rglob('*'))
print(f" π Directory contents: {len(contents)} total items")
return True
except Exception as e:
print(f"β Test error: {e}")
import traceback
traceback.print_exc()
return False


def show_cache_integration():
    """Show how the cache integrates with the existing structure."""
    print(f"\n📊 Cache Integration Overview")
    print("=" * 40)

    model_cache = Path('./model_cache')

    print(f"Cache Structure:")
    print(f"model_cache/")
    print(f"├── nltk_data/                      # ← New NLTK data location")
    print(f"│   └── corpora/")
    print(f"│       ├── wordnet/")
    print(f"│       ├── punkt/")
    print(f"│       └── omw-1.4/")
    print(f"├── unified_vocabulary_*.pkl")
    print(f"├── unified_embeddings_*.npy")
    print(f"└── models--sentence-transformers/")
    if model_cache.exists():
        actual_size = sum(f.stat().st_size for f in model_cache.rglob('*') if f.is_file()) / (1024 * 1024)
        print(f"\n📊 Current cache size: {actual_size:.1f} MB")

        nltk_dir = model_cache / 'nltk_data'
        if nltk_dir.exists():
            nltk_size = sum(f.stat().st_size for f in nltk_dir.rglob('*') if f.is_file()) / (1024 * 1024)
            print(f"📊 NLTK data size: {nltk_size:.1f} MB")


def main():
    """Run the custom NLTK path test."""
    print("🚀 Custom NLTK Path Test")
    print("=" * 60)
    print("Testing WordNet generator with model_cache/nltk_data location")

    success = test_custom_nltk_path()
    show_cache_integration()

    if success:
        print(f"\n✅ SUCCESS!")
        print(f"📁 NLTK data will now download to: model_cache/nltk_data/")
        print(f"🎯 This keeps all AI/NLP data centralized in model_cache")
        print(f"⚡ WordNet clue generator ready for use!")
    else:
        print(f"\n❌ Test failed - check configuration")


if __name__ == "__main__":
    main()