"""
Test script to verify custom NLTK data path functionality.

This shows that the WordNet generator now downloads NLTK data to model_cache/nltk_data.
"""

import sys
from pathlib import Path
|
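# Allow importing wordnet_clue_generator from the directory this script lives in.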
sys.path.insert(0, str(Path(__file__).parent))
|
|
def test_custom_nltk_path():
    """Test that NLTK data downloads to custom cache directory."""
    print("Testing Custom NLTK Data Path")
    print("=" * 50)

    model_cache = Path('./model_cache')
    nltk_cache = model_cache / 'nltk_data'

    print(f"Model cache directory: {model_cache.absolute()}")
    print(f"Expected NLTK data directory: {nltk_cache.absolute()}")

    try:
        from wordnet_clue_generator import WordNetClueGenerator
|
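        # Point the generator at the shared model_cache directory; its NLTK data
        # location (expected to be model_cache/nltk_data) is exposed as nltk_data_dir.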
        generator = WordNetClueGenerator(cache_dir=str(model_cache))

        print("\nGenerator Configuration:")
        print(f"   Cache dir: {generator.cache_dir}")
        print(f"   NLTK data dir: {generator.nltk_data_dir}")

        print("\nNLTK Path Test:")
        if nltk_cache.exists():
            print("   NLTK cache directory exists")
            contents = list(nltk_cache.iterdir())
            print(f"   Contents: {len(contents)} items")
            for item in contents[:5]:
                print(f"      - {item.name}")
        else:
            print("   NLTK cache directory will be created on initialization")
|
        print("\nTesting ensure_nltk_data function:")
        from wordnet_clue_generator import ensure_nltk_data
|
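        # ensure_nltk_data is expected to download the required NLTK resources into
        # the given directory and return True on success, False otherwise.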
        success = ensure_nltk_data(str(nltk_cache))
        print(f"   Result: {'Success' if success else 'Failed'}")

        if success:
|
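            # nltk.data.path lists the directories NLTK searches for data; after a
            # successful download the custom cache directory should appear near the front.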
            import nltk
            print("   NLTK search paths (first 3):")
            for i, path in enumerate(nltk.data.path[:3], 1):
                print(f"      {i}. {path}")
|
            if nltk_cache.exists():
                print("   NLTK data directory created")
                contents = list(nltk_cache.rglob('*'))
                print(f"   Directory contents: {len(contents)} total items")

        return success
    except Exception as e:
        print(f"Test error: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
def show_cache_integration():
    """Show how the cache integrates with existing structure."""
    print("\nCache Integration Overview")
    print("=" * 40)

    model_cache = Path('./model_cache')
|
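    # Expected cache layout: NLTK data sits alongside the existing vocabulary,
    # embedding, and sentence-transformers artifacts.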
print(f"Cache Structure:") |
|
print(f"model_cache/") |
|
print(f"βββ nltk_data/ # β New NLTK data location") |
|
print(f"β βββ corpora/") |
|
print(f"β βββ wordnet/") |
|
print(f"β βββ punkt/") |
|
print(f"β βββ omw-1.4/") |
|
print(f"βββ unified_vocabulary_*.pkl") |
|
print(f"βββ unified_embeddings_*.npy") |
|
print(f"βββ models--sentence-transformers/") |
|
|
|
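    # Report the on-disk size of the whole cache and of the NLTK subset, in MB.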
    if model_cache.exists():
        actual_size = sum(f.stat().st_size for f in model_cache.rglob('*') if f.is_file()) / (1024 * 1024)
        print(f"\nCurrent cache size: {actual_size:.1f} MB")

        nltk_dir = model_cache / 'nltk_data'
        if nltk_dir.exists():
            nltk_size = sum(f.stat().st_size for f in nltk_dir.rglob('*') if f.is_file()) / (1024 * 1024)
            print(f"NLTK data size: {nltk_size:.1f} MB")
|
|
def main():
    """Run the custom NLTK path test."""
    print("Custom NLTK Path Test")
    print("=" * 60)
    print("Testing WordNet generator with model_cache/nltk_data location")

    success = test_custom_nltk_path()
    show_cache_integration()

    if success:
        print("\nSUCCESS!")
        print("NLTK data will now download to: model_cache/nltk_data/")
        print("This keeps all AI/NLP data centralized in model_cache")
        print("WordNet clue generator ready for use!")
    else:
        print("\nTest failed - check configuration")


if __name__ == "__main__":
    main()