#!/usr/bin/env python3
"""
Test script to verify custom NLTK data path functionality.
This shows that the WordNet generator now downloads NLTK data to model_cache/nltk_data.
"""

import sys
from pathlib import Path

# Make sibling modules (e.g. wordnet_clue_generator) importable by adding
# this script's directory (hack/) to the import path
sys.path.insert(0, str(Path(__file__).parent))

def test_custom_nltk_path():
    """Test that NLTK data downloads to custom cache directory."""
    print("πŸ§ͺ Testing Custom NLTK Data Path")
    print("=" * 50)
    
    model_cache = Path('./model_cache')
    nltk_cache = model_cache / 'nltk_data'
    
    print(f"πŸ“‚ Model cache directory: {model_cache.absolute()}")
    print(f"πŸ“‚ Expected NLTK data directory: {nltk_cache.absolute()}")
    
    try:
        from wordnet_clue_generator import WordNetClueGenerator
        
        # Create generator with explicit cache directory
        generator = WordNetClueGenerator(cache_dir=str(model_cache))
        
        print(f"\nπŸ”§ Generator Configuration:")
        print(f"   Cache dir: {generator.cache_dir}")
        print(f"   NLTK data dir: {generator.nltk_data_dir}")
        
        # Check NLTK path configuration
        print(f"\nπŸ“‹ NLTK Path Test:")
        if nltk_cache.exists():
            print(f"   βœ… NLTK cache directory exists")
            contents = list(nltk_cache.iterdir())
            print(f"   Contents: {len(contents)} items")
            for item in contents[:5]:  # Show first 5 items
                print(f"     - {item.name}")
        else:
            print(f"   πŸ“ NLTK cache directory will be created on initialization")
        
        # Test ensure_nltk_data function directly
        print(f"\nπŸ” Testing ensure_nltk_data function:")
        from wordnet_clue_generator import ensure_nltk_data
        
        # This should create the directory and set up paths
        success = ensure_nltk_data(str(nltk_cache))
        print(f"   Result: {'βœ… Success' if success else '❌ Failed'}")
        
        if success:
            # Check NLTK path was added
            import nltk
            print(f"   NLTK search paths (first 3):")
            for i, path in enumerate(nltk.data.path[:3], 1):
                print(f"     {i}. {path}")
            
            # Check if directory was created
            if nltk_cache.exists():
                print(f"   βœ… NLTK data directory created")
                
                # List contents if any
                contents = list(nltk_cache.rglob('*'))
                print(f"   πŸ“ Directory contents: {len(contents)} total items")
        
        # Propagate the ensure_nltk_data result so main() reports real failures
        return success
        
    except Exception as e:
        print(f"❌ Test error: {e}")
        import traceback
        traceback.print_exc()
        return False

def show_cache_integration():
    """Show how the cache integrates with existing structure."""
    print(f"\nπŸ“Š Cache Integration Overview")
    print("=" * 40)
    
    model_cache = Path('./model_cache')
    
    print(f"Cache Structure:")
    print(f"model_cache/")
    print(f"β”œβ”€β”€ nltk_data/              # ← New NLTK data location")
    print(f"β”‚   └── corpora/")
    print(f"β”‚       β”œβ”€β”€ wordnet/")
    print(f"β”‚       β”œβ”€β”€ punkt/")
    print(f"β”‚       └── omw-1.4/")
    print(f"β”œβ”€β”€ unified_vocabulary_*.pkl")
    print(f"β”œβ”€β”€ unified_embeddings_*.npy")
    print(f"└── models--sentence-transformers/")
    
    if model_cache.exists():
        # Total size of all cached files, in MB
        actual_size = sum(
            f.stat().st_size for f in model_cache.rglob('*') if f.is_file()
        ) / (1024 * 1024)
        print(f"\nπŸ“Š Current cache size: {actual_size:.1f} MB")

        nltk_dir = model_cache / 'nltk_data'
        if nltk_dir.exists():
            nltk_size = sum(
                f.stat().st_size for f in nltk_dir.rglob('*') if f.is_file()
            ) / (1024 * 1024)
            print(f"πŸ“Š NLTK data size: {nltk_size:.1f} MB")

def main():
    """Run the custom NLTK path test."""
    print("πŸš€ Custom NLTK Path Test")
    print("=" * 60)
    print("Testing WordNet generator with model_cache/nltk_data location")
    
    success = test_custom_nltk_path()
    show_cache_integration()
    
    if success:
        print(f"\nβœ… SUCCESS!")
        print(f"πŸ“‚ NLTK data will now download to: model_cache/nltk_data/")
        print(f"🎯 This keeps all AI/NLP data centralized in model_cache")
        print(f"⚑ WordNet clue generator ready for use!")
    else:
        print(f"\n❌ Test failed - check configuration")

if __name__ == "__main__":
    main()