#!/usr/bin/env python3
"""
Verification script to confirm test_integrated_system.py uses cached embeddings.
"""
import sys
import os
from pathlib import Path
# Add hack directory to path
sys.path.insert(0, str(Path(__file__).parent))
def verify_cache_setup():
"""Verify that cached files are available and test setup is correct."""
print("π Verifying Cached Test Setup")
print("=" * 50)
# Check cache directory
cache_dir = Path(__file__).parent / 'model_cache'
print(f"π Cache directory: {cache_dir}")
print(f" Exists: {'β
' if cache_dir.exists() else 'β'}")
if not cache_dir.exists():
print("β Cache directory not found")
return False
# Check required cached files for 50K vocabulary
required_files = [
"unified_vocabulary_50000.pkl",
"unified_frequencies_50000.pkl",
"unified_embeddings_all-mpnet-base-v2_50000.npy"
]
print("\nπ Required Cache Files (50K vocabulary):")
all_present = True
for filename in required_files:
filepath = cache_dir / filename
exists = filepath.exists()
size_mb = filepath.stat().st_size / (1024*1024) if exists else 0
status = "β
" if exists else "β"
size_str = f"({size_mb:.1f} MB)" if exists else "(missing)"
print(f" {status} {filename} {size_str}")
if not exists:
all_present = False
    # Check test file configuration
    print("\n🧪 Test Configuration:")
    try:
        from test_integrated_system import TestIntegratedCrosswordGenerator

        # Create test instance to check setup
        test_instance = TestIntegratedCrosswordGenerator()
        test_instance.setUpClass()
        test_instance.setUp()

        # Check generator configuration
        generator = test_instance.generator
        print(f"   ✅ Vocabulary limit: {generator.vocab_size_limit:,} words")
        print(f"   ✅ Cache directory: {generator.cache_dir}")

        # Verify cache directory matches
        expected_cache = str(cache_dir)
        actual_cache = generator.cache_dir
        cache_match = expected_cache == actual_cache
        print(f"   {'✅' if cache_match else '❌'} Cache path match: {cache_match}")
        if not cache_match:
            print(f"      Expected: {expected_cache}")
            print(f"      Actual: {actual_cache}")
    except Exception as e:
        print(f"   ❌ Test setup error: {e}")
        all_present = False
    # Summary
    print("\n" + "=" * 50)
    if all_present:
        print("✅ VERIFICATION SUCCESSFUL")
        print("   • All cached files are present")
        print("   • Test suite is configured to use 50K cached vocabulary")
        print("   • Embeddings cache will be loaded instead of recomputed")
        print("   • Tests should run much faster (~90s vs ~200s+ initialization)")
    else:
        print("❌ VERIFICATION FAILED")
        print("   • Missing cached files or configuration issues")
        print("   • Tests may run slower or fail")

    return all_present


def show_cache_benefits():
"""Show the benefits of using cached files."""
print("\nπ‘ Cache Benefits:")
print("-" * 30)
print("π Without Cache:")
print(" β’ Download WordFreq data: ~30s")
print(" β’ Filter 50K vocabulary: ~10s")
print(" β’ Load sentence transformer: ~90s")
print(" β’ Generate embeddings: ~120s")
print(" β’ Total: ~250s")
print()
print("β‘ With Cache:")
print(" β’ Load cached vocabulary: ~1s")
print(" β’ Load cached embeddings: ~2s")
print(" β’ Load sentence transformer: ~90s")
print(" β’ Total: ~93s")
print()
print("π Speed Improvement: ~2.7x faster initialization")
def main():
"""Main verification."""
success = verify_cache_setup()
show_cache_benefits()
if success:
print("\nπ Ready to run optimized tests!")
print(" Run: python test_integrated_system.py")
else:
print("\nβ οΈ Cache setup needs attention")
print(" Check that model_cache/ contains the required files")
return success
if __name__ == "__main__":
    main()