product_ingredient_demo / debug_embeddings.py
esilver's picture
Initial commit
31ebc8b
raw
history blame
4.68 kB
#!/usr/bin/env python3
"""
Debug tool for checking ingredient embeddings
Run with: python debug_embeddings.py [optional_embeddings_path]
"""
import os
import sys
import json
import pickle
import logging
from pathlib import Path
# Set up root logging once for the whole script: INFO and above, with each
# record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Named logger used by every function in this file.
logger = logging.getLogger('debug_embeddings')
def check_embeddings_file(filepath):
    """Check that an embeddings file exists, loads, and holds a non-empty dict.

    Files whose name ends in ``.pkl`` or ``.pickle`` (any letter case) are
    loaded with ``pickle``; every other file is parsed as UTF-8 JSON. The file
    size, the number of top-level entries, and the type/dimension of up to
    three randomly chosen entries are logged at INFO level. Every failure is
    logged at ERROR level.

    Args:
        filepath: Location of the embeddings file, as a ``str`` or any
            ``os.PathLike`` object (for example ``pathlib.Path``).

    Returns:
        ``True`` if the file exists, loads without error, and its top-level
        object is a non-empty ``dict``; ``False`` otherwise. Errors raised
        while loading or inspecting the data are caught and logged rather
        than propagated.
    """
    import random

    # Normalise first so Path objects work with the string-based checks below.
    filepath = os.fspath(filepath)
    logger.info(f"Checking embeddings file: {filepath}")

    # Check if file exists
    if not os.path.exists(filepath):
        logger.error(f"ERROR: Embeddings file not found at {filepath}")
        return False

    # Check file size
    file_size = os.path.getsize(filepath) / (1024 * 1024)  # Size in MB
    logger.info(f"File size: {file_size:.2f} MB")

    # Determine file type based on extension; ignore case so that e.g.
    # "EMBEDDINGS.PKL" is not mistakenly parsed as JSON.
    is_pickle = filepath.lower().endswith(('.pkl', '.pickle'))

    # Check if file is valid
    try:
        if is_pickle:
            # NOTE(security): unpickling can execute arbitrary code. Only run
            # this tool on embeddings files that come from a trusted source.
            with open(filepath, 'rb') as f:
                data = pickle.load(f)
        else:
            # JSON is UTF-8 by specification; do not rely on the platform's
            # default text encoding.
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

        if not isinstance(data, dict):
            logger.error("ERROR: Embeddings file is not a valid dictionary")
            return False

        num_ingredients = len(data)
        logger.info(f"Number of ingredients/categories: {num_ingredients}")
        if num_ingredients == 0:
            logger.error("ERROR: Embeddings dictionary is empty")
            return False

        # Check a few random entries
        sample_keys = random.sample(list(data), min(3, num_ingredients))
        logger.info(f"Sample keys: {sample_keys}")
        for key in sample_keys:
            embedding = data[key]
            if isinstance(embedding, list):
                embedding_dim = len(embedding)
                logger.info(f"Embedding for '{key}' is a list with dimension: {embedding_dim}")
            elif hasattr(embedding, 'shape'):  # numpy array
                logger.info(f"Embedding for '{key}' is a numpy array with shape: {embedding.shape}")
            else:
                logger.info(f"Embedding for '{key}' is of type: {type(embedding)}")
        return True
    except json.JSONDecodeError:
        logger.error("ERROR: File is not valid JSON")
        return False
    except pickle.UnpicklingError:
        logger.error("ERROR: File is not a valid pickle file")
        return False
    except Exception as e:
        # Catch-all boundary: this is a diagnostic tool, so any other load or
        # inspection failure is reported instead of crashing.
        logger.error(f"ERROR: Unexpected error checking embeddings: {str(e)}")
        return False
def main():
    """Validate the embeddings file and, on failure, list likely alternatives.

    The path is taken from the first command-line argument if one is given,
    otherwise from the ``EMBEDDINGS_PATH`` environment variable, otherwise it
    defaults to ``data/ingredient_embeddings_voyageai.pkl``.

    When the file passes ``check_embeddings_file`` a success message is logged
    (plus a tip to export ``EMBEDDINGS_PATH`` if it is unset). When it fails,
    the ``data`` directory and the directory of the requested path are scanned
    for files matching ``*embed*.p*`` and the matches are logged as hints.
    All output goes through the module logger; nothing is returned.
    """
    # Get embeddings path from argument or environment or default
    if len(sys.argv) > 1:
        filepath = sys.argv[1]
    else:
        filepath = os.environ.get('EMBEDDINGS_PATH', 'data/ingredient_embeddings_voyageai.pkl')

    # Check if path exists and is valid
    if check_embeddings_file(filepath):
        logger.info("✅ Embeddings file looks valid!")
        # Suggest setting environment variable if not already set
        if 'EMBEDDINGS_PATH' not in os.environ:
            logger.info(f"TIP: Set the EMBEDDINGS_PATH environment variable to: {filepath}")
            logger.info(f" export EMBEDDINGS_PATH=\"{filepath}\"")
        return

    logger.error("❌ Embeddings file has issues that need to be fixed")

    # Well-known embedding files, written relative to the working directory.
    specific_files = (
        'data/ingredient_embeddings_voyageai.pkl',
        'data/category_embeddings.pickle',
    )

    # Look for embedding files in data directory
    data_dir = Path('data')
    if data_dir.exists():
        logger.info("Checking 'data' directory for embedding files:")
        for candidate in data_dir.glob('*embed*.p*'):
            logger.info(f" - {candidate}")
            # Compare the relative path, not just the bare file name: the
            # entries in specific_files include the 'data/' prefix, so a
            # name-only comparison could never match. as_posix() keeps the
            # comparison separator-independent.
            if candidate.as_posix() in specific_files:
                logger.info(f" ✓ Found target file: {candidate}")
                logger.info(f" Try running with: python debug_embeddings.py {candidate}")

    # Look for similar files that might be the correct embeddings
    dir_path = os.path.dirname(filepath) or '.'
    try:
        similar_files = list(Path(dir_path).glob("*embed*.p*"))
    except Exception as exc:
        # Best-effort hint only: report why the scan failed rather than
        # hiding it, but never let it crash the tool.
        logger.warning(f"Could not scan '{dir_path}' for similar files: {exc}")
        return

    if similar_files:
        logger.info("Found similar files that might contain embeddings:")
        for candidate in similar_files:
            logger.info(f" - {candidate}")
# Script entry point: run the check only when this file is executed directly,
# not when it is imported as a module.
if "__main__" == __name__:
    main()