#!/usr/bin/env python3
"""
Debug tool for checking ingredient embeddings
Run with: python debug_embeddings.py [optional_embeddings_path]
"""

import json
import logging
import os
import pickle
import random
import sys
from pathlib import Path

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger('debug_embeddings')

# File-name endings (compared case-insensitively) that select the pickle loader.
_PICKLE_SUFFIXES = ('.pkl', '.pickle')

# Path used when neither a CLI argument nor EMBEDDINGS_PATH is supplied.
_DEFAULT_EMBEDDINGS_PATH = 'data/ingredient_embeddings_voyageai.pkl'

# Glob pattern used to find candidate embedding files in a directory.
_CANDIDATE_GLOB = '*embed*.p*'


def _load_embeddings(filepath, is_pickle):
    """Deserialize *filepath* with pickle or JSON and return the loaded object."""
    if is_pickle:
        # SECURITY: pickle.load can execute arbitrary code while unpickling.
        # Only point this tool at embedding files from a trusted source.
        with open(filepath, 'rb') as f:
            return pickle.load(f)
    with open(filepath, 'r', encoding='utf-8') as f:
        return json.load(f)


def _describe_embedding(key, embedding):
    """Log the container type and size of a single embedding entry."""
    if isinstance(embedding, list):
        logger.info(
            "Embedding for '%s' is a list with dimension: %s",
            key, len(embedding),
        )
    elif hasattr(embedding, 'shape'):  # numpy array
        logger.info(
            "Embedding for '%s' is a numpy array with shape: %s",
            key, embedding.shape,
        )
    else:
        logger.info("Embedding for '%s' is of type: %s", key, type(embedding))


def check_embeddings_file(filepath):
    """Check if embeddings file exists and is valid.

    The file is loaded with pickle when its name ends in ``.pkl`` or
    ``.pickle`` (case-insensitive) and with JSON otherwise. It is considered
    valid when it deserializes to a non-empty ``dict``. Up to three randomly
    chosen entries are logged for inspection.

    Args:
        filepath: Path to the embeddings file (``str`` or ``os.PathLike``).

    Returns:
        ``True`` if the file exists and contains a non-empty dictionary,
        ``False`` otherwise. Problems are logged rather than raised.
    """
    # Accept pathlib.Path as well as str; the suffix test below needs a str.
    filepath = os.fspath(filepath)
    logger.info("Checking embeddings file: %s", filepath)

    # Check if file exists
    if not os.path.exists(filepath):
        logger.error("ERROR: Embeddings file not found at %s", filepath)
        return False

    # Check file size
    size_mb = os.path.getsize(filepath) / (1024 * 1024)  # Size in MB
    logger.info("File size: %.2f MB", size_mb)

    # Determine file type based on extension (case-insensitive so that
    # e.g. "EMBEDDINGS.PKL" is not mistaken for JSON).
    is_pickle = filepath.lower().endswith(_PICKLE_SUFFIXES)

    # Check if file is valid
    try:
        data = _load_embeddings(filepath, is_pickle)

        if not isinstance(data, dict):
            logger.error("ERROR: Embeddings file is not a valid dictionary")
            return False

        num_ingredients = len(data)
        logger.info("Number of ingredients/categories: %s", num_ingredients)

        if not data:
            logger.error("ERROR: Embeddings dictionary is empty")
            return False

        # Check a few random entries
        sample_keys = random.sample(list(data), min(3, num_ingredients))
        logger.info("Sample keys: %s", sample_keys)
        for key in sample_keys:
            _describe_embedding(key, data[key])

        return True

    except json.JSONDecodeError:
        logger.error("ERROR: File is not valid JSON")
        return False
    except (pickle.UnpicklingError, EOFError):
        # EOFError is what pickle raises for an empty or truncated file.
        logger.error("ERROR: File is not a valid pickle file")
        return False
    except Exception as e:
        # Top-level boundary of a diagnostic tool: report, never crash.
        logger.error("ERROR: Unexpected error checking embeddings: %s", e)
        return False


def _suggest_candidates(filepath):
    """Log embedding-like files near *filepath* that the user could try instead."""
    # Look for specific pickle files
    specific_files = [
        'data/ingredient_embeddings_voyageai.pkl',
        'data/category_embeddings.pickle'
    ]
    # Path.name never contains a directory component, so compare base names;
    # comparing against the 'data/...' strings directly can never match.
    target_names = {Path(entry).name for entry in specific_files}

    # Look for embedding files in data directory
    data_dir = Path('data')
    if data_dir.exists():
        logger.info("Checking 'data' directory for embedding files:")
        for candidate in data_dir.glob(_CANDIDATE_GLOB):
            logger.info(" - %s", candidate)
            if candidate.name in target_names:
                logger.info(" ✓ Found target file: %s", candidate)
                logger.info(
                    " Try running with: python debug_embeddings.py %s",
                    candidate,
                )

    # Look for similar files that might be the correct embeddings
    dir_path = os.path.dirname(filepath) or '.'
    try:
        similar_files = list(Path(dir_path).glob(_CANDIDATE_GLOB))
    except (OSError, ValueError) as exc:
        # Best-effort hint only: an unreadable or malformed directory must
        # not turn the diagnostic into a crash, but should not vanish either.
        logger.debug("Could not search %s for similar files: %s", dir_path, exc)
        return

    if similar_files:
        logger.info("Found similar files that might contain embeddings:")
        for candidate in similar_files:
            logger.info(" - %s", candidate)


def main():
    """Validate the embeddings file chosen via CLI argument, env var or default.

    The path is taken from ``sys.argv[1]`` when given, otherwise from the
    ``EMBEDDINGS_PATH`` environment variable, otherwise from the built-in
    default. On failure, nearby files matching ``*embed*.p*`` are listed as
    suggestions.
    """
    # Get embeddings path from argument or environment or default
    if len(sys.argv) > 1:
        filepath = sys.argv[1]
    else:
        filepath = os.environ.get('EMBEDDINGS_PATH', _DEFAULT_EMBEDDINGS_PATH)

    # Check if path exists and is valid
    if check_embeddings_file(filepath):
        logger.info("✅ Embeddings file looks valid!")

        # Suggest setting environment variable if not already set
        if 'EMBEDDINGS_PATH' not in os.environ:
            logger.info(
                "TIP: Set the EMBEDDINGS_PATH environment variable to: %s",
                filepath,
            )
            logger.info(" export EMBEDDINGS_PATH=\"%s\"", filepath)
        return

    logger.error("❌ Embeddings file has issues that need to be fixed")
    _suggest_candidates(filepath)


if __name__ == "__main__":
    main()