Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""
Debug tool for checking ingredient embeddings
Run with: python debug_embeddings.py [optional_embeddings_path]
"""
import json
import logging
import os
import pickle
import random
import sys
from pathlib import Path
# Configure logging: timestamped records at INFO level and above for the
# whole script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
# Named logger shared by every function in this script.
logger = logging.getLogger('debug_embeddings')
def check_embeddings_file(filepath):
    """Check that an embeddings file exists and holds a non-empty dictionary.

    Files whose extension is ``.pkl`` or ``.pickle`` (case-insensitive) are
    loaded with ``pickle``; anything else is parsed as UTF-8 JSON. The file
    size, the number of entries and up to three randomly sampled entries are
    reported through the module-level ``logger``.

    Args:
        filepath: Location of the embeddings file, as a ``str`` or
            ``os.PathLike`` object.

    Returns:
        True if the file loads and contains a non-empty ``dict``; False if it
        is missing, is not a regular file, cannot be parsed, or holds
        anything else. Load and validation errors are logged, not raised.
    """
    logger.info(f"Checking embeddings file: {filepath}")

    # Check if file exists
    if not os.path.exists(filepath):
        logger.error(f"ERROR: Embeddings file not found at {filepath}")
        return False
    # A directory "exists" too, but cannot be opened as an embeddings file.
    if not os.path.isfile(filepath):
        logger.error(f"ERROR: Embeddings path is not a regular file: {filepath}")
        return False

    # Check file size
    file_size = os.path.getsize(filepath) / (1024 * 1024)  # Size in MB
    logger.info(f"File size: {file_size:.2f} MB")

    # Determine file type based on extension. Going through Path lets the
    # caller pass either a string or a path object, and lower() makes
    # '.PKL' behave like '.pkl'.
    is_pickle = Path(filepath).suffix.lower() in ('.pkl', '.pickle')

    # Check if file is valid
    try:
        if is_pickle:
            # NOTE(security): pickle.load can execute arbitrary code embedded
            # in the file. Only point this tool at files you trust.
            with open(filepath, 'rb') as f:
                data = pickle.load(f)
        else:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)

        if not isinstance(data, dict):
            logger.error("ERROR: Embeddings file is not a valid dictionary")
            return False

        num_ingredients = len(data)
        logger.info(f"Number of ingredients/categories: {num_ingredients}")
        if num_ingredients == 0:
            logger.error("ERROR: Embeddings dictionary is empty")
            return False

        # Check a few random entries
        sample_keys = random.sample(list(data), min(3, num_ingredients))
        logger.info(f"Sample keys: {sample_keys}")
        for key in sample_keys:
            embedding = data[key]
            if isinstance(embedding, list):
                embedding_dim = len(embedding)
                logger.info(f"Embedding for '{key}' is a list with dimension: {embedding_dim}")
            elif hasattr(embedding, 'shape'):  # numpy array
                logger.info(f"Embedding for '{key}' is a numpy array with shape: {embedding.shape}")
            else:
                logger.info(f"Embedding for '{key}' is of type: {type(embedding)}")
        return True
    except json.JSONDecodeError:
        logger.error("ERROR: File is not valid JSON")
        return False
    except pickle.UnpicklingError:
        logger.error("ERROR: File is not a valid pickle file")
        return False
    except Exception as e:
        # Last-resort boundary: this is a diagnostic tool, so report the
        # problem instead of crashing with a traceback.
        logger.error(f"ERROR: Unexpected error checking embeddings: {e}")
        return False
def main():
    """Validate the embeddings file and log hints when it looks broken.

    The path comes from the first command-line argument if one is given,
    otherwise from the ``EMBEDDINGS_PATH`` environment variable, otherwise
    ``data/ingredient_embeddings_voyageai.pkl``. On failure, candidate
    embedding files in ``data`` and in the directory of the requested path
    are listed.

    Returns:
        0 if ``check_embeddings_file`` accepted the file, 1 otherwise.
    """
    # Get embeddings path from argument or environment or default
    if len(sys.argv) > 1:
        filepath = sys.argv[1]
    else:
        filepath = os.environ.get('EMBEDDINGS_PATH', 'data/ingredient_embeddings_voyageai.pkl')

    # Check if path exists and is valid
    if check_embeddings_file(filepath):
        logger.info("✅ Embeddings file looks valid!")
        # Suggest setting environment variable if not already set
        if 'EMBEDDINGS_PATH' not in os.environ:
            logger.info(f"TIP: Set the EMBEDDINGS_PATH environment variable to: {filepath}")
            logger.info(f" export EMBEDDINGS_PATH=\"{filepath}\"")
        return 0

    logger.error("❌ Embeddings file has issues that need to be fixed")

    # Look for specific pickle files. Stored as Path objects so they compare
    # equal to the paths yielded by data_dir.glob() below; comparing only the
    # basename against these 'data/...' entries could never match.
    target_files = {
        Path('data/ingredient_embeddings_voyageai.pkl'),
        Path('data/category_embeddings.pickle'),
    }

    # Look for embedding files in data directory
    data_dir = Path('data')
    if data_dir.is_dir():
        logger.info("Checking 'data' directory for embedding files:")
        for file in sorted(data_dir.glob('*embed*.p*')):
            logger.info(f" - {file}")
            if file in target_files:
                logger.info(f" ✓ Found target file: {file}")
                logger.info(f" Try running with: python debug_embeddings.py {file}")

    # Look for similar files that might be the correct embeddings
    dir_path = os.path.dirname(filepath) or '.'
    try:
        similar_files = sorted(Path(dir_path).glob("*embed*.p*"))
    except (OSError, ValueError) as exc:
        # Best-effort hint only: report why the search failed and carry on.
        logger.warning(f"Could not search '{dir_path}' for similar files: {exc}")
        similar_files = []
    if similar_files:
        logger.info("Found similar files that might contain embeddings:")
        for file in similar_files:
            logger.info(f" - {file}")
    return 1
if __name__ == "__main__":
    # Hand main()'s result to sys.exit so the shell sees the outcome;
    # sys.exit treats a None result as success (status 0).
    sys.exit(main())