product_ingredient_demo / similarity.py
esilver's picture
Initial commit
31ebc8b
raw
history blame
11.2 kB
import numpy as np
from typing import Dict, List, Tuple, Any
import json
import voyageai
from openai import OpenAI
from api_utils import get_openai_client
def _is_empty_vector(vector):
    """Return True when *vector* is a list, tuple or array holding no elements."""
    if isinstance(vector, np.ndarray):
        return vector.size == 0
    if isinstance(vector, (list, tuple)):
        return len(vector) == 0
    return False


def _collect_embeddings(embeddings_by_name):
    """Split a name -> embedding mapping into parallel lists of names and vectors.

    Each value may be a bare vector (list or ndarray) or a dictionary carrying
    the vector under the "embedding" key. Entries that are None, have no
    "embedding" key, or hold an empty vector are skipped.
    """
    names = []
    vectors = []
    for name, data in embeddings_by_name.items():
        if isinstance(data, dict):
            vector = data.get("embedding")
        elif isinstance(data, (list, np.ndarray)):
            vector = data
        else:
            vector = None
        # An empty vector would make the stacked matrix ragged and abort the
        # whole computation, so it is dropped just like a missing one.
        if vector is None or _is_empty_vector(vector):
            continue
        names.append(name)
        vectors.append(vector)
    return names, vectors


def _normalized_matrix(vectors, label):
    """Stack *vectors* into a float32 matrix whose rows have unit length.

    *label* ("Ingredient" or "Product") is only used in the warning message.
    """
    matrix = np.array(vectors, dtype=np.float32)
    if matrix.ndim == 1:
        # A 1-D result means every embedding was a scalar. Keep one row per
        # name (N x 1) so rows stay aligned with the name list.
        print(f"Warning: {label} embeddings have only 1 dimension, reshaping. Shape: {matrix.shape}")
        matrix = matrix.reshape(-1, 1)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Avoid division by zero: all-zero vectors stay all-zero after scaling.
    norms = np.where(norms == 0, 1e-10, norms)
    return matrix / norms


def compute_similarities(ingredients_dict, products_dict):
    """
    Compute cosine similarities between ingredient embeddings and product embeddings

    Args:
        ingredients_dict: Dictionary of ingredient names to embeddings. Each
            value is either a vector (list / ndarray) or a dictionary with the
            vector under the "embedding" key.
        products_dict: Dictionary of product names to embeddings, in the same
            two accepted formats.

    Returns:
        Dictionary mapping each product name to a list of
        (ingredient_name, similarity) tuples sorted by similarity, highest
        first. An empty dictionary is returned (after printing a warning) when
        either input is empty, when no usable embeddings are found, or when
        product and ingredient embeddings have different dimensions.

    Raises:
        ValueError: If the embeddings inside one of the dictionaries do not all
            have the same length (raised by NumPy while stacking them).
    """
    # Validate inputs
    if not ingredients_dict:
        print("Warning: ingredients_dict is empty")
        return {}
    if not products_dict:
        print("Warning: products_dict is empty")
        return {}

    # Process ingredients
    ingredient_names, ingredient_vectors = _collect_embeddings(ingredients_dict)
    if not ingredient_names:
        print("Warning: No valid ingredient embeddings found")
        return {}
    normalized_ingredients = _normalized_matrix(ingredient_vectors, "Ingredient")

    # Process products
    product_names, product_vectors = _collect_embeddings(products_dict)
    if not product_names:
        print("Warning: No valid product embeddings found")
        return {}
    normalized_products = _normalized_matrix(product_vectors, "Product")

    # Both matrices are at least 2-D here, so axis 1 is the embedding dimension.
    product_dim = normalized_products.shape[1]
    ingredient_dim = normalized_ingredients.shape[1]
    if product_dim != ingredient_dim:
        print(f"Warning: Dimension mismatch between product embeddings ({product_dim}) and ingredient embeddings ({ingredient_dim})")
        # Return empty results if dimensions don't match
        return {}

    # Rows are unit length, so the dot product is the cosine similarity.
    similarity_matrix = np.dot(normalized_products, normalized_ingredients.T)

    results = {}
    for product_name, scores in zip(product_names, similarity_matrix.tolist()):
        # sorted() is stable, so ingredients with equal scores keep input order.
        results[product_name] = sorted(
            zip(ingredient_names, scores),
            key=lambda pair: pair[1],
            reverse=True,
        )
    return results
def hybrid_ingredient_matching(products: List[str], ingredients_dict: Dict[str, Any],
                               embedding_top_n: int = 20, final_top_n: int = 5,
                               confidence_threshold: float = 0.5,
                               progress=None) -> Dict[str, List[Tuple]]:
    """
    Two-stage matching: first use embeddings to find candidate ingredients, then apply re-ranking

    Args:
        products: List of product names to categorize
        ingredients_dict: Dictionary of ingredient names to embeddings
        embedding_top_n: Number of top ingredients to retrieve using embeddings
        final_top_n: Currently unused. The re-ranker selects a single best
            ingredient and the failure fallback keeps only the top embedding
            candidate. Kept so existing callers keep working.
        confidence_threshold: Minimum relevance score the re-ranked match must
            reach to be kept
        progress: Optional progress tracking object

    Returns:
        Dictionary mapping every product to a list holding at most one
        (ingredient, score) tuple. The list is empty when the product has no
        embedding candidates or the re-ranked match scores below
        confidence_threshold. If re-ranking raises, the top embedding
        candidate (with its similarity score) is returned instead.
    """
    # Imported here rather than at module level, as in the original code.
    from utils import SafeProgress
    from embeddings import create_product_embeddings

    progress_tracker = SafeProgress(progress, desc="Hybrid ingredient matching")
    progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")

    # Stage 1: Use embeddings to find candidate ingredients
    product_embeddings = create_product_embeddings(products, progress=progress_tracker)
    similarities = compute_similarities(ingredients_dict, product_embeddings)

    # Keep only the top N candidates per product
    embedding_results = {
        product: ranked[:embedding_top_n]
        for product, ranked in similarities.items()
    }

    progress_tracker(0.4, desc="Stage 2: Re-ranking candidates")

    # Initialize OpenAI client using the centralized function
    openai_client = get_openai_client()

    # The request format and system message are identical for every product,
    # so they are built once outside the loop.
    response_format = {
        "format": {
            "type": "json_schema",
            "name": "ingredient_selection",
            "schema": {
                "type": "object",
                "properties": {
                    "best_match": {
                        "type": "object",
                        "properties": {
                            "ingredient": {
                                "type": "string",
                                "description": "The name of the best matching ingredient"
                            },
                            "explanation": {
                                "type": "string",
                                "description": "Brief explanation for the matching"
                            },
                            "relevance_score": {
                                "type": "number",
                                "description": "Score between 0 and 1 indicating relevance"
                            }
                        },
                        "required": ["ingredient", "relevance_score", "explanation"],
                        "additionalProperties": False
                    }
                },
                "required": ["best_match"],
                "additionalProperties": False
            },
            "strict": True
        }
    }
    system_message = {
        "role": "system",
        "content": "You are a food ingredient matching expert. Select the single best ingredient that matches the given product."
    }

    # Stage 2: Re-rank the candidates for each product
    final_results = {}
    total_products = len(products)
    for i, product in enumerate(products):
        progress_tracker((0.4 + 0.5 * i / total_products), desc=f"Re-ranking: {product}")

        # Products without any embedding candidates get an empty result.
        candidates = embedding_results.get(product)
        if not candidates:
            final_results[product] = []
            continue

        # Extract just the ingredient names for re-ranking
        candidate_ingredients = [name for name, _ in candidates]

        try:
            # Apply re-ranking using OpenAI's structured output
            response = openai_client.responses.create(
                model="o3-mini",
                input=[
                    system_message,
                    {"role": "user", "content": f"Product: {product}\n\nPotential ingredients: {', '.join(candidate_ingredients)}"}
                ],
                text=response_format
            )

            # Parse the response
            best_match = json.loads(response.output_text)["best_match"]

            # Only include the result if it meets the confidence threshold
            if best_match["relevance_score"] >= confidence_threshold:
                final_results[product] = [(best_match["ingredient"], best_match["relevance_score"])]
            else:
                final_results[product] = []
        except Exception as e:
            # Deliberate best-effort: any API or parsing failure falls back to
            # the top embedding candidate instead of failing the whole batch.
            print(f"Error during OpenAI re-ranking for '{product}': {e}")
            final_results[product] = candidates[:1]

    progress_tracker(1.0, desc="Hybrid ingredient matching complete")
    return final_results