# product_ingredient_demo / similarity.py
# Last commit 1e737a1 (esilver): "Added expansion for embeddings" -- file size 11.6 kB
import numpy as np
from typing import Dict, List, Tuple, Any
import json
import voyageai
from openai import OpenAI
from api_utils import get_openai_client
def _collect_embeddings(embeddings_dict):
    """
    Split a name -> embedding mapping into parallel lists of names and vectors.

    Each value may be a raw vector (list or numpy array) or a dictionary that
    stores the vector under the "embedding" key. Entries that are None, lack
    an "embedding" key, have an unsupported type, or carry an empty vector
    are skipped.

    Args:
        embeddings_dict: Dictionary of names to embeddings (either format)
    Returns:
        Tuple (names, vectors) of equal length, in dictionary order
    """
    names = []
    vectors = []
    for name, data in embeddings_dict.items():
        if isinstance(data, dict):
            vector = data.get("embedding")
        elif isinstance(data, (list, np.ndarray)):
            vector = data
        else:
            continue
        # An empty vector would make the stacked array ragged (np.array raises)
        # or zero-width, so drop it regardless of which format carried it.
        if vector is None or (isinstance(vector, (list, np.ndarray)) and len(vector) == 0):
            continue
        names.append(name)
        vectors.append(vector)
    return names, vectors


def _stack_embeddings(vectors, label):
    """
    Stack a non-empty list of vectors into a float32 array with one row per vector.

    Args:
        vectors: Non-empty list of embedding vectors
        label: "ingredient" or "product"; only used in the warning message
    Returns:
        numpy array whose first axis has len(vectors) entries
    """
    matrix = np.array(vectors, dtype=np.float32)
    if matrix.ndim == 1:
        print(f"Warning: {label.capitalize()} embeddings have only 1 dimension, reshaping. Shape: {matrix.shape}")
        # Keep one row per input vector so row i still belongs to name i.
        matrix = matrix.reshape(len(vectors), -1)
    return matrix


def _normalize_rows(matrix):
    """Scale every row to unit length; zero-norm rows are divided by 1e-10 instead of 0."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Avoid division by zero
    norms = np.where(norms == 0, 1e-10, norms)
    return matrix / norms


def compute_similarities(ingredients_dict, products_dict):
    """
    Compute cosine similarities between ingredient embeddings and product embeddings

    Both dictionaries accept values that are either raw vectors (list / numpy
    array) or dictionaries holding the vector under the "embedding" key.
    Entries without a usable, non-empty vector are ignored on both sides.

    Args:
        ingredients_dict: Dictionary of ingredient names to embeddings
        products_dict: Dictionary of product names to embedding dictionaries
    Returns:
        Dictionary mapping each product name to a list of
        (ingredient_name, score) tuples sorted by score, highest first.
        An empty dictionary is returned (after printing a warning) when either
        input is empty, when no valid embeddings remain on one side, or when
        product and ingredient embeddings have different dimensions.
    Raises:
        ValueError: raised by np.array when the vectors inside one dictionary
            do not all have the same length.
    """
    # Validate inputs
    if not ingredients_dict:
        print("Warning: ingredients_dict is empty")
        return {}
    if not products_dict:
        print("Warning: products_dict is empty")
        return {}

    ingredient_names, ingredient_vectors = _collect_embeddings(ingredients_dict)
    if not ingredient_names:
        print("Warning: No valid ingredient embeddings found")
        return {}

    product_names, product_vectors = _collect_embeddings(products_dict)
    if not product_names:
        print("Warning: No valid product embeddings found")
        return {}

    ingredient_embeddings = _stack_embeddings(ingredient_vectors, "ingredient")
    product_embeddings = _stack_embeddings(product_vectors, "product")

    # Check and handle embedding dimension mismatch
    product_dim = product_embeddings.shape[1]
    ingredient_dim = ingredient_embeddings.shape[1]
    if product_dim != ingredient_dim:
        print(f"Warning: Dimension mismatch between product embeddings ({product_dim}) and ingredient embeddings ({ingredient_dim})")
        # Return empty results if dimensions don't match
        return {}

    # Cosine similarity is the dot product of the unit-length rows
    similarity_matrix = np.dot(
        _normalize_rows(product_embeddings),
        _normalize_rows(ingredient_embeddings).T,
    )

    # Create result dictionary
    results = {}
    for product_name, scores in zip(product_names, similarity_matrix):
        # tolist() yields plain Python floats; sorted() is stable, so equal
        # scores keep the ingredient dictionary order.
        results[product_name] = sorted(
            zip(ingredient_names, scores.tolist()),
            key=lambda pair: pair[1],
            reverse=True,
        )
    return results
def _ingredient_selection_format():
    """Return a fresh structured-output spec requesting a single "best_match" object."""
    best_match_schema = {
        "type": "object",
        "properties": {
            "ingredient": {
                "type": "string",
                "description": "The name of the best matching ingredient"
            },
            "explanation": {
                "type": "string",
                "description": "Brief explanation for the matching"
            },
            "relevance_score": {
                "type": "number",
                "description": "Score between 0 and 1 indicating relevance"
            }
        },
        "required": ["ingredient", "relevance_score", "explanation"],
        "additionalProperties": False
    }
    return {
        "format": {
            "type": "json_schema",
            "name": "ingredient_selection",
            "schema": {
                "type": "object",
                "properties": {"best_match": best_match_schema},
                "required": ["best_match"],
                "additionalProperties": False
            },
            "strict": True
        }
    }


def hybrid_ingredient_matching(products: List[str], ingredients_dict: Dict[str, Any],
                               embedding_top_n: int = 20, final_top_n: int = 5,
                               confidence_threshold: float = 0.5,
                               expanded_descriptions: Dict[str, str] = None,
                               progress=None) -> Dict[str, List[Tuple]]:
    """
    Match products to ingredients in two stages: embedding shortlist, then model re-ranking

    Stage 1 embeds the products, scores them against the ingredient embeddings
    and keeps the first `embedding_top_n` ranked ingredients per product.
    Stage 2 sends each shortlist to the OpenAI client and asks for the single
    best ingredient together with a relevance score.

    Args:
        products: List of product names to categorize
        ingredients_dict: Dictionary of ingredient names to embeddings
        embedding_top_n: How many ranked ingredients to keep per product in stage 1
        final_top_n: Accepted for interface compatibility; not referenced in this function
        confidence_threshold: Minimum relevance score for the re-ranked match to be kept
        expanded_descriptions: Optional dict mapping products to their expanded descriptions;
            when a product is present, its description replaces the name in the prompt
        progress: Optional progress tracking object
    Returns:
        Dictionary mapping each product to a list holding at most one
        (ingredient, score) pair. The list is empty when the product has no
        candidates or the re-ranked score is below the threshold. If re-ranking
        raises, the first embedding candidate is used instead.
    """
    from utils import SafeProgress
    from embeddings import create_product_embeddings

    report = SafeProgress(progress, desc="Hybrid ingredient matching")
    report(0.1, desc="Stage 1: Finding candidates with embeddings")

    # --- Stage 1: shortlist ingredients by embedding similarity ---
    embedded_products = create_product_embeddings(products, progress=report)
    ranked_by_product = compute_similarities(ingredients_dict, embedded_products)
    shortlists = {
        name: ranked[:embedding_top_n]
        for name, ranked in ranked_by_product.items()
    }

    report(0.4, desc="Stage 2: Re-ranking candidates")

    # Client comes from the centralized factory
    client = get_openai_client()

    # --- Stage 2: let the model choose the best candidate per product ---
    matches = {}
    for index, product in enumerate(products):
        report((0.4 + 0.5 * index / len(products)), desc=f"Re-ranking: {product}")

        # A product that is missing from stage 1 or has an empty shortlist gets no match
        shortlist = shortlists.get(product)
        if not shortlist:
            matches[product] = []
            continue

        # Only the names are shown to the model
        names_only = [entry[0] for entry in shortlist]

        try:
            # Prefer the expanded description when one was supplied
            if expanded_descriptions and product in expanded_descriptions:
                described = expanded_descriptions[product]
            else:
                described = product

            system_message = {
                "role": "system",
                "content": "You are a food ingredient matching expert. Select the single best ingredient that matches the given product.",
            }
            user_message = {
                "role": "user",
                "content": f"Product: {described}\n\nPotential ingredients: {', '.join(names_only)}",
            }
            reply = client.responses.create(
                model="gpt-4o-mini",
                input=[system_message, user_message],
                text=_ingredient_selection_format(),
            )

            # Decode the structured answer
            chosen = json.loads(reply.output_text)["best_match"]
            score = chosen["relevance_score"]

            # Keep the pick only when it reaches the confidence threshold
            matches[product] = [(chosen["ingredient"], score)] if score >= confidence_threshold else []
        except Exception as err:
            print(f"Error during OpenAI re-ranking for '{product}': {err}")
            # Re-ranking failed: fall back to the top embedding candidate
            matches[product] = shortlist[:1]

    report(1.0, desc="Hybrid ingredient matching complete")
    return matches