"""Match products to ingredients with embedding similarity and LLM re-ranking."""

import json
from collections import Counter
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
import voyageai
from openai import OpenAI

from api_utils import get_openai_client

# Substitute for a zero norm so all-zero vectors can be "normalized" without
# dividing by zero (they stay all-zero and score 0 against everything).
_ZERO_NORM_EPSILON = 1e-10

# Model used by the re-ranking stage of hybrid_ingredient_matching.
_RERANK_MODEL = "gpt-4o-mini"

_RERANK_SYSTEM_PROMPT = (
    "You are a food ingredient matching expert. "
    "Select the single best ingredient that matches the given product."
)

# Structured-output specification passed as ``text=`` to the Responses API so
# the reply is a JSON object of the form
# {"best_match": {"ingredient", "explanation", "relevance_score"}}.
_INGREDIENT_SELECTION_FORMAT = {
    "format": {
        "type": "json_schema",
        "name": "ingredient_selection",
        "schema": {
            "type": "object",
            "properties": {
                "best_match": {
                    "type": "object",
                    "properties": {
                        "ingredient": {
                            "type": "string",
                            "description": "The name of the best matching ingredient",
                        },
                        "explanation": {
                            "type": "string",
                            "description": "Brief explanation for the matching",
                        },
                        "relevance_score": {
                            "type": "number",
                            "description": "Score between 0 and 1 indicating relevance",
                        },
                    },
                    "required": ["ingredient", "relevance_score", "explanation"],
                    "additionalProperties": False,
                }
            },
            "required": ["best_match"],
            "additionalProperties": False,
        },
        "strict": True,
    }
}


def _collect_embeddings(items, label):
    """Extract the usable embedding vectors from a name -> embedding mapping.

    Each value may be a vector (list or numpy array) or a dict holding the
    vector under the "embedding" key. Entries that are None, empty,
    non-numeric, or not one-dimensional are skipped. If the remaining vectors
    do not all share one length, only those with the most common length are
    kept so they can be stacked into a matrix.

    Args:
        items: Mapping of names to embedding data.
        label: Word used in warning messages ("ingredient" or "product").

    Returns:
        Tuple ``(names, matrix)`` where ``matrix`` is a 2-D float32 array whose
        i-th row belongs to ``names[i]``, or ``([], None)`` when no usable
        embedding was found.
    """
    names = []
    vectors = []
    for name, data in items.items():
        if isinstance(data, dict):
            raw = data.get("embedding")
        elif isinstance(data, (list, np.ndarray)):
            raw = data
        else:
            continue
        if raw is None:
            continue

        try:
            # atleast_1d turns a bare scalar into a 1-element vector, so every
            # accepted entry contributes exactly one row.
            vector = np.atleast_1d(np.asarray(raw, dtype=np.float32))
        except (TypeError, ValueError):
            print(f"Warning: Skipping {label} '{name}': embedding is not a numeric vector")
            continue
        if vector.ndim != 1 or vector.size == 0:
            continue

        names.append(name)
        vectors.append(vector)

    if not vectors:
        return [], None

    dim_counts = Counter(vector.shape[0] for vector in vectors)
    if len(dim_counts) > 1:
        expected_dim = dim_counts.most_common(1)[0][0]
        kept = [
            (name, vector)
            for name, vector in zip(names, vectors)
            if vector.shape[0] == expected_dim
        ]
        print(
            f"Warning: Skipping {len(vectors) - len(kept)} {label} embedding(s) "
            f"whose dimension differs from {expected_dim}"
        )
        names = [name for name, _ in kept]
        vectors = [vector for _, vector in kept]

    return names, np.stack(vectors)


def _normalize_rows(matrix):
    """Scale each row of a 2-D array to unit length, leaving zero rows at zero."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms = np.where(norms == 0, _ZERO_NORM_EPSILON, norms)
    return matrix / norms


def compute_similarities(ingredients_dict, products_dict):
    """
    Compute cosine similarities between ingredient embeddings and product embeddings

    Values in both dictionaries may be raw vectors (list or numpy array) or
    dicts carrying the vector under the "embedding" key; unusable entries are
    skipped.

    Args:
        ingredients_dict: Dictionary of ingredient names to embeddings
        products_dict: Dictionary of product names to embedding dictionaries

    Returns:
        Dictionary mapping each product name to a list of
        ``(ingredient_name, similarity)`` tuples sorted by similarity,
        highest first. An empty dictionary is returned (after printing a
        warning) when either input is empty, has no usable embeddings, or the
        product and ingredient embeddings have different dimensions.
    """
    if not ingredients_dict:
        print("Warning: ingredients_dict is empty")
        return {}
    if not products_dict:
        print("Warning: products_dict is empty")
        return {}

    ingredient_names, ingredient_embeddings = _collect_embeddings(ingredients_dict, "ingredient")
    if ingredient_embeddings is None:
        print("Warning: No valid ingredient embeddings found")
        return {}

    product_names, product_embeddings = _collect_embeddings(products_dict, "product")
    if product_embeddings is None:
        print("Warning: No valid product embeddings found")
        return {}

    product_dim = product_embeddings.shape[1]
    ingredient_dim = ingredient_embeddings.shape[1]
    if product_dim != ingredient_dim:
        print(f"Warning: Dimension mismatch between product embeddings ({product_dim}) and ingredient embeddings ({ingredient_dim})")
        return {}

    # With unit-length rows the dot product is the cosine similarity;
    # entry [i, j] compares product i with ingredient j.
    similarity_matrix = np.dot(
        _normalize_rows(product_embeddings),
        _normalize_rows(ingredient_embeddings).T,
    )

    results = {}
    for product_name, row in zip(product_names, similarity_matrix):
        scored = list(zip(ingredient_names, row.tolist()))
        # Stable sort: ingredients with equal scores keep their input order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        results[product_name] = scored
    return results


def _select_best_ingredient(client, product_text, candidate_ingredients):
    """Ask the re-ranking model for the best candidate and return its parsed "best_match" object."""
    response = client.responses.create(
        model=_RERANK_MODEL,
        input=[
            {"role": "system", "content": _RERANK_SYSTEM_PROMPT},
            {"role": "user", "content": f"Product: {product_text}\n\nPotential ingredients: {', '.join(candidate_ingredients)}"},
        ],
        text=_INGREDIENT_SELECTION_FORMAT,
    )
    return json.loads(response.output_text)["best_match"]


def _resolve_candidate(ingredient, candidate_ingredients):
    """Return the candidate name matching ``ingredient`` (ignoring case and outer whitespace), or None."""
    if ingredient in candidate_ingredients:
        return ingredient
    wanted = ingredient.strip().casefold()
    for candidate in candidate_ingredients:
        if candidate.strip().casefold() == wanted:
            return candidate
    return None


def hybrid_ingredient_matching(products: List[str], ingredients_dict: Dict[str, Any],
                               embedding_top_n: int = 20, final_top_n: int = 5,
                               confidence_threshold: float = 0.5,
                               expanded_descriptions: Optional[Dict[str, str]] = None,
                               progress: Any = None) -> Dict[str, List[Tuple[str, float]]]:
    """
    Two-stage matching: first use embeddings to find candidate ingredients, then apply re-ranking

    Args:
        products: List of product names to categorize
        ingredients_dict: Dictionary of ingredient names to embeddings
        embedding_top_n: Number of top ingredients to retrieve using embeddings
        final_top_n: Currently unused. The re-ranking stage selects a single
            best match, so at most one ingredient is returned per product; the
            parameter is kept so existing callers keep working.
        confidence_threshold: Minimum relevance score the re-ranker's choice
            must reach to be returned
        expanded_descriptions: Optional dict mapping products to their expanded
            descriptions; when a product has one it is sent to the re-ranker
            instead of the bare product name
        progress: Optional progress tracking object (handed to SafeProgress)

    Returns:
        Dictionary mapping every entry of ``products`` to a list holding zero
        or one ``(ingredient, score)`` tuple. The list is empty when the
        product has no embedding candidates or the re-ranker's score is below
        ``confidence_threshold``. When re-ranking raises, or the re-ranker
        names an ingredient that is not among the candidates, the top
        embedding candidate is returned instead and its score is the cosine
        similarity rather than a relevance score.
    """
    from utils import SafeProgress
    from embeddings import create_product_embeddings

    progress_tracker = SafeProgress(progress, desc="Hybrid ingredient matching")
    progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")

    # Stage 1: shortlist ingredients per product by embedding similarity.
    # NOTE(review): create_product_embeddings is assumed to return a mapping of
    # product name -> embedding data accepted by compute_similarities - confirm.
    product_embeddings = create_product_embeddings(products, progress=progress_tracker)
    similarities = compute_similarities(ingredients_dict, product_embeddings)
    embedding_results = {
        product: ranked[:embedding_top_n]
        for product, ranked in similarities.items()
    }

    progress_tracker(0.4, desc="Stage 2: Re-ranking candidates")

    # Initialize OpenAI client using the centralized function
    openai_client = get_openai_client()

    # Stage 2: let the model pick the best candidate for each product.
    final_results = {}
    for i, product in enumerate(products):
        progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")

        candidates = embedding_results.get(product)
        if not candidates:
            final_results[product] = []
            continue

        candidate_ingredients = [name for name, _ in candidates]

        try:
            product_text = product
            if expanded_descriptions and product in expanded_descriptions:
                product_text = expanded_descriptions[product]

            best_match = _select_best_ingredient(openai_client, product_text, candidate_ingredients)
            score = float(best_match["relevance_score"])

            if score >= confidence_threshold:
                matched = _resolve_candidate(best_match["ingredient"], candidate_ingredients)
                if matched is None:
                    # The schema cannot force the answer to be one of the
                    # candidates, so guard against invented ingredient names.
                    print(
                        f"Warning: Re-ranker returned '{best_match['ingredient']}' for '{product}', "
                        "which is not a candidate; using top embedding match"
                    )
                    final_results[product] = candidates[:1]
                else:
                    final_results[product] = [(matched, score)]
            else:
                final_results[product] = []
        except Exception as e:
            # Deliberate best-effort: any failure in re-ranking degrades to the
            # top embedding candidate instead of aborting the whole batch.
            print(f"Error during OpenAI re-ranking for '{product}': {e}")
            final_results[product] = candidates[:1]

    progress_tracker(1.0, desc="Hybrid ingredient matching complete")
    return final_results