Spaces:
Sleeping
Sleeping
import numpy as np | |
from typing import Dict, List, Tuple, Any | |
import json | |
import voyageai | |
from openai import OpenAI | |
from api_utils import get_openai_client | |
def _extract_embedding_vector(entry):
    """Return the embedding vector stored in *entry*, or None when unusable.

    *entry* may be a raw vector (list or numpy array) or a dictionary that
    keeps the vector under the "embedding" key. None values, dictionaries
    without the key, unsupported types and empty vectors all yield None.
    """
    if isinstance(entry, dict):
        vector = entry.get("embedding")
    elif isinstance(entry, (list, np.ndarray)):
        vector = entry
    else:
        return None

    if vector is None:
        return None
    if isinstance(vector, np.ndarray):
        return vector if vector.size > 0 else None
    if isinstance(vector, (list, tuple)) and len(vector) == 0:
        return None
    return vector


def _build_embedding_matrix(source, label):
    """Gather the usable embeddings of *source* into a float32 matrix.

    Args:
        source: Dictionary of names to embeddings (raw vectors or
            {"embedding": vector} dictionaries).
        label: "ingredient" or "product"; only used in diagnostic messages.

    Returns:
        A (names, matrix) tuple where matrix has one row per name, or None
        (after printing a diagnostic) when no usable matrix can be built.
    """
    names = []
    vectors = []
    for name, entry in source.items():
        vector = _extract_embedding_vector(entry)
        if vector is not None:
            names.append(name)
            vectors.append(vector)

    if not names:
        print(f"Warning: No valid {label} embeddings found")
        return None

    matrix = np.array(vectors, dtype=np.float32)
    if matrix.ndim == 1:
        # Only reachable when the stored "embedding" values are scalars.
        print(f"Warning: {label.capitalize()} embeddings have only 1 dimension, reshaping. Shape: {matrix.shape}")
        matrix = matrix.reshape(1, -1)

    # Rows are later indexed by position in `names`, so the two must line up;
    # otherwise building the result dictionary would raise an IndexError.
    if matrix.ndim != 2 or matrix.shape[0] != len(names):
        print(f"Error: {label.capitalize()} embeddings do not form one row per name. Shape: {matrix.shape}")
        return None

    return names, matrix


def _normalize_rows(matrix):
    """Scale every row of *matrix* to unit length (zero rows stay zero)."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Substitute a tiny value for zero norms to avoid division by zero.
    norms = np.where(norms == 0, 1e-10, norms)
    return matrix / norms


def compute_similarities(ingredients_dict: Dict[str, Any],
                         products_dict: Dict[str, Any]) -> Dict[str, List[Tuple[str, float]]]:
    """
    Compute cosine similarities between ingredient embeddings and product embeddings

    Both dictionaries may hold raw vectors (list / numpy array) or
    {"embedding": vector} dictionaries. Entries that are None, lack an
    embedding, or carry an empty vector are skipped on both sides.

    Args:
        ingredients_dict: Dictionary of ingredient names to embeddings
        products_dict: Dictionary of product names to embedding dictionaries
    Returns:
        Dictionary mapping each product name to a list of
        (ingredient_name, similarity) tuples sorted by similarity, highest
        first. An empty dictionary is returned (and a message printed) when
        either input is empty, has no usable embeddings, or the embedding
        dimensions of the two sides differ.
    """
    # Validate inputs
    if not ingredients_dict:
        print("Warning: ingredients_dict is empty")
        return {}
    if not products_dict:
        print("Warning: products_dict is empty")
        return {}

    ingredient_side = _build_embedding_matrix(ingredients_dict, "ingredient")
    if ingredient_side is None:
        return {}
    ingredient_names, ingredient_embeddings = ingredient_side

    product_side = _build_embedding_matrix(products_dict, "product")
    if product_side is None:
        return {}
    product_names, product_embeddings = product_side

    # Cosine similarity is only defined for vectors of the same dimension.
    product_dim = product_embeddings.shape[1]
    ingredient_dim = ingredient_embeddings.shape[1]
    if product_dim != ingredient_dim:
        print(f"Warning: Dimension mismatch between product embeddings ({product_dim}) and ingredient embeddings ({ingredient_dim})")
        return {}

    # With unit-length rows, the dot product equals the cosine similarity.
    similarity_matrix = np.dot(
        _normalize_rows(product_embeddings),
        _normalize_rows(ingredient_embeddings).T,
    )

    results = {}
    for product_name, row in zip(product_names, similarity_matrix):
        scored = list(zip(ingredient_names, row.tolist()))
        # Sort by similarity score (descending)
        scored.sort(key=lambda pair: pair[1], reverse=True)
        results[product_name] = scored
    return results
def _ingredient_selection_format() -> Dict[str, Any]:
    """Build the structured-output spec requesting a single best-match object."""
    best_match_schema = {
        "type": "object",
        "properties": {
            "ingredient": {
                "type": "string",
                "description": "The name of the best matching ingredient",
            },
            "explanation": {
                "type": "string",
                "description": "Brief explanation for the matching",
            },
            "relevance_score": {
                "type": "number",
                "description": "Score between 0 and 1 indicating relevance",
            },
        },
        "required": ["ingredient", "relevance_score", "explanation"],
        "additionalProperties": False,
    }
    return {
        "format": {
            "type": "json_schema",
            "name": "ingredient_selection",
            "schema": {
                "type": "object",
                "properties": {"best_match": best_match_schema},
                "required": ["best_match"],
                "additionalProperties": False,
            },
            "strict": True,
        }
    }


def hybrid_ingredient_matching(products: List[str], ingredients_dict: Dict[str, Any],
                               embedding_top_n: int = 20, final_top_n: int = 5,
                               confidence_threshold: float = 0.5,
                               progress=None) -> Dict[str, List[Tuple]]:
    """
    Two-stage matching: first use embeddings to find candidate ingredients, then apply re-ranking

    Stage 1 embeds the products and keeps the `embedding_top_n` most similar
    ingredients per product. Stage 2 asks an OpenAI model to pick the single
    best candidate and score it.

    Args:
        products: List of product names to categorize
        ingredients_dict: Dictionary of ingredient names to embeddings
        embedding_top_n: Number of top ingredients to retrieve using embeddings
        final_top_n: Currently unused. The re-ranker yields at most one match
            per product and the failure fallback keeps only the top embedding
            candidate; the parameter is retained for interface compatibility.
        confidence_threshold: Minimum score threshold for final results
        progress: Optional progress tracking object
    Returns:
        Dictionary mapping every product to a list holding at most one
        (ingredient, score) tuple. The list is empty when the product has no
        embedding candidates or the model's relevance score is below
        `confidence_threshold`. If re-ranking raises, the top embedding
        candidate (with its similarity score) is returned instead.
    """
    # Imported inside the function, as in the original code.
    from utils import SafeProgress
    from embeddings import create_product_embeddings

    progress_tracker = SafeProgress(progress, desc="Hybrid ingredient matching")
    progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")

    # Stage 1: Use embeddings to find candidate ingredients
    product_embeddings = create_product_embeddings(products, progress=progress_tracker)
    similarities = compute_similarities(ingredients_dict, product_embeddings)

    # Keep only the top N candidates per product
    embedding_results = {
        product: ranked[:embedding_top_n]
        for product, ranked in similarities.items()
    }

    progress_tracker(0.4, desc="Stage 2: Re-ranking candidates")

    # Initialize OpenAI client using the centralized function
    openai_client = get_openai_client()

    # Stage 2: Re-rank the candidates for each product
    final_results = {}
    for i, product in enumerate(products):
        progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")

        # Products without any embedding candidates get an empty result.
        candidates = embedding_results.get(product)
        if not candidates:
            final_results[product] = []
            continue

        # Extract just the ingredient names for re-ranking
        candidate_ingredients = [c[0] for c in candidates]

        try:
            # Apply re-ranking using OpenAI's structured output
            response = openai_client.responses.create(
                model="o3-mini",
                input=[
                    {"role": "system", "content": "You are a food ingredient matching expert. Select the single best ingredient that matches the given product."},
                    {"role": "user", "content": f"Product: {product}\n\nPotential ingredients: {', '.join(candidate_ingredients)}"},
                ],
                text=_ingredient_selection_format(),
            )

            # Parse the response
            best_match = json.loads(response.output_text)["best_match"]

            # Only include the result if it meets the confidence threshold
            if best_match["relevance_score"] >= confidence_threshold:
                final_results[product] = [(best_match["ingredient"], best_match["relevance_score"])]
            else:
                final_results[product] = []
        except Exception as e:
            # Deliberately broad: any API or parsing failure falls back to the
            # best embedding candidate so one product cannot abort the batch.
            print(f"Error during OpenAI re-ranking for '{product}': {e}")
            final_results[product] = candidates[:1]

    progress_tracker(1.0, desc="Hybrid ingredient matching complete")
    return final_results