Spaces:

eliago
/

product_ingredient_demo

Sleeping

App Files Files Community

product_ingredient_demo / similarity.py

esilver

Added expansion for embeddings

1e737a1 3 months ago

raw

history blame

11.6 kB

	import numpy as np
	from typing import Dict, List, Tuple, Any
	import json
	import voyageai
	from openai import OpenAI
	from api_utils import get_openai_client

	def _extract_embedding(entry: Any) -> Any:
	    """Return the raw embedding vector held by ``entry``, or None if unusable.

	    Two layouts are accepted: a dict carrying the vector under the
	    ``"embedding"`` key, or the vector itself as a list / numpy array.
	    Missing (None) and empty vectors are rejected so they cannot produce a
	    ragged matrix later on.
	    """
	    if isinstance(entry, dict):
	        vector = entry.get("embedding")
	    elif isinstance(entry, (list, np.ndarray)):
	        vector = entry
	    else:
	        return None

	    if vector is None:
	        return None
	    if isinstance(vector, np.ndarray):
	        return vector if vector.size else None
	    if isinstance(vector, (list, tuple)):
	        return vector if len(vector) else None
	    return vector


	def _build_embedding_matrix(entries: Dict[str, Any], label: str) -> Tuple[List[str], Any]:
	    """Collect the usable embeddings of ``entries`` into a 2D float32 matrix.

	    Args:
	        entries: Mapping of names to embeddings (dict-wrapped or raw vectors)
	        label: Word used in warning messages ("ingredient" or "product")

	    Returns:
	        ``(names, matrix)`` where row ``i`` of ``matrix`` belongs to
	        ``names[i]``. ``matrix`` is None when no usable matrix can be built
	        (a warning is printed in that case).
	    """
	    names = []
	    vectors = []
	    for name, entry in entries.items():
	        vector = _extract_embedding(entry)
	        if vector is not None:
	            names.append(name)
	            vectors.append(vector)

	    if not names:
	        print(f"Warning: No valid {label} embeddings found")
	        return names, None

	    try:
	        matrix = np.array(vectors, dtype=np.float32)
	    except (ValueError, TypeError) as exc:
	        # Vectors of differing lengths or non-numeric content cannot be stacked
	        print(f"Warning: Could not build {label} embedding matrix: {exc}")
	        return names, None

	    if matrix.ndim == 1:
	        # Every entry was a scalar: keep one row per name so that rows and
	        # names stay aligned (a single 1 x N row would break the pairing).
	        print(f"Warning: {label.capitalize()} embeddings have only 1 dimension, reshaping. Shape: {matrix.shape}")
	        matrix = matrix.reshape(len(names), -1)

	    if matrix.ndim != 2:
	        print(f"Warning: Unexpected {label} embeddings shape: {matrix.shape}")
	        return names, None

	    return names, matrix


	def _normalize_rows(matrix):
	    """Scale every row of ``matrix`` to unit length; zero rows stay zero."""
	    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
	    # Substitute a tiny value for zero norms to avoid division by zero
	    norms = np.where(norms == 0, 1e-10, norms)
	    return matrix / norms


	def compute_similarities(ingredients_dict, products_dict):
	    """
	    Compute cosine similarities between ingredient embeddings and product embeddings

	    Entries whose embedding is None or empty are skipped, for ingredients and
	    products alike. Problems that prevent a comparison (no usable embeddings,
	    vectors of inconsistent length, ingredient/product dimension mismatch)
	    print a warning and yield an empty result instead of raising.

	    Args:
	        ingredients_dict: Dictionary of ingredient names to embeddings
	            (raw vectors or dicts with an "embedding" key)
	        products_dict: Dictionary of product names to embedding dictionaries
	            (dicts with an "embedding" key) or raw vectors

	    Returns:
	        Dictionary mapping each usable product name to a list of
	        (ingredient_name, score) tuples sorted by score, highest first
	    """
	    # Validate inputs
	    if not ingredients_dict:
	        print("Warning: ingredients_dict is empty")
	        return {}

	    if not products_dict:
	        print("Warning: products_dict is empty")
	        return {}

	    ingredient_names, ingredient_embeddings = _build_embedding_matrix(ingredients_dict, "ingredient")
	    if ingredient_embeddings is None:
	        return {}

	    product_names, product_embeddings = _build_embedding_matrix(products_dict, "product")
	    if product_embeddings is None:
	        return {}

	    # Both matrices are 2D here, so the embedding size is the second axis
	    product_dim = product_embeddings.shape[1]
	    ingredient_dim = ingredient_embeddings.shape[1]
	    if product_dim != ingredient_dim:
	        print(f"Warning: Dimension mismatch between product embeddings ({product_dim}) and ingredient embeddings ({ingredient_dim})")
	        # Return empty results if dimensions don't match
	        return {}

	    # Cosine similarity is the dot product of the unit-length rows
	    similarity_matrix = np.dot(
	        _normalize_rows(product_embeddings),
	        _normalize_rows(ingredient_embeddings).T,
	    )

	    results = {}
	    for product_name, scores in zip(product_names, similarity_matrix):
	        ranked = [(ingredient, float(score))
	                  for ingredient, score in zip(ingredient_names, scores)]
	        # Sort by similarity score (descending); ties keep input order
	        ranked.sort(key=lambda pair: pair[1], reverse=True)
	        results[product_name] = ranked

	    return results

	def _ingredient_selection_format() -> Dict[str, Any]:
	    """Return the structured-output spec requesting a single ``best_match`` object."""
	    return {
	        "format": {
	            "type": "json_schema",
	            "name": "ingredient_selection",
	            "schema": {
	                "type": "object",
	                "properties": {
	                    "best_match": {
	                        "type": "object",
	                        "properties": {
	                            "ingredient": {
	                                "type": "string",
	                                "description": "The name of the best matching ingredient"
	                            },
	                            "explanation": {
	                                "type": "string",
	                                "description": "Brief explanation for the matching"
	                            },
	                            "relevance_score": {
	                                "type": "number",
	                                "description": "Score between 0 and 1 indicating relevance"
	                            }
	                        },
	                        "required": ["ingredient", "relevance_score", "explanation"],
	                        "additionalProperties": False
	                    }
	                },
	                "required": ["best_match"],
	                "additionalProperties": False
	            },
	            "strict": True
	        }
	    }


	def _resolve_candidate(selected: Any, candidate_ingredients: List[str]) -> Any:
	    """Map the model's selection onto one of the candidates, or None if it matches none.

	    An exact match wins; otherwise the comparison ignores case and
	    surrounding whitespace and the candidate's own spelling is returned.
	    """
	    if selected in candidate_ingredients:
	        return selected
	    if not isinstance(selected, str):
	        return None

	    wanted = selected.strip().casefold()
	    for candidate in candidate_ingredients:
	        if isinstance(candidate, str) and candidate.strip().casefold() == wanted:
	            return candidate
	    return None


	def hybrid_ingredient_matching(products: List[str], ingredients_dict: Dict[str, Any],
	                               embedding_top_n: int = 20, final_top_n: int = 5,
	                               confidence_threshold: float = 0.5,
	                               expanded_descriptions: Dict[str, str] = None,
	                               progress=None) -> Dict[str, List[Tuple]]:
	    """
	    Two-stage matching: first use embeddings to find candidate ingredients, then apply re-ranking

	    Stage 1 embeds the products and keeps the ``embedding_top_n`` most similar
	    ingredients per product. Stage 2 asks the "gpt-4o-mini" model to pick the
	    single best candidate. The pick is kept only if it names one of the
	    candidates and its relevance score reaches ``confidence_threshold``. If
	    the re-ranking call fails, or the model names something that is not a
	    candidate, the top embedding candidate is used instead.

	    Args:
	        products: List of product names to categorize
	        ingredients_dict: Dictionary of ingredient names to embeddings
	        embedding_top_n: Number of top ingredients to retrieve using embeddings
	        final_top_n: Number of final ingredients to return after re-ranking
	            (accepted for interface compatibility; not used by this
	            implementation, which returns at most one ingredient per product)
	        confidence_threshold: Minimum score threshold for final results
	        expanded_descriptions: Optional dict mapping products to their expanded descriptions
	        progress: Optional progress tracking object

	    Returns:
	        Dictionary mapping products to their matched ingredients with scores
	        (a list holding zero or one (ingredient, score) tuple per product)
	    """
	    from utils import SafeProgress
	    from embeddings import create_product_embeddings

	    progress_tracker = SafeProgress(progress, desc="Hybrid ingredient matching")
	    progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")

	    # Stage 1: Use embeddings to find candidate ingredients
	    product_embeddings = create_product_embeddings(products, progress=progress_tracker)
	    similarities = compute_similarities(ingredients_dict, product_embeddings)

	    # Filter to top N candidates per product
	    embedding_results = {
	        product: product_similarities[:embedding_top_n]
	        for product, product_similarities in similarities.items()
	    }

	    progress_tracker(0.4, desc="Stage 2: Re-ranking candidates")

	    # Initialize OpenAI client using the centralized function
	    openai_client = get_openai_client()

	    # The output format is identical for every product, so build it once
	    selection_format = _ingredient_selection_format()

	    # Stage 2: Re-rank the candidates for each product
	    final_results = {}

	    for i, product in enumerate(products):
	        progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")

	        # Products without any embedding candidates get an empty result
	        candidates = embedding_results.get(product)
	        if not candidates:
	            final_results[product] = []
	            continue

	        # Extract just the ingredient names for re-ranking
	        candidate_ingredients = [c[0] for c in candidates]

	        try:
	            # Use expanded description if available
	            product_text = product
	            if expanded_descriptions and product in expanded_descriptions:
	                product_text = expanded_descriptions[product]

	            # Apply re-ranking using OpenAI's structured output
	            response = openai_client.responses.create(
	                model="gpt-4o-mini",
	                input=[
	                    {"role": "system", "content": "You are a food ingredient matching expert. Select the single best ingredient that matches the given product."},
	                    {"role": "user", "content": f"Product: {product_text}\n\nPotential ingredients: {', '.join(candidate_ingredients)}"}
	                ],
	                text=selection_format
	            )

	            # Parse the response
	            best_match = json.loads(response.output_text)["best_match"]
	            selected = best_match["ingredient"]
	            score = best_match["relevance_score"]

	            # The model may answer with a name that is not in the candidate
	            # list; never pass such a name on to callers.
	            matched = _resolve_candidate(selected, candidate_ingredients)
	            if matched is None:
	                print(f"Warning: re-ranking for '{product}' returned '{selected}', which is not a candidate; using top embedding result")
	                final_results[product] = candidates[:1]
	            elif score >= confidence_threshold:
	                # Only include the result if it meets the confidence threshold
	                final_results[product] = [(matched, score)]
	            else:
	                final_results[product] = []

	        except Exception as e:
	            print(f"Error during OpenAI re-ranking for '{product}': {e}")
	            # Fall back to embedding results if re-ranking fails
	            final_results[product] = candidates[:1]  # Select the top embedding result as fallback

	    progress_tracker(1.0, desc="Hybrid ingredient matching complete")
	    return final_results