Spaces:

eliago
/

product_ingredient_demo

Sleeping

App Files Files Community

product_ingredient_demo / ui_hybrid_matching.py

esilver

feat: Update UI components

c15c118 3 months ago

raw

history blame

13.9 kB

	# import gradio as gr # Removed Gradio import
	# from utils import SafeProgress # Removed SafeProgress import
	from category_matching import load_categories, hybrid_category_matching
	from similarity import hybrid_ingredient_matching, compute_similarities
	from ui_core import embeddings, parse_input
	from ui_formatters import format_reranking_results_html
	from openai_expansion import expand_product_descriptions
	from api_utils import get_voyage_client

	def categorize_products_with_voyage_reranking(product_input, is_file=False, use_expansion=False,
	                                              embedding_top_n=20, final_top_n=5, confidence_threshold=0.5,
	                                              match_type="categories"):
	    """Match products to categories or ingredients and render the result as HTML.

	    Args:
	        product_input: Raw input, handed to ``parse_input`` unchanged.
	        is_file: Forwarded to ``parse_input``.
	        use_expansion: When true, product names are run through
	            ``expand_product_descriptions``; the expansions are forwarded to
	            the matcher and attached to each result as its explanation.
	        embedding_top_n: Cast to ``int`` and forwarded to the matcher.
	        final_top_n: Cast to ``int`` and forwarded to the matcher.
	        confidence_threshold: Not applied while matching (the matcher is
	            always called with ``0.0``); it is only passed to the formatter.
	        match_type: ``"categories"`` selects category matching. Any other
	            value is treated as ingredient matching, both when matching and
	            when shaping the results.

	    Returns:
	        The error value from ``parse_input`` when it is truthy, an HTML error
	        string when ingredient embeddings are missing or no results were
	        produced, otherwise the return value of
	        ``format_reranking_results_html``.
	    """
	    product_names, error = parse_input(product_input, is_file)
	    if error:
	        return error

	    expanded_descriptions = expand_product_descriptions(product_names) if use_expansion else {}

	    # A single flag drives both matcher selection and result shaping so the
	    # two stages cannot disagree when match_type is an unexpected value.
	    is_category = match_type == "categories"

	    if is_category:
	        candidates = load_categories()
	        matcher = hybrid_category_matching
	    else:
	        if not embeddings:
	            return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"
	        candidates = embeddings
	        matcher = hybrid_ingredient_matching

	    match_results = matcher(
	        product_names, candidates,
	        embedding_top_n=int(embedding_top_n),
	        final_top_n=int(final_top_n),
	        confidence_threshold=0.0,  # threshold is applied at display time, not here
	        expanded_descriptions=expanded_descriptions if use_expansion else None,
	    )

	    # Convert matcher output to the record layout passed to the formatter.
	    # Products with no matches are kept (confidence 0, empty lists).
	    formatted_results = []
	    for product, matches in match_results.items():
	        matching_items = []
	        item_scores = []

	        if is_category:
	            for match in matches:
	                if len(match) < 2:
	                    continue
	                cat_id = match[0]
	                score = match[-1]
	                # Only tuples longer than two carry a text slot at index 1;
	                # a numeric value there is a score, not a description.
	                cat_text = match[1] if len(match) > 2 else ""
	                if isinstance(cat_text, (int, float)):
	                    cat_text = ""
	                matching_items.append(f"{cat_id}: {cat_text}" if cat_text else f"{cat_id}")
	                item_scores.append(score)
	        else:
	            matching_items = [match[0] for match in matches]
	            item_scores = [match[1] for match in matches]

	        formatted_results.append({
	            "product_name": product,
	            "confidence": max((match[-1] for match in matches), default=0),
	            "matching_items": matching_items,
	            "item_scores": item_scores,
	            "explanation": expanded_descriptions.get(product, "") if use_expansion else "",
	        })

	    if not formatted_results:
	        return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"

	    print(f"FORMAT RESULTS: {formatted_results}")

	    return format_reranking_results_html(
	        results=formatted_results,
	        match_type=match_type,
	        show_scores=True,
	        include_explanation=use_expansion,
	        method="voyage",
	        confidence_threshold=confidence_threshold,  # filtering happens in the formatter
	    )

	# Update the function in ui_hybrid_matching.py
	def hybrid_ingredient_matching_voyage(products, ingredients_dict,
	                                      embedding_top_n=20, final_top_n=5,
	                                      confidence_threshold=0.5,
	                                      expanded_descriptions=None,
	                                      ):
	    """Find ingredient matches: embedding shortlist, then Voyage reranking.

	    Args:
	        products: Product names to match.
	        ingredients_dict: Candidate data passed to ``compute_similarities``.
	        embedding_top_n: Number of embedding candidates kept per product.
	        final_top_n: Number of reranked results kept per product.
	        confidence_threshold: Accepted for interface compatibility; no
	            threshold filtering is done in this function.
	        expanded_descriptions: Optional mapping of product name to expanded
	            text, used for both embedding and the rerank query when present.

	    Returns:
	        Dict mapping each product to a list of ``(text, score)`` tuples. A
	        product with no candidates maps to ``[]``; if reranking raises, the
	        product maps to its first embedding candidate only.
	    """
	    return _voyage_hybrid_matching(
	        products, ingredients_dict, embedding_top_n, final_top_n,
	        expanded_descriptions,
	        query_prefix="Which ingredient best matches: ",
	        error_context="Voyage reranking",
	    )


	def hybrid_category_matching_voyage(products, categories_dict,
	                                    embedding_top_n=20, final_top_n=5,
	                                    confidence_threshold=0.5,
	                                    expanded_descriptions=None,
	                                    ):
	    """Find category matches: embedding shortlist, then Voyage reranking.

	    Args:
	        products: Product names to match.
	        categories_dict: Candidate data passed to ``compute_similarities``.
	        embedding_top_n: Number of embedding candidates kept per product.
	        final_top_n: Number of reranked results kept per product.
	        confidence_threshold: Accepted for interface compatibility; no
	            threshold filtering is done in this function.
	        expanded_descriptions: Optional mapping of product name to expanded
	            text, used for both embedding and the rerank query when present.

	    Returns:
	        Dict mapping each product to a list of ``(text, score)`` tuples. A
	        product with no candidates maps to ``[]``; if reranking raises, the
	        product maps to its first embedding candidate only.
	    """
	    return _voyage_hybrid_matching(
	        products, categories_dict, embedding_top_n, final_top_n,
	        expanded_descriptions,
	        query_prefix="Which food category best matches: ",
	        error_context="Voyage category reranking",
	    )


	def _rerank_field(obj, field):
	    """Read ``field`` from a rerank response given as a mapping or an object."""
	    # NOTE(review): Voyage's Python client documents attribute access
	    # (`.results`, `.relevance_score`, `.document`); subscripting is tried
	    # first so a dict-shaped response from a wrapper keeps working.
	    # Confirm what get_voyage_client() actually returns.
	    try:
	        return obj[field]
	    except (TypeError, KeyError, IndexError):
	        return getattr(obj, field)


	def _embed_products_by_name(products, expanded_descriptions):
	    """Create product embeddings keyed by the original product names."""
	    from embeddings import create_product_embeddings

	    if not expanded_descriptions:
	        return create_product_embeddings(products)

	    # Embed the expanded text where one exists, the plain name otherwise.
	    texts = [expanded_descriptions.get(name, name) for name in products]
	    embedded = create_product_embeddings(texts, original_products=products)

	    by_name = {}
	    for name, text in zip(products, texts):
	        if text in embedded:
	            by_name[name] = embedded[text]
	        elif name in embedded:
	            # NOTE(review): passing `original_products` suggests the helper
	            # may already key its result by original name; accept that too.
	            # Confirm against embeddings.create_product_embeddings.
	            by_name[name] = embedded[name]
	    return by_name


	def _voyage_hybrid_matching(products, candidates_dict, embedding_top_n, final_top_n,
	                            expanded_descriptions, query_prefix, error_context):
	    """Shared two-stage matcher behind the public ``*_voyage`` functions."""
	    # Limits are used as slice bounds; cast so float values do not raise.
	    if embedding_top_n is not None:
	        embedding_top_n = int(embedding_top_n)
	    if final_top_n is not None:
	        final_top_n = int(final_top_n)

	    # Stage 1: shortlist candidates per product by embedding similarity.
	    product_embeddings = _embed_products_by_name(products, expanded_descriptions)
	    similarities = compute_similarities(candidates_dict, product_embeddings)
	    shortlist = {
	        product: ranked[:embedding_top_n]
	        for product, ranked in similarities.items()
	    }

	    voyage_client = get_voyage_client()

	    # Stage 2: rerank each product's shortlist with Voyage.
	    final_results = {}
	    for product in products:
	        candidates = shortlist.get(product)
	        if not candidates:
	            final_results[product] = []
	            continue

	        # Candidate names are sent to the reranker as plain strings.
	        documents = [candidate[0] for candidate in candidates]

	        product_text = product
	        if expanded_descriptions and product in expanded_descriptions:
	            product_text = expanded_descriptions[product]

	        try:
	            reranked = voyage_client.rerank(
	                query=f"{query_prefix}{product_text}",
	                documents=documents,
	                model="rerank-2",
	            )
	            scored = [
	                (_rerank_field(result, "document"), _rerank_field(result, "relevance_score"))
	                for result in _rerank_field(reranked, "results")
	            ]
	            # Truncate to final_top_n; no threshold filtering happens here.
	            final_results[product] = scored[:final_top_n]
	        except Exception as e:
	            # Deliberate best-effort: one failed rerank must not abort the batch.
	            print(f"Error during {error_context} for '{product}': {e}")
	            # NOTE(review): the fallback keeps only the first embedding
	            # candidate rather than final_top_n - confirm this is intended.
	            final_results[product] = candidates[:1]

	    return final_results