import gradio as gr
from utils import SafeProgress
from category_matching import load_categories, hybrid_category_matching
from similarity import hybrid_ingredient_matching, compute_similarities
from ui_core import embeddings, parse_input
from ui_formatters import format_hybrid_results_html, create_results_container, format_reranking_results_html
from openai_expansion import expand_product_descriptions
from api_utils import get_voyage_client
def categorize_products_with_voyage_reranking(product_input, is_file=False, use_expansion=False,
                                              embedding_top_n=20, final_top_n=5, confidence_threshold=0.5,
                                              match_type="categories", progress=gr.Progress()):
    """
    Categorize products using Voyage reranking with optional description expansion.

    Args:
        product_input: Raw product input; handed to ``parse_input`` together
            with ``is_file``.
        is_file: Forwarded to ``parse_input``.
        use_expansion: When true, ``expand_product_descriptions`` is called and
            its output is passed to the matcher and attached to each result
            as the ``explanation`` field.
        embedding_top_n: Forwarded to the matcher after ``int()`` coercion.
        final_top_n: Forwarded to the matcher after ``int()`` coercion.
        confidence_threshold: Not applied while matching (the matcher is
            always called with ``0.0``); forwarded to the formatter instead.
        match_type: ``"categories"`` selects category matching; any other
            value is treated as ingredient matching.
        progress: Gradio progress object, wrapped in ``SafeProgress``.

    Returns:
        The value produced by ``format_reranking_results_html`` on success.
        Otherwise the error value reported by ``parse_input``, or a plain
        message string when embeddings are missing or nothing was matched.
    """
    progress_tracker = SafeProgress(progress)
    progress_tracker(0, desc=f"Starting Voyage reranking for {match_type}...")

    # Parse input
    product_names, error = parse_input(product_input, is_file)
    if error:
        return error

    # Decide the mode once so that the matching step and the formatting step
    # can never disagree about which branch an unrecognised value takes.
    is_categories = match_type == "categories"

    # Validate embeddings before doing any expansion work, so a missing
    # embeddings file is reported without first calling the expander.
    if not is_categories and not embeddings:
        return (
            "\nError: No ingredient embeddings loaded. Please check that the "
            "embeddings file exists and is properly formatted.\n"
        )

    # Optional description expansion
    expanded_descriptions = {}
    if use_expansion:
        progress_tracker(0.1, desc="Expanding product descriptions...")
        # Guard against a None return so the .get() lookups below stay safe.
        expanded_descriptions = expand_product_descriptions(product_names, progress=progress) or {}

    # Keyword arguments shared by both matchers.
    matcher_kwargs = dict(
        embedding_top_n=int(embedding_top_n),
        final_top_n=int(final_top_n),
        confidence_threshold=0.0,  # Don't apply threshold here - do it in display
        expanded_descriptions=expanded_descriptions if use_expansion else None,
        progress=progress,
    )

    if is_categories:
        progress_tracker(0.2, desc="Loading categories...")
        categories = load_categories()
        progress_tracker(0.5, desc="Finding and re-ranking categories...")
        match_results = hybrid_category_matching(product_names, categories, **matcher_kwargs)
    else:
        progress_tracker(0.5, desc="Finding and re-ranking ingredients...")
        match_results = hybrid_ingredient_matching(product_names, embeddings, **matcher_kwargs)

    # Format results
    progress_tracker(0.9, desc="Formatting results...")

    # Convert to the unified structure the formatter receives. Products with
    # no matches are kept, with empty item lists and a confidence of 0.
    formatted_results = []
    for product, matches in match_results.items():
        if is_categories:
            matching_items = []
            item_scores = []
            for match in matches:
                if len(match) < 2:
                    continue
                cat_id = match[0]
                # The middle element is only treated as a description when the
                # tuple has more than two entries and that entry is not numeric.
                cat_text = match[1] if len(match) > 2 else ""
                if isinstance(cat_text, (int, float)):
                    cat_text = ""
                matching_items.append(f"{cat_id}: {cat_text}" if cat_text else f"{cat_id}")
                item_scores.append(match[-1])
        else:
            matching_items = [item[0] for item in matches]
            item_scores = [item[1] for item in matches]

        formatted_results.append({
            "product_name": product,
            "confidence": max(item[-1] for item in matches) if matches else 0,
            "matching_items": matching_items,
            "item_scores": item_scores,
            "explanation": expanded_descriptions.get(product, "") if use_expansion else "",
        })

    if not formatted_results:
        return "No results found. Please check your input or try different products.\n"

    result_html = format_reranking_results_html(
        results=formatted_results,
        match_type=match_type,
        show_scores=True,
        include_explanation=use_expansion,
        method="voyage",
        confidence_threshold=confidence_threshold,  # Pass the threshold to the formatter
    )
    progress_tracker(1.0, desc="Done!")
    return result_html
# Update the function in ui_hybrid_matching.py
def hybrid_ingredient_matching_voyage(products, ingredients_dict,
                                      embedding_top_n=20, final_top_n=5,
                                      confidence_threshold=0.5,
                                      expanded_descriptions=None,
                                      progress=None):
    """Use Voyage AI for reranking instead of OpenAI.

    Stage 1 embeds every product (using its expanded description when one is
    supplied) and keeps the first ``embedding_top_n`` entries that
    ``compute_similarities`` returns for it (presumably ordered best-first;
    verify against ``compute_similarities``). Stage 2 sends those candidates
    to the Voyage reranker and keeps the first ``final_top_n`` results.

    Args:
        products: Product names; they are also the keys of the returned dict.
        ingredients_dict: Passed to ``compute_similarities`` as the candidate pool.
        embedding_top_n: Candidates kept per product after stage 1 (coerced to int).
        final_top_n: Reranked results kept per product (coerced to int).
        confidence_threshold: Accepted for signature compatibility but not
            applied here; filtering is left to the caller.
        expanded_descriptions: Optional mapping of product name to a longer
            text used in place of the name for embedding and for the query.
        progress: Optional progress object, wrapped in ``SafeProgress``.

    Returns:
        Dict mapping each product to a list of ``(text, score)`` tuples. The
        list is empty when stage 1 yields no candidates. When reranking raises,
        the first stage-1 candidate is returned unchanged instead.
    """
    from utils import SafeProgress
    from embeddings import create_product_embeddings

    def _field(obj, name):
        # Read `name` from either a mapping-style response or an
        # attribute-style one. The official voyageai client returns objects
        # with attributes; get_voyage_client is defined elsewhere, so both
        # shapes are accepted rather than assuming one.
        try:
            return obj[name]
        except (TypeError, KeyError, IndexError):
            return getattr(obj, name)

    progress_tracker = SafeProgress(progress, desc="Voyage ingredient matching")

    # Nothing to match: skip embedding creation and client setup entirely.
    if not products:
        progress_tracker(1.0, desc="Voyage ingredient matching complete")
        return {}

    # Slice bounds must be integers; UI widgets may hand over floats.
    embedding_top_n = int(embedding_top_n)
    final_top_n = int(final_top_n)

    progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")

    # Stage 1: use embeddings to find candidates
    if expanded_descriptions:
        # Embed the expanded text when available, falling back to the name.
        texts = [expanded_descriptions.get(name, name) for name in products]
        text_embeddings = create_product_embeddings(texts, progress=progress_tracker)
        # Re-key by the original product name so later lookups are consistent.
        product_embeddings = {
            name: text_embeddings[text]
            for name, text in zip(products, texts)
            if text in text_embeddings
        }
    else:
        # Standard embedding creation with just product names
        product_embeddings = create_product_embeddings(products, progress=progress_tracker)

    similarities = compute_similarities(ingredients_dict, product_embeddings)

    # Keep the first N candidates per product
    embedding_results = {
        product: product_similarities[:embedding_top_n]
        for product, product_similarities in similarities.items()
    }

    progress_tracker(0.4, desc="Stage 2: Re-ranking with Voyage AI")

    # Initialize Voyage client
    voyage_client = get_voyage_client()

    # Stage 2: Re-rank using Voyage instead of OpenAI
    final_results = {}
    total = len(products)
    for i, product in enumerate(products):
        progress_tracker((0.4 + 0.5 * i / total), desc=f"Re-ranking: {product}")

        candidates = embedding_results.get(product)
        if not candidates:
            final_results[product] = []
            continue

        # The reranker receives plain strings as documents.
        candidate_ingredients = [c[0] for c in candidates]

        try:
            # Use expanded description if available
            product_text = product
            if expanded_descriptions and product in expanded_descriptions:
                product_text = expanded_descriptions[product]

            reranked = voyage_client.rerank(
                query=f"Which ingredient best matches: {product_text}",
                documents=candidate_ingredients,
                model="rerank-2",
            )

            voyage_results = [
                (_field(result, "document"), _field(result, "relevance_score"))
                for result in _field(reranked, "results")
            ]

            # Limit to final_top_n; the confidence threshold is not applied here.
            final_results[product] = voyage_results[:final_top_n]
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Error during Voyage reranking for '{product}': {e}")
            # Fall back to embedding results
            final_results[product] = candidates[:1]

    progress_tracker(1.0, desc="Voyage ingredient matching complete")
    return final_results
# Add this function to ui_hybrid_matching.py
def hybrid_category_matching_voyage(products, categories_dict,
                                    embedding_top_n=20, final_top_n=5,
                                    confidence_threshold=0.5,
                                    expanded_descriptions=None,
                                    progress=None):
    """Use Voyage AI for reranking categories instead of OpenAI.

    Stage 1 embeds every product (using its expanded description when one is
    supplied) and keeps the first ``embedding_top_n`` entries that
    ``compute_similarities`` returns for it (presumably ordered best-first;
    verify against ``compute_similarities``). Stage 2 sends those candidates
    to the Voyage reranker and keeps the first ``final_top_n`` results.

    Args:
        products: Product names; they are also the keys of the returned dict.
        categories_dict: Passed to ``compute_similarities`` as the candidate pool.
        embedding_top_n: Candidates kept per product after stage 1 (coerced to int).
        final_top_n: Reranked results kept per product (coerced to int).
        confidence_threshold: Accepted for signature compatibility but not
            applied here; filtering is left to the caller.
        expanded_descriptions: Optional mapping of product name to a longer
            text used in place of the name for embedding and for the query.
        progress: Optional progress object, wrapped in ``SafeProgress``.

    Returns:
        Dict mapping each product to a list of ``(text, score)`` tuples. The
        list is empty when stage 1 yields no candidates. When reranking raises,
        the first stage-1 candidate is returned unchanged instead.
    """
    from utils import SafeProgress
    from embeddings import create_product_embeddings
    from similarity import compute_similarities

    def _field(obj, name):
        # Read `name` from either a mapping-style response or an
        # attribute-style one. The official voyageai client returns objects
        # with attributes; get_voyage_client is defined elsewhere, so both
        # shapes are accepted rather than assuming one.
        try:
            return obj[name]
        except (TypeError, KeyError, IndexError):
            return getattr(obj, name)

    progress_tracker = SafeProgress(progress, desc="Voyage category matching")

    # Nothing to match: skip embedding creation and client setup entirely.
    if not products:
        progress_tracker(1.0, desc="Voyage category matching complete")
        return {}

    # Slice bounds must be integers; UI widgets may hand over floats.
    embedding_top_n = int(embedding_top_n)
    final_top_n = int(final_top_n)

    progress_tracker(0.1, desc="Stage 1: Finding candidate categories with embeddings")

    # Stage 1: use embeddings to find candidates
    if expanded_descriptions:
        # Embed the expanded text when available, falling back to the name.
        texts = [expanded_descriptions.get(name, name) for name in products]
        text_embeddings = create_product_embeddings(texts, progress=progress_tracker)
        # Re-key by the original product name so later lookups are consistent.
        product_embeddings = {
            name: text_embeddings[text]
            for name, text in zip(products, texts)
            if text in text_embeddings
        }
    else:
        # Standard embedding creation with just product names
        product_embeddings = create_product_embeddings(products, progress=progress_tracker)

    similarities = compute_similarities(categories_dict, product_embeddings)

    # Keep the first N candidates per product
    embedding_results = {
        product: product_similarities[:embedding_top_n]
        for product, product_similarities in similarities.items()
    }

    progress_tracker(0.4, desc="Stage 2: Re-ranking with Voyage AI")

    # Initialize Voyage client
    voyage_client = get_voyage_client()

    # Stage 2: Re-rank using Voyage AI
    final_results = {}
    total = len(products)
    for i, product in enumerate(products):
        progress_tracker((0.4 + 0.5 * i / total), desc=f"Re-ranking: {product}")

        candidates = embedding_results.get(product)
        if not candidates:
            final_results[product] = []
            continue

        # The reranker receives plain strings as documents.
        candidate_categories = [c[0] for c in candidates]

        try:
            # Use expanded description if available
            product_text = product
            if expanded_descriptions and product in expanded_descriptions:
                product_text = expanded_descriptions[product]

            reranked = voyage_client.rerank(
                query=f"Which food category best matches: {product_text}",
                documents=candidate_categories,
                model="rerank-2",
            )

            voyage_results = [
                (_field(result, "document"), _field(result, "relevance_score"))
                for result in _field(reranked, "results")
            ]

            # Limit to final_top_n; the confidence threshold is not applied here.
            final_results[product] = voyage_results[:final_top_n]
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"Error during Voyage category reranking for '{product}': {e}")
            # Fall back to embedding results
            final_results[product] = candidates[:1]

    progress_tracker(1.0, desc="Voyage category matching complete")
    return final_results