"""Gradio handler and two-stage (embedding + Voyage rerank) matching helpers."""

import gradio as gr

from utils import SafeProgress
from category_matching import load_categories, hybrid_category_matching
from similarity import hybrid_ingredient_matching, compute_similarities
from ui_core import embeddings, parse_input
from ui_formatters import format_hybrid_results_html, create_results_container, format_reranking_results_html
from openai_expansion import expand_product_descriptions
from api_utils import get_voyage_client


def _collect_items_and_scores(matches, match_type):
    """Split raw match tuples into parallel display-label and score lists.

    For ``match_type == "ingredients"`` element 0 of each match is the label
    and element 1 is the score. For any other match type (categories) element
    0 is the id, the last element is the score, and element 1 is treated as a
    text description only when the tuple has more than two elements and that
    element is not numeric. Category tuples shorter than two elements are
    skipped.
    """
    if match_type == "ingredients":
        return [item[0] for item in matches], [item[1] for item in matches]

    labels = []
    scores = []
    for match in matches:
        if len(match) < 2:
            continue
        cat_id = match[0]
        cat_text = match[1] if len(match) > 2 else ""
        if isinstance(cat_text, (int, float)):
            # The middle slot holds a score rather than a description.
            cat_text = ""
        labels.append(f"{cat_id}: {cat_text}" if cat_text else f"{cat_id}")
        scores.append(match[-1])
    return labels, scores


def categorize_products_with_voyage_reranking(
    product_input,
    is_file=False,
    use_expansion=False,
    embedding_top_n=20,
    final_top_n=5,
    confidence_threshold=0.5,
    match_type="categories",
    progress=gr.Progress(),
):
    """Match products to categories or ingredients and render the result as HTML.

    Args:
        product_input: Raw input handed to ``parse_input``.
        is_file: Forwarded to ``parse_input`` together with ``product_input``.
        use_expansion: When true, product descriptions are expanded with
            ``expand_product_descriptions`` and used for matching and as the
            per-product explanation.
        embedding_top_n: Number of candidates kept after the embedding stage
            (coerced to ``int``).
        final_top_n: Number of matches kept per product (coerced to ``int``).
        confidence_threshold: Not applied during matching; passed to the
            formatter so filtering happens at display time.
        match_type: ``"categories"`` selects category matching; any other
            value selects ingredient matching.
        progress: Gradio progress object, wrapped in ``SafeProgress``.

    Returns:
        The HTML produced by ``format_reranking_results_html``, or an error /
        "no results" message string when input parsing fails, ingredient
        embeddings are missing, or nothing was matched.
    """
    progress_tracker = SafeProgress(progress)
    progress_tracker(0, desc=f"Starting Voyage reranking for {match_type}...")

    product_names, error = parse_input(product_input, is_file)
    if error:
        return error

    expanded_descriptions = {}
    if use_expansion:
        # Reported at 0.2 so the bar never moves backwards when the next
        # step reports its own progress.
        progress_tracker(0.2, desc="Expanding product descriptions...")
        expanded_descriptions = expand_product_descriptions(product_names, progress=progress)

    descriptions_for_matching = expanded_descriptions if use_expansion else None

    # NOTE(review): this calls the imported hybrid_*_matching functions, not
    # the *_voyage variants defined below - confirm that is intended.
    if match_type == "categories":
        progress_tracker(0.3, desc="Loading categories...")
        categories = load_categories()

        progress_tracker(0.5, desc="Finding and re-ranking categories...")
        match_results = hybrid_category_matching(
            product_names,
            categories,
            embedding_top_n=int(embedding_top_n),
            final_top_n=int(final_top_n),
            confidence_threshold=0.0,  # Threshold is applied by the formatter.
            expanded_descriptions=descriptions_for_matching,
            progress=progress,
        )
    else:  # ingredients
        if not embeddings:
            return "\nError: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.\n"

        progress_tracker(0.5, desc="Finding and re-ranking ingredients...")
        match_results = hybrid_ingredient_matching(
            product_names,
            embeddings,
            embedding_top_n=int(embedding_top_n),
            final_top_n=int(final_top_n),
            confidence_threshold=0.0,  # Threshold is applied by the formatter.
            expanded_descriptions=descriptions_for_matching,
            progress=progress,
        )

    progress_tracker(0.9, desc="Formatting results...")

    # Convert to the unified record format expected by the formatter. Every
    # product is included, even when it has no matches.
    formatted_results = []
    for product, matches in match_results.items():
        labels, scores = _collect_items_and_scores(matches, match_type)
        formatted_results.append({
            "product_name": product,
            "confidence": max(item[-1] for item in matches) if matches else 0,
            "matching_items": labels,
            "item_scores": scores,
            "explanation": expanded_descriptions.get(product, "") if use_expansion else "",
        })

    if not formatted_results:
        return "\nNo results found. Please check your input or try different products.\n"

    result_html = format_reranking_results_html(
        results=formatted_results,
        match_type=match_type,
        show_scores=True,
        include_explanation=use_expansion,
        method="voyage",
        confidence_threshold=confidence_threshold,
    )

    progress_tracker(1.0, desc="Done!")
    return result_html


def _read_field(obj, name):
    """Return ``obj[name]``, falling back to ``getattr(obj, name)``.

    Lets the rerank response be consumed whether the client hands back plain
    mappings or attribute-style result objects.
    """
    try:
        return obj[name]
    except (TypeError, KeyError):
        return getattr(obj, name)


def _voyage_two_stage_matching(
    products,
    candidates_dict,
    *,
    embedding_top_n,
    final_top_n,
    expanded_descriptions,
    progress,
    label,
    stage_one_desc,
    query_prefix,
    error_prefix,
):
    """Shortlist candidates by embedding similarity, then rerank them with Voyage.

    Shared implementation behind the ingredient and category variants; the
    keyword-only string arguments carry the wording that differs between them.

    Returns:
        Dict mapping each product name to a list of ``(text, score)`` tuples
        (at most ``final_top_n``), to an empty list when no candidates were
        found, or to the first embedding candidate when reranking failed.
    """
    # Imported here as in the original functions rather than at module level.
    from embeddings import create_product_embeddings

    # Slice bounds must be integers; callers may pass floats.
    embedding_top_n = int(embedding_top_n)
    final_top_n = int(final_top_n)

    progress_tracker = SafeProgress(progress, desc=label)
    progress_tracker(0.1, desc=stage_one_desc)

    # Stage 1: embed each product (using its expanded description when one is
    # available) and keep results keyed by the original product name.
    if expanded_descriptions:
        texts = [expanded_descriptions.get(name, name) for name in products]
        by_text = create_product_embeddings(texts, progress=progress_tracker)
        product_embeddings = {
            name: by_text[text]
            for name, text in zip(products, texts)
            if text in by_text
        }
    else:
        product_embeddings = create_product_embeddings(products, progress=progress_tracker)

    similarities = compute_similarities(candidates_dict, product_embeddings)
    embedding_results = {
        product: ranked[:embedding_top_n]
        for product, ranked in similarities.items()
    }

    progress_tracker(0.4, desc="Stage 2: Re-ranking with Voyage AI")
    voyage_client = get_voyage_client()

    # Stage 2: rerank each product's shortlist.
    final_results = {}
    for i, product in enumerate(products):
        progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")

        candidates = embedding_results.get(product)
        if not candidates:
            final_results[product] = []
            continue

        # The first element of each candidate entry is used as document text.
        documents = [c[0] for c in candidates]

        try:
            product_text = product
            if expanded_descriptions and product in expanded_descriptions:
                product_text = expanded_descriptions[product]

            reranked = voyage_client.rerank(
                query=f"{query_prefix}{product_text}",
                documents=documents,
                model="rerank-2",
            )

            # No threshold filtering here; callers filter at display time.
            voyage_results = [
                (_read_field(result, "document"), _read_field(result, "relevance_score"))
                for result in _read_field(reranked, "results")
            ]
            final_results[product] = voyage_results[:final_top_n]
        except Exception as e:
            # Best-effort: a reranking failure must not abort the whole batch.
            print(f"{error_prefix} '{product}': {e}")
            # Fall back to the top embedding candidate only.
            final_results[product] = candidates[:1]

    progress_tracker(1.0, desc=f"{label} complete")
    return final_results


def hybrid_ingredient_matching_voyage(products, ingredients_dict,
                                      embedding_top_n=20, final_top_n=5,
                                      confidence_threshold=0.5,
                                      expanded_descriptions=None, progress=None):
    """Match products to ingredients using embeddings plus Voyage AI reranking.

    Args:
        products: Product names to match.
        ingredients_dict: Candidate data passed to ``compute_similarities``.
        embedding_top_n: Candidates kept per product after the embedding stage.
        final_top_n: Reranked matches kept per product.
        confidence_threshold: Accepted for interface compatibility; not
            applied here.
        expanded_descriptions: Optional mapping of product name to an expanded
            description used for embedding and as the rerank query text.
        progress: Optional progress object wrapped in ``SafeProgress``.

    Returns:
        Dict mapping each product to its list of matches.
    """
    return _voyage_two_stage_matching(
        products,
        ingredients_dict,
        embedding_top_n=embedding_top_n,
        final_top_n=final_top_n,
        expanded_descriptions=expanded_descriptions,
        progress=progress,
        label="Voyage ingredient matching",
        stage_one_desc="Stage 1: Finding candidates with embeddings",
        query_prefix="Which ingredient best matches: ",
        error_prefix="Error during Voyage reranking for",
    )


def hybrid_category_matching_voyage(products, categories_dict,
                                    embedding_top_n=20, final_top_n=5,
                                    confidence_threshold=0.5,
                                    expanded_descriptions=None, progress=None):
    """Match products to categories using embeddings plus Voyage AI reranking.

    Args:
        products: Product names to match.
        categories_dict: Candidate data passed to ``compute_similarities``.
        embedding_top_n: Candidates kept per product after the embedding stage.
        final_top_n: Reranked matches kept per product.
        confidence_threshold: Accepted for interface compatibility; not
            applied here.
        expanded_descriptions: Optional mapping of product name to an expanded
            description used for embedding and as the rerank query text.
        progress: Optional progress object wrapped in ``SafeProgress``.

    Returns:
        Dict mapping each product to its list of matches.
    """
    return _voyage_two_stage_matching(
        products,
        categories_dict,
        embedding_top_n=embedding_top_n,
        final_top_n=final_top_n,
        expanded_descriptions=expanded_descriptions,
        progress=progress,
        label="Voyage category matching",
        stage_one_desc="Stage 1: Finding candidate categories with embeddings",
        query_prefix="Which food category best matches: ",
        error_prefix="Error during Voyage category reranking for",
    )