import json
import numpy as np
from typing import Dict, List, Tuple, Any
import concurrent.futures
import time
import os
from api_utils import get_openai_client, get_voyage_client, process_in_parallel, rank_ingredients_openai
from ui_formatters import format_comparison_html, create_results_container


def _top_embedding_matches(candidates, limit, threshold):
    """Return the first ``limit`` candidates as (name, score) pairs, dropping scores below ``threshold``."""
    return [(c[0], c[1]) for c in candidates[:limit] if c[1] >= threshold]


def _chicory_matches(chicory_data, threshold):
    """Turn one Chicory parser record into a list with at most one (ingredient, confidence) pair."""
    if not isinstance(chicory_data, dict):
        return []
    ingredient = chicory_data.get("ingredient", "")
    confidence = chicory_data.get("confidence", 0)
    if ingredient and confidence >= threshold:
        return [(ingredient, confidence)]
    return []


def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str, Any],
                              embedding_top_n: int = 20, final_top_n: int = 3,
                              confidence_threshold: float = 0.5,
                              progress=None) -> Dict[str, Dict[str, List[Tuple]]]:
    """
    Compare four different methods for ingredient matching:
    1. Base embeddings (without re-ranking)
    2. Voyage AI reranker (applied to the embedding candidates)
    3. Chicory parser
    4. OpenAI ranking of the embedding candidates (model "gpt-4o-mini")

    If the Voyage or OpenAI call raises for a product, that product's entry
    for the failing method falls back to the thresholded embedding matches
    and the error is printed.

    Args:
        products: List of product names to categorize
        ingredients_dict: Dictionary of ingredient names to embeddings
        embedding_top_n: Number of top ingredients to retrieve using embeddings
        final_top_n: Number of final results to show for each method
        confidence_threshold: Minimum score threshold for final results
        progress: Optional progress tracking object

    Returns:
        Dictionary mapping each product to a dict with the keys "base",
        "voyage", "chicory" and "openai". "base", "voyage" and "chicory" hold
        lists of (ingredient, score) tuples; "openai" holds whatever
        rank_ingredients_openai returns (or the embedding fallback).
        An empty ``products`` list yields an empty dictionary.
    """
    from utils import SafeProgress
    from embeddings import create_product_embeddings
    from chicory_api import call_chicory_parser
    from similarity import compute_similarities

    # Nothing to compare. Returning early also avoids handing max_workers=0
    # to the parallel runner below (min(20, 0) == 0), which thread pools reject.
    if not products:
        return {}

    progress_tracker = SafeProgress(progress, desc="Comparing ingredient matching methods")

    # Step 1: Generate embeddings for all products (used by multiple methods)
    progress_tracker(0.1, desc="Generating product embeddings")
    product_embeddings = create_product_embeddings(products, progress=progress_tracker)

    # Step 2: Get embedding-based candidates for all products
    progress_tracker(0.2, desc="Finding embedding candidates")
    similarities = compute_similarities(ingredients_dict, product_embeddings)

    # Keep only the top N candidates per product
    embedding_results = {
        product: product_similarities[:embedding_top_n]
        for product, product_similarities in similarities.items()
    }

    # Step 3: Call Chicory Parser API (this is done for all products at once)
    progress_tracker(0.3, desc="Calling Chicory Parser API")
    chicory_results = call_chicory_parser(products, progress=progress_tracker)

    # Seed the results with the methods that need no further API calls.
    # Chicory does not depend on the embedding candidates, so its match is
    # recorded even for a product that produced no embedding results.
    comparison_results = {}
    for product in products:
        candidates = embedding_results.get(product, [])
        comparison_results[product] = {
            "base": _top_embedding_matches(candidates, final_top_n, confidence_threshold),
            "voyage": [],
            "chicory": _chicory_matches(chicory_results.get(product), confidence_threshold),
            "openai": [],
        }

    # Initialize clients for reranking
    voyage_client = get_voyage_client()
    openai_client = get_openai_client()

    def process_voyage_reranking(product):
        """Rerank one product's embedding candidates with Voyage AI."""
        candidates = embedding_results.get(product)
        if not candidates:
            return product, []

        candidate_texts = [f"Ingredient: {c[0]}" for c in candidates]
        # Map each document text back to its ingredient in O(1). setdefault
        # keeps the first occurrence, the same choice list.index() would make.
        ingredient_by_text = {}
        for text, candidate in zip(candidate_texts, candidates):
            ingredient_by_text.setdefault(text, candidate[0])

        try:
            # The product name itself is used as the rerank query
            reranking = voyage_client.rerank(
                query=product,
                documents=candidate_texts,
                model="rerank-2",
                top_k=final_top_n
            )

            voyage_ingredients = []
            for result in reranking.results:
                ingredient = ingredient_by_text[result.document]
                score = float(result.relevance_score)
                # Only include results above the confidence threshold
                if score >= confidence_threshold:
                    voyage_ingredients.append((ingredient, score))
            return product, voyage_ingredients
        except Exception as e:
            print(f"Error during Voyage reranking for '{product}': {e}")
            # Fall back to embedding results
            return product, _top_embedding_matches(candidates, final_top_n, confidence_threshold)

    def process_openai(product):
        """Rank one product's embedding candidates with the OpenAI helper."""
        candidates = embedding_results.get(product)
        if not candidates:
            return product, []

        candidate_ingredients = [c[0] for c in candidates]
        try:
            # Use the shared utility function
            openai_ingredients = rank_ingredients_openai(
                product=product,
                candidates=candidate_ingredients,
                client=openai_client,
                model="gpt-4o-mini",
                max_results=final_top_n,
                confidence_threshold=confidence_threshold
            )
            return product, openai_ingredients
        except Exception as e:
            print(f"Error during OpenAI processing for '{product}': {e}")
            # Fall back to embedding results
            return product, _top_embedding_matches(candidates, final_top_n, confidence_threshold)

    # products is non-empty here, so this is always at least 1
    max_workers = min(20, len(products))

    # Process Voyage AI reranking in parallel
    progress_tracker(0.4, desc="Running Voyage AI reranking in parallel")
    voyage_results = process_in_parallel(
        items=products,
        processor_func=process_voyage_reranking,
        max_workers=max_workers,
        progress_tracker=progress_tracker,
        progress_start=0.4,
        progress_end=0.65,
        progress_desc="Voyage AI"
    )

    # Update comparison results with Voyage results
    for product, results in voyage_results.items():
        if product in comparison_results:
            comparison_results[product]["voyage"] = results

    # Process OpenAI queries in parallel
    progress_tracker(0.7, desc="Running OpenAI processing in parallel")
    openai_results = process_in_parallel(
        items=products,
        processor_func=process_openai,
        max_workers=max_workers,
        progress_tracker=progress_tracker,
        progress_start=0.7,
        progress_end=0.95,
        progress_desc="OpenAI"
    )

    # Update comparison results with OpenAI results
    for product, results in openai_results.items():
        if product in comparison_results:
            comparison_results[product]["openai"] = results

    progress_tracker(1.0, desc="Comparison complete")
    return comparison_results


def compare_ingredient_methods_ui(product_input, is_file=False, embedding_top_n=20,
                              final_top_n=3, confidence_threshold=0.5, progress=None):
    """
    Compare multiple ingredient matching methods on the same products

    Args:
        product_input: Text input with product names or file path
        is_file: Whether the input is a file
        embedding_top_n: Number of top ingredients to retrieve using embeddings
        final_top_n: Number of final results to show for each method
        confidence_threshold: Minimum score threshold for final results
        progress: Optional progress tracking object

    Returns:
        HTML formatted comparison results
    """
    from utils import SafeProgress, load_embeddings

    progress_tracker = SafeProgress(progress, desc="Comparing ingredient matching methods")
    progress_tracker(0.1, desc="Processing input")

    # Split text input by lines and remove empty lines
    if not product_input:
        return "Please enter at least one product."

    product_names = [p.strip() for p in product_input.split('\n') if p.strip()]
    if not product_names:
        return "Please enter at least one product."
# Load ingredient embeddings try: progress_tracker(0.2, desc="Loading ingredient embeddings") ingredients_dict = load_embeddings("data/ingredient_embeddings_voyageai.pkl") progress_tracker(0.3, desc="Comparing methods") comparison_results = compare_ingredient_methods( products=product_names, ingredients_dict=ingredients_dict, embedding_top_n=embedding_top_n, final_top_n=final_top_n, confidence_threshold=confidence_threshold, progress=progress_tracker ) except Exception as e: import traceback error_details = traceback.format_exc() return f"
{error_details}