Spaces:

eliago
/

product_ingredient_demo

Sleeping

App Files Community

esilver committed on Mar 25

Commit

3b88add

1 Parent(s): 8007258

Fixed bugs

Browse files

Files changed (6) hide show

comparison.py +236 -120
data/category_embeddings_voyageai.pkl +3 -0
ui.py +17 -2
ui_formatters.py +10 -8
ui_hybrid_matching.py +90 -1
ui_ingredient_matching.py +2 -1

comparison.py CHANGED Viewed

@@ -1,40 +1,40 @@
 import json
 import numpy as np
 from typing import Dict, List, Tuple, Any
-import concurrent.futures
-import time
-import os
-from api_utils import get_openai_client, get_voyage_client, process_in_parallel, rank_ingredients_openai
 from ui_formatters import format_comparison_html, create_results_container
 def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str, Any],
                             embedding_top_n: int = 20, final_top_n: int = 3,
-                            confidence_threshold: float = 0.5,
-                            progress=None) -> Dict[str, Dict[str, List[Tuple]]]:
     """
-    Compare four different methods for ingredient matching:
-    1. Base embeddings (without re-ranking)
-    2. Voyage AI reranker (via hybrid approach)
-    3. Chicory parser
-    4. GPT-4o structured output
     Args:
-        products: List of product names to categorize
-        ingredients_dict: Dictionary of ingredient names to embeddings
         embedding_top_n: Number of top ingredients to retrieve using embeddings
         final_top_n: Number of final results to show for each method
         confidence_threshold: Minimum score threshold for final results
         progress: Optional progress tracking object
     Returns:
-        Dictionary mapping products to results from each method
     """
-    from utils import SafeProgress, preprocess_product_for_matching
-    from embeddings import create_product_embeddings
-    from chicory_api import call_chicory_parser
-    from similarity import compute_similarities
-    progress_tracker = SafeProgress(progress, desc="Comparing ingredient matching methods")
     # Step 1: Generate embeddings for all products (used by multiple methods)
     progress_tracker(0.1, desc="Generating product embeddings")
@@ -49,112 +49,144 @@ def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str,
     for product, product_similarities in similarities.items():
         embedding_results[product] = product_similarities[:embedding_top_n]
-    # Step 3: Call Chicory Parser API (this is done for all products at once)
-    progress_tracker(0.3, desc="Calling Chicory Parser API")
     chicory_results = call_chicory_parser(products, progress=progress_tracker)
-    # Create final results dictionary with base embeddings (which don't need any further processing)
     comparison_results = {}
     for product in products:
         if product in embedding_results:
-            # Initialize with base embeddings already calculated
-            candidates = embedding_results[product]
-            base_results = [(c[0], c[1]) for c in candidates[:final_top_n] if c[1] >= confidence_threshold]
-            comparison_results[product] = {
-                "base": base_results,
-                "voyage": [],
-                "chicory": [],
-                "openai": []
-            }
-            # Also process Chicory results immediately as they're already fetched
-            chicory_matches = []
-            if product in chicory_results:
-                chicory_data = chicory_results[product]
-                if isinstance(chicory_data, dict):
                     ingredient = chicory_data.get("ingredient", "")
                     confidence = chicory_data.get("confidence", 0)
                     if ingredient and confidence >= confidence_threshold:
                         chicory_matches.append((ingredient, confidence))
-            comparison_results[product]["chicory"] = chicory_matches
-        else:
-            comparison_results[product] = {
-                "base": [],
-                "voyage": [],
-                "chicory": [],
-                "openai": []
-            }
-    # Initialize clients for reranking - REPLACED WITH UTILITY FUNCTIONS
-    voyage_client = get_voyage_client()
-    openai_client = get_openai_client()
-    # Define the methods that will be executed in parallel (now focused only on the API-heavy tasks)
-    def process_voyage_reranking(product):
-        if product not in embedding_results or not embedding_results[product]:
-            return product, []
-        candidates = embedding_results[product]
-        candidate_ingredients = [c[0] for c in candidates]
-        candidate_texts = [f"Ingredient: {c[0]}" for c in candidates]
         try:
-            # Apply Voyage reranking to the candidates
-            query = product  # Use product directly as query
-            reranking = voyage_client.rerank(
-                query=query,
-                documents=candidate_texts,
-                model="rerank-2",
-                top_k=final_top_n
-            )
-            # Process reranking results
-            voyage_ingredients = []
-            for result in reranking.results:
-                # Find the ingredient for this result
-                candidate_index = candidate_texts.index(result.document)
-                ingredient = candidate_ingredients[candidate_index]
-                score = float(result.relevance_score)
-                # Only include results above the confidence threshold
-                if score >= confidence_threshold:
-                    voyage_ingredients.append((ingredient, score))
-            return product, voyage_ingredients
         except Exception as e:
-            print(f"Error during Voyage reranking for '{product}': {e}")
-            # Fall back to embedding results
-            return product, [(c[0], c[1]) for c in candidates[:final_top_n] if c[1] >= confidence_threshold]
-    def process_openai(product):
-        if product not in embedding_results or not embedding_results[product]:
             return product, []
-        candidates = embedding_results[product]
-        candidate_ingredients = [c[0] for c in candidates]
-        try:
-            # Use the shared utility function
-            openai_ingredients = rank_ingredients_openai(
-                product=product,
-                candidates=candidate_ingredients,
-                client=openai_client,
-                model="gpt-4o-mini",
-                max_results=final_top_n,
-                confidence_threshold=confidence_threshold
-            )
-            return product, openai_ingredients
-        except Exception as e:
-            print(f"Error during OpenAI processing for '{product}': {e}")
-            # Fall back to embedding results
-            return product, [(c[0], c[1]) for c in candidates[:final_top_n] if c[1] >= confidence_threshold]
-    # Process Voyage AI reranking in parallel - REPLACED WITH SHARED UTILITY
-    progress_tracker(0.4, desc="Running Voyage AI reranking in parallel")
     voyage_results = process_in_parallel(
         items=products,
-        processor_func=process_voyage_reranking,
         max_workers=min(20, len(products)),
         progress_tracker=progress_tracker,
         progress_start=0.4,
@@ -167,8 +199,44 @@ def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str,
         if product in comparison_results:
             comparison_results[product]["voyage"] = results
-    # Process OpenAI queries in parallel - REPLACED WITH SHARED UTILITY
     progress_tracker(0.7, desc="Running OpenAI processing in parallel")
     openai_results = process_in_parallel(
         items=products,
         processor_func=process_openai,
@@ -184,20 +252,52 @@ def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str,
         if product in comparison_results:
             comparison_results[product]["openai"] = results
     progress_tracker(1.0, desc="Comparison complete")
     return comparison_results
-def compare_ingredient_methods_ui(product_input, is_file=False, embedding_top_n=20,
-                                final_top_n=3, confidence_threshold=0.5, progress=None):
     """
     Compare multiple ingredient matching methods on the same products
     Args:
         product_input: Text input with product names or file path
-        is_file: Whether the input is a file
         embedding_top_n: Number of top ingredients to retrieve using embeddings
         final_top_n: Number of final results to show for each method
         confidence_threshold: Minimum score threshold for final results
         progress: Optional progress tracking object
     Returns:
@@ -205,10 +305,9 @@ def compare_ingredient_methods_ui(product_input, is_file=False, embedding_top_n=
     """
     from utils import SafeProgress, load_embeddings
-    progress_tracker = SafeProgress(progress, desc="Comparing ingredient matching methods")
     progress_tracker(0.1, desc="Processing input")
     # Split text input by lines and remove empty lines
     if not product_input:
         return "Please enter at least one product."
@@ -216,19 +315,37 @@ def compare_ingredient_methods_ui(product_input, is_file=False, embedding_top_n=
     if not product_names:
         return "Please enter at least one product."
-    # Load ingredient embeddings
     try:
-        progress_tracker(0.2, desc="Loading ingredient embeddings")
-        ingredients_dict = load_embeddings("data/ingredient_embeddings_voyageai.pkl")
         progress_tracker(0.3, desc="Comparing methods")
         comparison_results = compare_ingredient_methods(
             products=product_names,
-            ingredients_dict=ingredients_dict,
             embedding_top_n=embedding_top_n,
             final_top_n=final_top_n,
             confidence_threshold=confidence_threshold,
-            progress=progress_tracker
         )
     except Exception as e:
         import traceback
@@ -237,7 +354,6 @@ def compare_ingredient_methods_ui(product_input, is_file=False, embedding_top_n=
     # Format results as HTML using centralized formatters
     progress_tracker(0.9, desc="Formatting results")
     result_elements = []
     for product in product_names:
         if product in comparison_results:
@@ -245,7 +361,7 @@ def compare_ingredient_methods_ui(product_input, is_file=False, embedding_top_n=
     output_html = create_results_container(
         result_elements,
-        header_text=f"Comparing {len(product_names)} products using multiple ingredient matching methods."
     )
     progress_tracker(1.0, desc="Complete")

 import json
 import numpy as np
 from typing import Dict, List, Tuple, Any
+from category_matching import hybrid_category_matching
+from similarity import hybrid_ingredient_matching
+from api_utils import process_in_parallel, rank_ingredients_openai
 from ui_formatters import format_comparison_html, create_results_container
+from utils import SafeProgress
+from chicory_api import call_chicory_parser
+from embeddings import create_product_embeddings
+from similarity import compute_similarities
 def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str, Any],
                             embedding_top_n: int = 20, final_top_n: int = 3,
+                            confidence_threshold: float = 0.5, match_type="ingredients",
+                            progress=None, expanded_descriptions=None) -> Dict[str, Dict[str, List[Tuple]]]:
     """
+    Compare multiple ingredient/category matching methods on the same products
     Args:
+        products: List of product names to process
+        ingredients_dict: Dictionary with ingredient embeddings
         embedding_top_n: Number of top ingredients to retrieve using embeddings
         final_top_n: Number of final results to show for each method
         confidence_threshold: Minimum score threshold for final results
+        match_type: Type of matching to perform ('ingredients' or 'categories')
         progress: Optional progress tracking object
     Returns:
+        Dictionary mapping products to methods and their results
     """
+    progress_tracker = SafeProgress(progress, desc="Comparing matching methods")
     # Step 1: Generate embeddings for all products (used by multiple methods)
     progress_tracker(0.1, desc="Generating product embeddings")
     for product, product_similarities in similarities.items():
         embedding_results[product] = product_similarities[:embedding_top_n]
+    # Step 3: Process with Chicory Parser
+    progress_tracker(0.3, desc="Running Chicory Parser")
+    # Import here to avoid circular imports
+    # from chicory_parser import parse_products
     chicory_results = call_chicory_parser(products, progress=progress_tracker)
+    # Initialize result structure
     comparison_results = {}
     for product in products:
+        comparison_results[product] = {
+            "base": [],
+            "voyage": [],
+            "chicory": [],
+            "openai": []
+        }
+        # Add basic embedding results
         if product in embedding_results:
+            base_results = []
+            for name, score in embedding_results[product]:
+                if score >= confidence_threshold:
+                    base_results.append((name, score))
+            comparison_results[product]["base"] = base_results[:final_top_n]
+        # Process Chicory results
+        chicory_matches = []
+        if product in chicory_results:
+            chicory_data = chicory_results[product]
+            if isinstance(chicory_data, dict):
+                # Handle different response formats based on match type
+                if match_type == "ingredients":
                     ingredient = chicory_data.get("ingredient", "")
                     confidence = chicory_data.get("confidence", 0)
                     if ingredient and confidence >= confidence_threshold:
                         chicory_matches.append((ingredient, confidence))
+                else:  # categories
+                    category = chicory_data.get("category", "")
+                    confidence = chicory_data.get("confidence", 0)
+                    if category and confidence >= confidence_threshold:
+                        chicory_matches.append((category, confidence))
+        comparison_results[product]["chicory"] = chicory_matches
+    # Step 4: Process with Voyage AI
+    progress_tracker(0.4, desc="Processing with Voyage AI")
+    # Define processing function for Voyage
+    def process_voyage(product):
         try:
+            # Get candidates from embedding results
+            candidates = []
+            if product in embedding_results:
+                candidates = embedding_results[product]
+            if not candidates:
+                print(f"No candidates found for product: {product}")
+                return product, []
+            # Rerank using Voyage
+            try:
+                if match_type == "ingredients":
+                    # Create a proper dictionary with just this product if expanded_descriptions exists
+                    expanded_product_desc = None
+                    if expanded_descriptions and product in expanded_descriptions:
+                        expanded_product_desc = {product: expanded_descriptions.get(product)}
+                    # Convert candidates to the expected dictionary format
+                    ingredient_dict = {}
+                    for c in candidates:
+                        if c[0] in ingredients_dict:  # Get from the original embeddings
+                            ingredient_dict[c[0]] = ingredients_dict[c[0]]
+                    results = hybrid_ingredient_matching(
+                        [product],  # Pass as a list of one product
+                        ingredient_dict,
+                        expanded_descriptions=expanded_product_desc
+                    )
+                else:
+                    # Convert candidates to the expected format
+                    candidate_dict = {c[0]: c[0] for c in candidates}
+                    results = hybrid_category_matching(
+                        products=[product],
+                        categories=candidate_dict,
+                        embedding_top_n=embedding_top_n,
+                        final_top_n=final_top_n,
+                        confidence_threshold=confidence_threshold,
+                        expanded_descriptions=expanded_descriptions
+                    )
+                # Handle special case: if results is a dictionary with product as key
+                if isinstance(results, dict):
+                    results = results.get(product, [])
+                # No need to check 'product in results' when results is not a dict
+                # Ensure results are in the expected format
+                formatted_results = []
+                for r in results[:final_top_n]:
+                    if isinstance(r, dict) and "name" in r and "score" in r:
+                        # Convert score to float to ensure type compatibility
+                        try:
+                            score = float(r["score"])
+                            if score >= confidence_threshold:
+                                formatted_results.append((r["name"], score))
+                        except (ValueError, TypeError):
+                            print(f"Invalid score format in result: {r}")
+                    elif isinstance(r, tuple) and len(r) >= 2:
+                        try:
+                            # Handle 3-element tuples from category matching (id, description, score)
+                            if len(r) >= 3:
+                                score = float(r[2])  # Score is the third element
+                                name = r[0]  # Use category ID as the name
+                            else:
+                                # Standard 2-element tuple (name, score)
+                                score = float(r[1])
+                                name = r[0]
+                            if score >= confidence_threshold:
+                                formatted_results.append((name, score))
+                        except (ValueError, TypeError):
+                            print(f"Invalid score format in tuple: {r}")
+                return product, formatted_results
+            except Exception as e:
+                print(f"Error in Voyage AI reranking for {product}: {str(e)}")
+                # Fall back to embedding results
+                return product, [(c[0], c[1]) for c in candidates[:final_top_n]
+                                if c[1] >= confidence_threshold]
         except Exception as e:
+            print(f"Error processing {product} with Voyage: {str(e)}")
+            # Return an empty result as the ultimate fallback
             return product, []
+    # Process all products with Voyage in parallel
     voyage_results = process_in_parallel(
         items=products,
+        processor_func=process_voyage,
         max_workers=min(20, len(products)),
         progress_tracker=progress_tracker,
         progress_start=0.4,
         if product in comparison_results:
             comparison_results[product]["voyage"] = results
+    # Step 5: Process with OpenAI
     progress_tracker(0.7, desc="Running OpenAI processing in parallel")
+    # Define processing function for OpenAI
+    def process_openai(product):
+        try:
+            # Get candidates from embedding results
+            candidates = []
+            if product in embedding_results:
+                candidates = embedding_results[product]
+            if not candidates:
+                return product, []
+            from api_utils import rank_ingredients_openai
+            # Extract just the names for OpenAI
+            candidate_names = [c[0] for c in candidates]
+            # Use appropriate function based on match type
+            if match_type == "ingredients":
+                ranked_candidates = rank_ingredients_openai(product, candidate_names)
+            else:
+                # For categories, use a similar function but with category prompt
+                from api_utils import rank_categories_openai
+                # Convert the list of names to the dictionary format expected by rank_categories_openai
+                categories_dict = {name: name for name in candidate_names}
+                ranked_candidates = rank_categories_openai(product, categories_dict)
+            return product, [(c[0], c[1]) for c in ranked_candidates[:final_top_n]
+                            if c[1] >= confidence_threshold]
+        except Exception as e:
+            print(f"Error processing {product} with OpenAI: {str(e)}")
+            return product, []
+    # Process all products with OpenAI in parallel
     openai_results = process_in_parallel(
         items=products,
         processor_func=process_openai,
         if product in comparison_results:
             comparison_results[product]["openai"] = results
+    # After processing with each method, ensure consistent format
+    for product, method_results in comparison_results.items():
+        # Ensure all results are in the same format
+        for method in method_results:
+            formatted_results = []
+            for item in method_results[method]:
+                # Convert all results to (name, score) tuples
+                if isinstance(item, tuple) and len(item) >= 2:
+                    formatted_results.append((str(item[0]), float(item[1])))
+                elif isinstance(item, dict):
+                    if "ingredient" in item:
+                        name = item["ingredient"]
+                    elif "category" in item:
+                        name = item["category"]
+                    else:
+                        name = str(item)
+                    if "relevance_score" in item:
+                        score = float(item["relevance_score"])
+                    elif "confidence" in item:
+                        score = float(item["confidence"])
+                    else:
+                        score = 0.0
+                    formatted_results.append((name, score))
+                else:
+                    formatted_results.append((str(item), 0.0))
+            method_results[method] = formatted_results
     progress_tracker(1.0, desc="Comparison complete")
     return comparison_results
+def compare_ingredient_methods_ui(product_input, embedding_top_n=20,
+                                final_top_n=3, confidence_threshold=0.5,
+                                match_type="categories", use_expansion=False, progress=None):
     """
     Compare multiple ingredient matching methods on the same products
     Args:
         product_input: Text input with product names or file path
         embedding_top_n: Number of top ingredients to retrieve using embeddings
         final_top_n: Number of final results to show for each method
         confidence_threshold: Minimum score threshold for final results
+        match_type: Type of matching to perform ('ingredients' or 'categories')
+        use_expansion: Whether to use description expansion
         progress: Optional progress tracking object
     Returns:
     """
     from utils import SafeProgress, load_embeddings
+    progress_tracker = SafeProgress(progress, desc="Comparing matching methods")
     progress_tracker(0.1, desc="Processing input")
     # Split text input by lines and remove empty lines
     if not product_input:
         return "Please enter at least one product."
     if not product_names:
         return "Please enter at least one product."
+    # Load appropriate embeddings based on match type
     try:
+        progress_tracker(0.2, desc="Loading embeddings")
+        if match_type == "ingredients":
+            embeddings_path = "data/ingredient_embeddings_voyageai.pkl"
+            embeddings_dict = load_embeddings(embeddings_path)
+            header_text = f"Comparing {len(product_names)} products using multiple ingredient matching methods."
+        else:  # categories
+            embeddings_path = "data/category_embeddings.pickle"
+            embeddings_dict = load_embeddings(embeddings_path)
+            header_text = f"Comparing {len(product_names)} products using multiple category matching methods."
+        # Initialize expanded_products variable
+        expanded_products = None
+        # Expand descriptions if requested
+        if use_expansion:
+            from openai_expansion import expand_product_descriptions
+            progress_tracker(0.25, desc="Expanding product descriptions")
+            expanded_products = expand_product_descriptions(product_names, progress=progress_tracker)
         progress_tracker(0.3, desc="Comparing methods")
         comparison_results = compare_ingredient_methods(
             products=product_names,
+            ingredients_dict=embeddings_dict,
             embedding_top_n=embedding_top_n,
             final_top_n=final_top_n,
             confidence_threshold=confidence_threshold,
+            match_type=match_type,
+            progress=progress_tracker,
+            expanded_descriptions=expanded_products
         )
     except Exception as e:
         import traceback
     # Format results as HTML using centralized formatters
     progress_tracker(0.9, desc="Formatting results")
     result_elements = []
     for product in product_names:
         if product in comparison_results:
     output_html = create_results_container(
         result_elements,
+        header_text=header_text
     )
     progress_tracker(1.0, desc="Complete")

data/category_embeddings_voyageai.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c51642451d7f5853975e974b46d7466c1a4c238f9caaa302c7ad454111c4fed
+size 1275461

ui.py CHANGED Viewed

@@ -149,6 +149,20 @@ def create_demo():
                                 label="Confidence threshold"
                             )
                         compare_btn = gr.Button("Compare Methods", variant="primary")
                         compare_examples_btn = gr.Button("Load Examples", variant="secondary")
@@ -160,10 +174,11 @@ def create_demo():
                     fn=compare_ingredient_methods_ui,
                     inputs=[
                         compare_product_input,
-                        gr.State(False),  # Always text input mode
                         compare_embedding_top_n,
                         compare_final_top_n,
-                        compare_confidence_threshold
                     ],
                     outputs=comparison_output
                 )

                                 label="Confidence threshold"
                             )
+                        compare_match_type = gr.Radio(
+                            choices=["categories", "ingredients"],
+                            value="categories",
+                            label="Match Type",
+                            info="Choose whether to match against ingredients or categories"
+                        )
+                        # Add expansion checkbox
+                        compare_expansion = gr.Checkbox(
+                            value=False,
+                            label="Use Description Expansion",
+                            info="Expand product descriptions using AI before matching"
+                        )
                         compare_btn = gr.Button("Compare Methods", variant="primary")
                         compare_examples_btn = gr.Button("Load Examples", variant="secondary")
                     fn=compare_ingredient_methods_ui,
                     inputs=[
                         compare_product_input,
                         compare_embedding_top_n,
                         compare_final_top_n,
+                        compare_confidence_threshold,
+                        compare_match_type,
+                        compare_expansion
                     ],
                     outputs=comparison_output
                 )

ui_formatters.py CHANGED Viewed

@@ -37,7 +37,7 @@ STYLES = {
     "header": f"background-color: {COLORS['header_bg']}; padding: 12px 15px; border-bottom: 1px solid {COLORS['card_border']};",
     "header_text": f"margin: 0; font-size: 18px; color: {COLORS['header_text']};",
     "flex_container": "display: flex; flex-wrap: wrap;",
-    "method_container": f"flex: 1; min-width: 200px; padding: 15px; border-right: 1px solid {COLORS['card_border']};",
     "method_title": f"margin-top: 0; color: {COLORS['text_primary']}; padding-bottom: 8px;",
     "item_list": "list-style-type: none; padding-left: 0;",
     "item": "margin-bottom: 8px; padding: 8px; border-radius: 4px;",
@@ -64,7 +64,8 @@ METHOD_NAMES = {
     "openai": "OpenAI",
     "expanded": "Expanded Description",
     "hybrid": "Hybrid Matching",
-    "categories": "Category Matches"
 }
 def format_method_results(method_key, results, color_hex=None):
@@ -175,8 +176,8 @@ def format_comparison_html(product, method_results):
     Returns:
         HTML string
     """
-    # Create the methods comparison content
-    methods_html = f"<div class='methods-comparison' style='{STYLES['flex_container']}'>"
     # Add results for each method
     for method_key in ["base", "voyage", "chicory", "openai"]:
@@ -502,7 +503,7 @@ def set_theme(theme_name):
             "header": f"background-color: {COLORS['header_bg']}; padding: 12px 15px; border-bottom: 1px solid {COLORS['card_border']};",
             "header_text": f"margin: 0; font-size: 18px; color: {COLORS['header_text']};",
             "flex_container": "display: flex; flex-wrap: wrap;",
-            "method_container": f"flex: 1; min-width: 200px; padding: 15px; border-right: 1px solid {COLORS['card_border']};",
             "method_title": f"margin-top: 0; color: {COLORS['text_primary']}; padding-bottom: 8px;",
             "item_list": "list-style-type: none; padding-left: 0;",
             "item": "margin-bottom: 8px; padding: 8px; border-radius: 4px;",
@@ -512,7 +513,7 @@ def set_theme(theme_name):
         return True
     return False
-def format_categories_html(product, categories, chicory_result=None, header_color=None, explanation=""):
     """
     Format category matching results as HTML
@@ -522,6 +523,7 @@ def format_categories_html(product, categories, chicory_result=None, header_colo
         chicory_result: Optional chicory parser result for the product
         header_color: Optional header background color
         explanation: Optional expanded description text
     Returns:
         HTML string
@@ -556,9 +558,9 @@ def format_categories_html(product, categories, chicory_result=None, header_colo
     # Add the category results
     content += format_method_results(
-        method_key="categories",
         results=categories,
-        color_hex=header_color or METHOD_COLORS.get("categories", "#1abc9c")
     )
     return format_result_card(title=product, content=content)

     "header": f"background-color: {COLORS['header_bg']}; padding: 12px 15px; border-bottom: 1px solid {COLORS['card_border']};",
     "header_text": f"margin: 0; font-size: 18px; color: {COLORS['header_text']};",
     "flex_container": "display: flex; flex-wrap: wrap;",
+    "method_container": f"flex: 1; width: 100%; padding: 15px; border-bottom: 1px solid {COLORS['card_border']};",
     "method_title": f"margin-top: 0; color: {COLORS['text_primary']}; padding-bottom: 8px;",
     "item_list": "list-style-type: none; padding-left: 0;",
     "item": "margin-bottom: 8px; padding: 8px; border-radius: 4px;",
     "openai": "OpenAI",
     "expanded": "Expanded Description",
     "hybrid": "Hybrid Matching",
+    "categories": "Category Matches",
+    "ingredients": "Ingredient Matches"
 }
 def format_method_results(method_key, results, color_hex=None):
     Returns:
         HTML string
     """
+    # Create the methods comparison content with column direction
+    methods_html = f"<div class='methods-comparison' style='{STYLES['flex_container']}; flex-direction: column;'>"
     # Add results for each method
     for method_key in ["base", "voyage", "chicory", "openai"]:
             "header": f"background-color: {COLORS['header_bg']}; padding: 12px 15px; border-bottom: 1px solid {COLORS['card_border']};",
             "header_text": f"margin: 0; font-size: 18px; color: {COLORS['header_text']};",
             "flex_container": "display: flex; flex-wrap: wrap;",
+            "method_container": f"flex: 1; width: 100%; padding: 15px; border-bottom: 1px solid {COLORS['card_border']};",
             "method_title": f"margin-top: 0; color: {COLORS['text_primary']}; padding-bottom: 8px;",
             "item_list": "list-style-type: none; padding-left: 0;",
             "item": "margin-bottom: 8px; padding: 8px; border-radius: 4px;",
         return True
     return False
+def format_categories_html(product, categories, chicory_result=None, header_color=None, explanation="", match_type="categories"):
     """
     Format category matching results as HTML
         chicory_result: Optional chicory parser result for the product
         header_color: Optional header background color
         explanation: Optional expanded description text
+        match_type: Either "ingredients" or "categories"
     Returns:
         HTML string
     # Add the category results
     content += format_method_results(
+        method_key=match_type,
         results=categories,
+        color_hex=header_color or METHOD_COLORS.get(match_type, "#1abc9c")
     )
     return format_result_card(title=product, content=content)

ui_hybrid_matching.py CHANGED Viewed

@@ -50,7 +50,7 @@ def categorize_products_with_voyage_reranking(product_input, is_file=False, use_
         # Use hybrid approach for ingredients with optional expanded descriptions
         progress_tracker(0.5, desc="Finding and re-ranking ingredients...")
-        match_results = hybrid_ingredient_matching_voyage(
             product_names, embeddings,
             embedding_top_n=int(embedding_top_n),
             final_top_n=int(final_top_n),
@@ -196,4 +196,93 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
             final_results[product] = candidates[:1]
     progress_tracker(1.0, desc="Voyage ingredient matching complete")
     return final_results

         # Use hybrid approach for ingredients with optional expanded descriptions
         progress_tracker(0.5, desc="Finding and re-ranking ingredients...")
+        match_results = hybrid_ingredient_matching(
             product_names, embeddings,
             embedding_top_n=int(embedding_top_n),
             final_top_n=int(final_top_n),
             final_results[product] = candidates[:1]
     progress_tracker(1.0, desc="Voyage ingredient matching complete")
+    return final_results
+# Category matching: embedding shortlist re-ranked with Voyage AI
+def hybrid_category_matching_voyage(products, categories_dict,
+                                   embedding_top_n=20, final_top_n=5,
+                                   confidence_threshold=0.5,
+                                   expanded_descriptions=None,
+                                   progress=None):
+    """Match products to categories: embedding shortlist, then Voyage AI re-ranking.
+
+    Stage 1 embeds each product (or its expanded description when one is
+    supplied) and keeps the first ``embedding_top_n`` entries returned by
+    ``compute_similarities``. Stage 2 sends those candidate names to the
+    Voyage reranker and keeps the first ``final_top_n`` reranked entries.
+
+    Args:
+        products: Product names; they are also the keys of the returned dict.
+        categories_dict: Passed to ``compute_similarities`` together with the
+            product embeddings (presumably category -> embedding; confirm
+            against the similarity module).
+        embedding_top_n: Number of stage-1 candidates kept per product.
+        final_top_n: Number of reranked results kept per product.
+        confidence_threshold: Not read anywhere in this function; results are
+            returned unfiltered so a caller can apply the threshold later.
+        expanded_descriptions: Optional mapping of product name -> expanded
+            text, used for embedding and for the rerank query when present.
+        progress: Optional progress object, wrapped in ``SafeProgress``.
+
+    Returns:
+        Dict mapping each product to a list of tuples. After a successful
+        rerank the tuples are ``(category_text, relevance_score)``. A product
+        with no stage-1 candidates maps to ``[]``. If reranking raises, the
+        product maps to the first stage-1 candidate only, in whatever tuple
+        form ``compute_similarities`` produced.
+    """
+    from utils import SafeProgress
+    from embeddings import create_product_embeddings
+    progress_tracker = SafeProgress(progress, desc="Voyage category matching")
+    progress_tracker(0.1, desc="Stage 1: Finding candidate categories with embeddings")
+    # Stage 1: use embeddings to find candidate categories for every product
+    if expanded_descriptions:
+        # Embed the expanded description when one exists, else the plain name
+        products_for_embedding = [expanded_descriptions.get(name, name) for name in products]
+        # The embeddings come back keyed by the embedded text; re-key them by
+        # the original product name so later lookups use consistent keys
+        product_embeddings = {}
+        temp_embeddings = create_product_embeddings(products_for_embedding, progress=progress_tracker)
+        # Products whose text has no entry in temp_embeddings are left out of
+        # product_embeddings
+        for i, product_name in enumerate(products):
+            if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
+                product_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
+    else:
+        # Standard embedding creation with just product names
+        product_embeddings = create_product_embeddings(products, progress=progress_tracker)
+    from similarity import compute_similarities
+    similarities = compute_similarities(categories_dict, product_embeddings)
+    # Keep the first embedding_top_n candidates per product
+    # NOTE(review): this is a "top N" only if compute_similarities returns
+    # each list sorted best-first - confirm in the similarity module
+    embedding_results = {}
+    for product, product_similarities in similarities.items():
+        embedding_results[product] = product_similarities[:embedding_top_n]
+    progress_tracker(0.4, desc="Stage 2: Re-ranking with Voyage AI")
+    # Initialize Voyage client
+    # NOTE(review): get_voyage_client is not imported inside this function, so
+    # it must already be available at module level - confirm
+    voyage_client = get_voyage_client()
+    # Stage 2: Re-rank using Voyage AI
+    final_results = {}
+    for i, product in enumerate(products):
+        # Stage 2 progress advances from 0.4 towards 0.9 across the products
+        progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")
+        # Nothing to rerank for products without stage-1 candidates
+        if product not in embedding_results or not embedding_results[product]:
+            final_results[product] = []
+            continue
+        candidates = embedding_results[product]
+        # The first element of each candidate tuple is used as the category text
+        candidate_categories = [c[0] for c in candidates]
+        try:
+            # Use expanded description if available
+            product_text = product
+            if expanded_descriptions and product in expanded_descriptions:
+                product_text = expanded_descriptions[product]
+            # Use plain strings for the documents
+            documents = candidate_categories
+            # Use Voyage reranking
+            reranked = voyage_client.rerank(
+                query=f"Which food category best matches: {product_text}",
+                documents=documents,
+                model="rerank-2"
+            )
+            # Collect every reranked entry as (text, score); no threshold is
+            # applied here
+            # NOTE(review): the dict-style access below assumes the response is
+            # a mapping; confirm the shape returned by this client. If it
+            # exposes attributes instead, the lookup raises and the except
+            # branch below takes over.
+            voyage_results = []
+            for result in reranked["results"]:
+                score = result["relevance_score"]
+                text = result["document"]
+                voyage_results.append((text, score))
+            # Limit to final_top_n but don't filter by threshold here
+            # NOTE(review): relies on the reranker returning results best-first
+            final_results[product] = voyage_results[:final_top_n]
+        except Exception as e:
+            # Best effort: report the failure and keep going with the others
+            print(f"Error during Voyage category reranking for '{product}': {e}")
+            # Fall back to the first stage-1 candidate only
+            final_results[product] = candidates[:1]
+    progress_tracker(1.0, desc="Voyage category matching complete")
     return final_results

ui_ingredient_matching.py CHANGED Viewed

@@ -72,7 +72,8 @@ def categorize_products(product_input, is_file=False, use_expansion=False, top_n
             product,
             top_similarities,
             chicory_result=chicory_data,
-            explanation=expansion_text
         )
         output_html += "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"

             product,
             top_similarities,
             chicory_result=chicory_data,
+            explanation=expansion_text,
+            match_type="ingredients",
         )
         output_html += "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"