Spaces:

eliago
/

product_ingredient_demo

Sleeping

App Files Files Community

esilver committed on Mar 24

Commit

39f78ce

1 Parent(s): 1e737a1

Fixed pipeline bugs

Browse files

Files changed (6) hide show

api_utils.py +25 -2
openai_expansion.py +4 -2
ui_category_matching.py +16 -4
ui_expanded_matching.py +35 -3
ui_hybrid_matching.py +20 -5
ui_ingredient_matching.py +15 -2

api_utils.py CHANGED Viewed

@@ -235,7 +235,28 @@ def rank_ingredients_openai(
             model=model,
             # reasoning={"effort": "low"},
             input=[
-                {"role": "system", "content": f"You are a food ingredient matching expert. Rank the top {max_results} ingredient based on how well they match the given product. Only include ingredients with relevance score >= {confidence_threshold}."},
                 {"role": "user", "content": prompt}
             ],
             text={
@@ -367,7 +388,9 @@ def rank_categories_openai(
             model=model,
             # reasoning={"effort": "low"},
             input=[
-                {"role": "system", "content": f"You are a food categorization expert. Think this through step by step: Rank the top category based on how well it match the given product. Only include categories with relevance score >= {confidence_threshold}."},
                 {"role": "user", "content": prompt}
             ],
             text={

             model=model,
             # reasoning={"effort": "low"},
             input=[
+                {"role": "system", "content": f"""
+                    You are a product categorization expert. Your task is to match product descriptions to the most relevant categories from the PROVIDED LIST ONLY.
+                    CRITICAL RULES:
+                    1. You MUST ONLY select from the exact items listed in "Potential ingredients" - DO NOT create or invent new categories
+                    2. Do not combine items from the list or add any words to them
+                    3. Choose the items from the list that best match what the product IS or CONTAINS
+                    4. If none of the items perfectly match, choose the closest matches from the provided list
+                    For the rankings:
+                    - Select ONLY from the exact items in the "Potential ingredients" list
+                    - Assign relevance scores from 0.0 to 1.0
+                    - Rank the top {max_results} matching ingredients.
+                    - Provide brief explanations for why each item is relevant
+                    - Do not suggest alternatives outside the provided list
+                    Aim to identify the specific product category a consumer would look for when shopping for this exact item.
+                    Only include ingredients with relevance score >= {confidence_threshold}.
+                    Remember: Your ONLY options are the exact items listed in "Potential ingredients" - no additions, modifications, or combinations.
+                 """},
                 {"role": "user", "content": prompt}
             ],
             text={
             model=model,
             # reasoning={"effort": "low"},
             input=[
+                {"role": "system", "content": f"""You are a food ingredient matching expert. Rank the top 5 ingredients that are CONTAINED WITHIN the given product itself - not items related to or used with the product.
+                 Only include ingredients with relevance score >= 0.0. Higher scores should be given to primary ingredients that make up the majority of the product's composition.
+                  Only include categories with relevance score >= {confidence_threshold}."""},
                 {"role": "user", "content": prompt}
             ],
             text={

openai_expansion.py CHANGED Viewed

@@ -34,8 +34,10 @@ def expand_product_descriptions(products: List[str],
                 # max_output_tokens=100,
                 # reasoning={"effort": "low"},
                 input=[
-                    {"role": "system", "content": """You are a product description expert. Your task is to expand product names into a one sentence descriptions that would help an embedding model categorize them correctly.
-                     if multiple variations are possible, provide the most common one. instead of listing the different variations. For example, if the product is in the fresh aisle mostly, it is not frozen.
                      """},
                     {"role": "user", "content": f'Describe "{product}" to an embedding model categorizing products'}
                 ],

                 # max_output_tokens=100,
                 # reasoning={"effort": "low"},
                 input=[
+                    {"role": "system", "content": """You are a product description expert. Your task is to convert brand names or abbreviated product mentions into clear product descriptions that help
+                     categorization models understand WHAT the item is, not just who makes it. Focus on the product category, form, and usage rather
+                      than brand attributes. When brands are mentioned (like Green Mountain), identify the actual product type they represent (coffee pods)
+                      rather than describing the brand itself. Always provide the most common product interpretation and avoid mentioning brands in your description when possible.
                      """},
                     {"role": "user", "content": f'Describe "{product}" to an embedding model categorizing products'}
                 ],

ui_category_matching.py CHANGED Viewed

@@ -30,6 +30,12 @@ def categorize_products_by_category(product_input, is_file=False, use_expansion=
     progress_tracker(0.2, desc="Loading categories...")
     categories = load_categories()
     # Match products to categories
     progress_tracker(0.3, desc="Matching products to categories...")
     match_results = match_products_to_categories(
@@ -39,14 +45,20 @@ def categorize_products_by_category(product_input, is_file=False, use_expansion=
         confidence_threshold=confidence_threshold,
         progress=progress_tracker
     )
     # Format results
     progress_tracker(0.9, desc="Formatting results...")
     output_html = "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>"
     output_html += f"<p style='color: #555;'>Matched {len(product_names)} products to categories.</p>"
-    for i, product in enumerate(product_names):
-        categories = match_results.get(products_to_match[i], [])
         expansion_text = ""
         if use_expansion and product in expanded_descriptions:
             expansion_text = f"<div style='color: #666; font-style: italic; margin: 5px 0;'>Expanded description: {expanded_descriptions[product]}</div>"

     progress_tracker(0.2, desc="Loading categories...")
     categories = load_categories()
+    # Create a mapping from original product names to expanded versions
+    product_to_expanded = {}
+    for i, product in enumerate(product_names):
+        if i < len(products_to_match):
+            product_to_expanded[product] = products_to_match[i]
     # Match products to categories
     progress_tracker(0.3, desc="Matching products to categories...")
     match_results = match_products_to_categories(
         confidence_threshold=confidence_threshold,
         progress=progress_tracker
     )
+    # Create a new dictionary mapping original product names to their results
+    original_product_results = {}
+    for product, expanded in product_to_expanded.items():
+        if expanded in match_results:
+            original_product_results[product] = match_results[expanded]
     # Format results
     progress_tracker(0.9, desc="Formatting results...")
     output_html = "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>"
     output_html += f"<p style='color: #555;'>Matched {len(product_names)} products to categories.</p>"
+    for product in product_names:
+        categories = original_product_results.get(product, [])
         expansion_text = ""
         if use_expansion and product in expanded_descriptions:
             expansion_text = f"<div style='color: #666; font-style: italic; margin: 5px 0;'>Expanded description: {expanded_descriptions[product]}</div>"

ui_expanded_matching.py CHANGED Viewed

@@ -34,15 +34,34 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, use_
     # Get shared OpenAI client
     openai_client = get_openai_client()
     if match_type == "ingredients":
         # Generate product embeddings
         progress_tracker(0.4, desc="Generating product embeddings...")
-        product_embeddings = create_product_embeddings(product_names, progress=progress)
         # Compute embedding similarities for ingredients
         progress_tracker(0.6, desc="Computing ingredient similarities...")
         all_similarities = compute_similarities(embeddings, product_embeddings)
         if not all_similarities:
             return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No similarities found. Please try different product names.</div>"
@@ -68,7 +87,7 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, use_
                     candidates=candidate_ingredients,
                     expanded_description=expanded_text,
                     client=openai_client,
-                    model="o3-mini",
                     max_results=top_n,
                     confidence_threshold=0.0,  # Don't filter here, do it at display time
                     debug=True
@@ -102,7 +121,20 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, use_
         # Generate product embeddings
         progress_tracker(0.6, desc="Generating product embeddings...")
-        product_embeddings = create_product_embeddings(product_names, progress=progress)
         # Compute embedding similarities for categories
         progress_tracker(0.7, desc="Computing category similarities...")

     # Get shared OpenAI client
     openai_client = get_openai_client()
+    products_for_embedding = ''
     if match_type == "ingredients":
         # Generate product embeddings
         progress_tracker(0.4, desc="Generating product embeddings...")
+        if use_expansion and expanded_descriptions:
+            # Use expanded descriptions for embedding creation when available
+            products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
+            # Map expanded descriptions back to original product names for consistent keys
+            product_embeddings = {}
+            temp_embeddings = create_product_embeddings(products_for_embedding, progress=progress)
+            # Ensure we use original product names as keys
+            for i, product_name in enumerate(product_names):
+                if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
+                    product_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
+        else:
+            # Standard embedding creation with just product names
+            product_embeddings = create_product_embeddings(product_names, progress=progress)
         # Compute embedding similarities for ingredients
         progress_tracker(0.6, desc="Computing ingredient similarities...")
         all_similarities = compute_similarities(embeddings, product_embeddings)
+        print(f"product_names: {product_names}")
+        print(f"products_for_embedding: {products_for_embedding}")
+        # print(f"all_similarities: {all_similarities}")
         if not all_similarities:
             return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No similarities found. Please try different product names.</div>"
                     candidates=candidate_ingredients,
                     expanded_description=expanded_text,
                     client=openai_client,
+                    model="gpt-4o-mini",
                     max_results=top_n,
                     confidence_threshold=0.0,  # Don't filter here, do it at display time
                     debug=True
         # Generate product embeddings
         progress_tracker(0.6, desc="Generating product embeddings...")
+        if use_expansion and expanded_descriptions:
+            # Use expanded descriptions for embedding creation when available
+            products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
+            # Map expanded descriptions back to original product names for consistent keys
+            product_embeddings = {}
+            temp_embeddings = create_product_embeddings(products_for_embedding, progress=progress)
+            # Ensure we use original product names as keys
+            for i, product_name in enumerate(product_names):
+                if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
+                    product_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
+        else:
+            # Standard embedding creation with just product names
+            product_embeddings = create_product_embeddings(product_names, progress=progress)
         # Compute embedding similarities for categories
         progress_tracker(0.7, desc="Computing category similarities...")

ui_hybrid_matching.py CHANGED Viewed

@@ -124,7 +124,21 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
     progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")
     # Stage 1: Same as before - use embeddings to find candidates
-    product_embeddings = create_product_embeddings(products, progress=progress_tracker)
     similarities = compute_similarities(ingredients_dict, product_embeddings)
     # Filter to top N candidates per product
@@ -139,7 +153,7 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
     # Stage 2: Re-rank using Voyage instead of OpenAI
     final_results = {}
     for i, product in enumerate(products):
         progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")
@@ -156,8 +170,8 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
             if expanded_descriptions and product in expanded_descriptions:
                 product_text = expanded_descriptions[product]
-            # Build documents for reranking
-            documents = [{"text": ingredient} for ingredient in candidate_ingredients]
             # Use Voyage reranking
             reranked = voyage_client.rerank(
@@ -170,7 +184,8 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
             voyage_results = []
             for result in reranked["results"]:
                 score = result["relevance_score"]
-                voyage_results.append((result["document"]["text"], score))
             # Still limit to final_top_n but don't filter by threshold here
             final_results[product] = voyage_results[:final_top_n]

     progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")
     # Stage 1: Same as before - use embeddings to find candidates
+    if expanded_descriptions:
+        # Use expanded descriptions for embedding creation when available
+        products_for_embedding = [expanded_descriptions.get(name, name) for name in products]
+        # Map expanded descriptions back to original product names for consistent keys
+        product_embeddings = {}
+        temp_embeddings = create_product_embeddings(products_for_embedding, progress=progress_tracker)
+        # Ensure we use original product names as keys
+        for i, product_name in enumerate(products):
+            if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
+                product_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
+    else:
+        # Standard embedding creation with just product names
+        product_embeddings = create_product_embeddings(products, progress=progress_tracker)
     similarities = compute_similarities(ingredients_dict, product_embeddings)
     # Filter to top N candidates per product
     # Stage 2: Re-rank using Voyage instead of OpenAI
     final_results = {}
     for i, product in enumerate(products):
         progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")
             if expanded_descriptions and product in expanded_descriptions:
                 product_text = expanded_descriptions[product]
+            # Use plain strings for the documents instead of objects with text property
+            documents = candidate_ingredients  # Simply use the list of strings directly
             # Use Voyage reranking
             reranked = voyage_client.rerank(
             voyage_results = []
             for result in reranked["results"]:
                 score = result["relevance_score"]
+                text = result["document"]  # Now this is the direct string, not an object
+                voyage_results.append((text, score))
             # Still limit to final_top_n but don't filter by threshold here
             final_results[product] = voyage_results[:final_top_n]

ui_ingredient_matching.py CHANGED Viewed

@@ -29,7 +29,20 @@ def categorize_products(product_input, is_file=False, use_expansion=False, top_n
     # Create embeddings
     progress_tracker(0.4, desc="Generating product embeddings...")
-    products_embeddings = create_product_embeddings(product_names, progress=gr.Progress())
     if not products_embeddings:
         return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"
@@ -69,4 +82,4 @@ def categorize_products(product_input, is_file=False, use_expansion=False, top_n
     progress_tracker(1.0, desc="Done!")
     return create_results_container(output_html)

     # Create embeddings
     progress_tracker(0.4, desc="Generating product embeddings...")
+    if use_expansion and expanded_descriptions:
+        # Use expanded descriptions for embedding creation when available
+        products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
+        # Map expanded descriptions back to original product names for consistent keys
+        products_embeddings = {}
+        temp_embeddings = create_product_embeddings(products_for_embedding, progress=gr.Progress())
+        # Ensure we use original product names as keys
+        for i, product_name in enumerate(product_names):
+            if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
+                products_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
+    else:
+        # Standard embedding creation with just product names
+        products_embeddings = create_product_embeddings(product_names, progress=gr.Progress())
     if not products_embeddings:
         return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"
     progress_tracker(1.0, desc="Done!")
     return create_results_container(output_html)