esilver committed on
Commit
39f78ce
·
1 Parent(s): 1e737a1

Fixed pipeline bugs

Browse files
api_utils.py CHANGED
@@ -235,7 +235,28 @@ def rank_ingredients_openai(
235
  model=model,
236
  # reasoning={"effort": "low"},
237
  input=[
238
- {"role": "system", "content": f"You are a food ingredient matching expert. Rank the top {max_results} ingredient based on how well they match the given product. Only include ingredients with relevance score >= {confidence_threshold}."},
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  {"role": "user", "content": prompt}
240
  ],
241
  text={
@@ -367,7 +388,9 @@ def rank_categories_openai(
367
  model=model,
368
  # reasoning={"effort": "low"},
369
  input=[
370
- {"role": "system", "content": f"You are a food categorization expert. Think this through step by step: Rank the top category based on how well it match the given product. Only include categories with relevance score >= {confidence_threshold}."},
 
 
371
  {"role": "user", "content": prompt}
372
  ],
373
  text={
 
235
  model=model,
236
  # reasoning={"effort": "low"},
237
  input=[
238
+ {"role": "system", "content": f"""
239
+ You are a product categorization expert. Your task is to match product descriptions to the most relevant categories from the PROVIDED LIST ONLY.
240
+
241
+ CRITICAL RULES:
242
+ 1. You MUST ONLY select from the exact items listed in "Potential ingredients" - DO NOT create or invent new categories
243
+ 2. Do not combine items from the list or add any words to them
244
+ 3. Choose the items from the list that best match what the product IS or CONTAINS
245
+ 4. If none of the items perfectly match, choose the closest matches from the provided list
246
+
247
+ For the rankings:
248
+ - Select ONLY from the exact items in the "Potential ingredients" list
249
+ - Assign relevance scores from 0.0 to 1.0
250
+ - Rank the top {max_results} matching ingredients.
251
+ - Provide brief explanations for why each item is relevant
252
+ - Do not suggest alternatives outside the provided list
253
+
254
+ Aim to identify the specific product category a consumer would look for when shopping for this exact item.
255
+ Only include ingredients with relevance score >= {confidence_threshold}.
256
+
257
+ Remember: Your ONLY options are the exact items listed in "Potential ingredients" - no additions, modifications, or combinations.
258
+
259
+ """},
260
  {"role": "user", "content": prompt}
261
  ],
262
  text={
 
388
  model=model,
389
  # reasoning={"effort": "low"},
390
  input=[
391
+ {"role": "system", "content": f"""You are a food ingredient matching expert. Rank the top 5 ingredients that are CONTAINED WITHIN the given product itself - not items related to or used with the product.
392
+ Only include ingredients with relevance score >= 0.0. Higher scores should be given to primary ingredients that make up the majority of the product's composition.
393
+ Only include categories with relevance score >= {confidence_threshold}."""},
394
  {"role": "user", "content": prompt}
395
  ],
396
  text={
openai_expansion.py CHANGED
@@ -34,8 +34,10 @@ def expand_product_descriptions(products: List[str],
34
  # max_output_tokens=100,
35
  # reasoning={"effort": "low"},
36
  input=[
37
- {"role": "system", "content": """You are a product description expert. Your task is to expand product names into a one sentence descriptions that would help an embedding model categorize them correctly.
38
- if multiple variations are possible, provide the most common one. instead of listing the different variations. For example, if the product is in the fresh aisle mostly, it is not frozen.
 
 
39
  """},
40
  {"role": "user", "content": f'Describe "{product}" to an embedding model categorizing products'}
41
  ],
 
34
  # max_output_tokens=100,
35
  # reasoning={"effort": "low"},
36
  input=[
37
+ {"role": "system", "content": """You are a product description expert. Your task is to convert brand names or abbreviated product mentions into clear product descriptions that help
38
+ categorization models understand WHAT the item is, not just who makes it. Focus on the product category, form, and usage rather
39
+ than brand attributes. When brands are mentioned (like Green Mountain), identify the actual product type they represent (coffee pods)
40
+ rather than describing the brand itself. Always provide the most common product interpretation and avoid mentioning brands in your description when possible.
41
  """},
42
  {"role": "user", "content": f'Describe "{product}" to an embedding model categorizing products'}
43
  ],
ui_category_matching.py CHANGED
@@ -30,6 +30,12 @@ def categorize_products_by_category(product_input, is_file=False, use_expansion=
30
  progress_tracker(0.2, desc="Loading categories...")
31
  categories = load_categories()
32
 
 
 
 
 
 
 
33
  # Match products to categories
34
  progress_tracker(0.3, desc="Matching products to categories...")
35
  match_results = match_products_to_categories(
@@ -39,14 +45,20 @@ def categorize_products_by_category(product_input, is_file=False, use_expansion=
39
  confidence_threshold=confidence_threshold,
40
  progress=progress_tracker
41
  )
42
-
 
 
 
 
 
 
43
  # Format results
44
  progress_tracker(0.9, desc="Formatting results...")
45
  output_html = "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>"
46
  output_html += f"<p style='color: #555;'>Matched {len(product_names)} products to categories.</p>"
47
-
48
- for i, product in enumerate(product_names):
49
- categories = match_results.get(products_to_match[i], [])
50
  expansion_text = ""
51
  if use_expansion and product in expanded_descriptions:
52
  expansion_text = f"<div style='color: #666; font-style: italic; margin: 5px 0;'>Expanded description: {expanded_descriptions[product]}</div>"
 
30
  progress_tracker(0.2, desc="Loading categories...")
31
  categories = load_categories()
32
 
33
+ # Create a mapping from original product names to expanded versions
34
+ product_to_expanded = {}
35
+ for i, product in enumerate(product_names):
36
+ if i < len(products_to_match):
37
+ product_to_expanded[product] = products_to_match[i]
38
+
39
  # Match products to categories
40
  progress_tracker(0.3, desc="Matching products to categories...")
41
  match_results = match_products_to_categories(
 
45
  confidence_threshold=confidence_threshold,
46
  progress=progress_tracker
47
  )
48
+
49
+ # Create a new dictionary mapping original product names to their results
50
+ original_product_results = {}
51
+ for product, expanded in product_to_expanded.items():
52
+ if expanded in match_results:
53
+ original_product_results[product] = match_results[expanded]
54
+
55
  # Format results
56
  progress_tracker(0.9, desc="Formatting results...")
57
  output_html = "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>"
58
  output_html += f"<p style='color: #555;'>Matched {len(product_names)} products to categories.</p>"
59
+
60
+ for product in product_names:
61
+ categories = original_product_results.get(product, [])
62
  expansion_text = ""
63
  if use_expansion and product in expanded_descriptions:
64
  expansion_text = f"<div style='color: #666; font-style: italic; margin: 5px 0;'>Expanded description: {expanded_descriptions[product]}</div>"
ui_expanded_matching.py CHANGED
@@ -34,15 +34,34 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, use_
34
  # Get shared OpenAI client
35
  openai_client = get_openai_client()
36
 
 
 
37
  if match_type == "ingredients":
38
  # Generate product embeddings
39
  progress_tracker(0.4, desc="Generating product embeddings...")
40
- product_embeddings = create_product_embeddings(product_names, progress=progress)
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
  # Compute embedding similarities for ingredients
43
  progress_tracker(0.6, desc="Computing ingredient similarities...")
44
  all_similarities = compute_similarities(embeddings, product_embeddings)
45
 
 
 
 
 
46
  if not all_similarities:
47
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No similarities found. Please try different product names.</div>"
48
 
@@ -68,7 +87,7 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, use_
68
  candidates=candidate_ingredients,
69
  expanded_description=expanded_text,
70
  client=openai_client,
71
- model="o3-mini",
72
  max_results=top_n,
73
  confidence_threshold=0.0, # Don't filter here, do it at display time
74
  debug=True
@@ -102,7 +121,20 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, use_
102
 
103
  # Generate product embeddings
104
  progress_tracker(0.6, desc="Generating product embeddings...")
105
- product_embeddings = create_product_embeddings(product_names, progress=progress)
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  # Compute embedding similarities for categories
108
  progress_tracker(0.7, desc="Computing category similarities...")
 
34
  # Get shared OpenAI client
35
  openai_client = get_openai_client()
36
 
37
+ products_for_embedding = ''
38
+
39
  if match_type == "ingredients":
40
  # Generate product embeddings
41
  progress_tracker(0.4, desc="Generating product embeddings...")
42
+ if use_expansion and expanded_descriptions:
43
+ # Use expanded descriptions for embedding creation when available
44
+ products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
45
+ # Map expanded descriptions back to original product names for consistent keys
46
+ product_embeddings = {}
47
+ temp_embeddings = create_product_embeddings(products_for_embedding, progress=progress)
48
+
49
+ # Ensure we use original product names as keys
50
+ for i, product_name in enumerate(product_names):
51
+ if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
52
+ product_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
53
+ else:
54
+ # Standard embedding creation with just product names
55
+ product_embeddings = create_product_embeddings(product_names, progress=progress)
56
 
57
  # Compute embedding similarities for ingredients
58
  progress_tracker(0.6, desc="Computing ingredient similarities...")
59
  all_similarities = compute_similarities(embeddings, product_embeddings)
60
 
61
+ print(f"product_names: {product_names}")
62
+ print(f"products_for_embedding: {products_for_embedding}")
63
+ # print(f"all_similarities: {all_similarities}")
64
+
65
  if not all_similarities:
66
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No similarities found. Please try different product names.</div>"
67
 
 
87
  candidates=candidate_ingredients,
88
  expanded_description=expanded_text,
89
  client=openai_client,
90
+ model="gpt-4o-mini",
91
  max_results=top_n,
92
  confidence_threshold=0.0, # Don't filter here, do it at display time
93
  debug=True
 
121
 
122
  # Generate product embeddings
123
  progress_tracker(0.6, desc="Generating product embeddings...")
124
+ if use_expansion and expanded_descriptions:
125
+ # Use expanded descriptions for embedding creation when available
126
+ products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
127
+ # Map expanded descriptions back to original product names for consistent keys
128
+ product_embeddings = {}
129
+ temp_embeddings = create_product_embeddings(products_for_embedding, progress=progress)
130
+
131
+ # Ensure we use original product names as keys
132
+ for i, product_name in enumerate(product_names):
133
+ if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
134
+ product_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
135
+ else:
136
+ # Standard embedding creation with just product names
137
+ product_embeddings = create_product_embeddings(product_names, progress=progress)
138
 
139
  # Compute embedding similarities for categories
140
  progress_tracker(0.7, desc="Computing category similarities...")
ui_hybrid_matching.py CHANGED
@@ -124,7 +124,21 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
124
  progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")
125
 
126
  # Stage 1: Same as before - use embeddings to find candidates
127
- product_embeddings = create_product_embeddings(products, progress=progress_tracker)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  similarities = compute_similarities(ingredients_dict, product_embeddings)
129
 
130
  # Filter to top N candidates per product
@@ -139,7 +153,7 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
139
 
140
  # Stage 2: Re-rank using Voyage instead of OpenAI
141
  final_results = {}
142
-
143
  for i, product in enumerate(products):
144
  progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")
145
 
@@ -156,8 +170,8 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
156
  if expanded_descriptions and product in expanded_descriptions:
157
  product_text = expanded_descriptions[product]
158
 
159
- # Build documents for reranking
160
- documents = [{"text": ingredient} for ingredient in candidate_ingredients]
161
 
162
  # Use Voyage reranking
163
  reranked = voyage_client.rerank(
@@ -170,7 +184,8 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
170
  voyage_results = []
171
  for result in reranked["results"]:
172
  score = result["relevance_score"]
173
- voyage_results.append((result["document"]["text"], score))
 
174
 
175
  # Still limit to final_top_n but don't filter by threshold here
176
  final_results[product] = voyage_results[:final_top_n]
 
124
  progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")
125
 
126
  # Stage 1: Same as before - use embeddings to find candidates
127
+ if expanded_descriptions:
128
+ # Use expanded descriptions for embedding creation when available
129
+ products_for_embedding = [expanded_descriptions.get(name, name) for name in products]
130
+ # Map expanded descriptions back to original product names for consistent keys
131
+ product_embeddings = {}
132
+ temp_embeddings = create_product_embeddings(products_for_embedding, progress=progress_tracker)
133
+
134
+ # Ensure we use original product names as keys
135
+ for i, product_name in enumerate(products):
136
+ if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
137
+ product_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
138
+ else:
139
+ # Standard embedding creation with just product names
140
+ product_embeddings = create_product_embeddings(products, progress=progress_tracker)
141
+
142
  similarities = compute_similarities(ingredients_dict, product_embeddings)
143
 
144
  # Filter to top N candidates per product
 
153
 
154
  # Stage 2: Re-rank using Voyage instead of OpenAI
155
  final_results = {}
156
+
157
  for i, product in enumerate(products):
158
  progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")
159
 
 
170
  if expanded_descriptions and product in expanded_descriptions:
171
  product_text = expanded_descriptions[product]
172
 
173
+ # Use plain strings for the documents instead of objects with text property
174
+ documents = candidate_ingredients # Simply use the list of strings directly
175
 
176
  # Use Voyage reranking
177
  reranked = voyage_client.rerank(
 
184
  voyage_results = []
185
  for result in reranked["results"]:
186
  score = result["relevance_score"]
187
+ text = result["document"] # Now this is the direct string, not an object
188
+ voyage_results.append((text, score))
189
 
190
  # Still limit to final_top_n but don't filter by threshold here
191
  final_results[product] = voyage_results[:final_top_n]
ui_ingredient_matching.py CHANGED
@@ -29,7 +29,20 @@ def categorize_products(product_input, is_file=False, use_expansion=False, top_n
29
 
30
  # Create embeddings
31
  progress_tracker(0.4, desc="Generating product embeddings...")
32
- products_embeddings = create_product_embeddings(product_names, progress=gr.Progress())
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  if not products_embeddings:
35
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"
@@ -69,4 +82,4 @@ def categorize_products(product_input, is_file=False, use_expansion=False, top_n
69
 
70
  progress_tracker(1.0, desc="Done!")
71
  return create_results_container(output_html)
72
-
 
29
 
30
  # Create embeddings
31
  progress_tracker(0.4, desc="Generating product embeddings...")
32
+ if use_expansion and expanded_descriptions:
33
+ # Use expanded descriptions for embedding creation when available
34
+ products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
35
+ # Map expanded descriptions back to original product names for consistent keys
36
+ products_embeddings = {}
37
+ temp_embeddings = create_product_embeddings(products_for_embedding, progress=gr.Progress())
38
+
39
+ # Ensure we use original product names as keys
40
+ for i, product_name in enumerate(product_names):
41
+ if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
42
+ products_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
43
+ else:
44
+ # Standard embedding creation with just product names
45
+ products_embeddings = create_product_embeddings(product_names, progress=gr.Progress())
46
 
47
  if not products_embeddings:
48
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"
 
82
 
83
  progress_tracker(1.0, desc="Done!")
84
  return create_results_container(output_html)
85
+