esilver committed on
Commit
3b88add
·
1 Parent(s): 8007258

Fixed bugs

Browse files
comparison.py CHANGED
@@ -1,40 +1,40 @@
1
  import json
2
  import numpy as np
3
  from typing import Dict, List, Tuple, Any
4
- import concurrent.futures
5
- import time
6
- import os
7
- from api_utils import get_openai_client, get_voyage_client, process_in_parallel, rank_ingredients_openai
 
8
  from ui_formatters import format_comparison_html, create_results_container
9
 
 
 
 
 
 
10
  def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str, Any],
11
  embedding_top_n: int = 20, final_top_n: int = 3,
12
- confidence_threshold: float = 0.5,
13
- progress=None) -> Dict[str, Dict[str, List[Tuple]]]:
14
  """
15
- Compare four different methods for ingredient matching:
16
- 1. Base embeddings (without re-ranking)
17
- 2. Voyage AI reranker (via hybrid approach)
18
- 3. Chicory parser
19
- 4. GPT-4o structured output
20
 
21
  Args:
22
- products: List of product names to categorize
23
- ingredients_dict: Dictionary of ingredient names to embeddings
24
  embedding_top_n: Number of top ingredients to retrieve using embeddings
25
  final_top_n: Number of final results to show for each method
26
  confidence_threshold: Minimum score threshold for final results
 
27
  progress: Optional progress tracking object
28
 
29
  Returns:
30
- Dictionary mapping products to results from each method
31
  """
32
- from utils import SafeProgress, preprocess_product_for_matching
33
- from embeddings import create_product_embeddings
34
- from chicory_api import call_chicory_parser
35
- from similarity import compute_similarities
36
 
37
- progress_tracker = SafeProgress(progress, desc="Comparing ingredient matching methods")
38
 
39
  # Step 1: Generate embeddings for all products (used by multiple methods)
40
  progress_tracker(0.1, desc="Generating product embeddings")
@@ -49,112 +49,144 @@ def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str,
49
  for product, product_similarities in similarities.items():
50
  embedding_results[product] = product_similarities[:embedding_top_n]
51
 
52
- # Step 3: Call Chicory Parser API (this is done for all products at once)
53
- progress_tracker(0.3, desc="Calling Chicory Parser API")
 
 
 
54
  chicory_results = call_chicory_parser(products, progress=progress_tracker)
55
 
56
- # Create final results dictionary with base embeddings (which don't need any further processing)
57
  comparison_results = {}
58
  for product in products:
 
 
 
 
 
 
 
 
59
  if product in embedding_results:
60
- # Initialize with base embeddings already calculated
61
- candidates = embedding_results[product]
62
- base_results = [(c[0], c[1]) for c in candidates[:final_top_n] if c[1] >= confidence_threshold]
63
- comparison_results[product] = {
64
- "base": base_results,
65
- "voyage": [],
66
- "chicory": [],
67
- "openai": []
68
- }
69
 
70
- # Also process Chicory results immediately as they're already fetched
71
- chicory_matches = []
72
- if product in chicory_results:
73
- chicory_data = chicory_results[product]
74
- if isinstance(chicory_data, dict):
 
 
75
  ingredient = chicory_data.get("ingredient", "")
76
  confidence = chicory_data.get("confidence", 0)
77
  if ingredient and confidence >= confidence_threshold:
78
  chicory_matches.append((ingredient, confidence))
79
- comparison_results[product]["chicory"] = chicory_matches
80
- else:
81
- comparison_results[product] = {
82
- "base": [],
83
- "voyage": [],
84
- "chicory": [],
85
- "openai": []
86
- }
87
-
88
- # Initialize clients for reranking - REPLACED WITH UTILITY FUNCTIONS
89
- voyage_client = get_voyage_client()
90
- openai_client = get_openai_client()
91
-
92
- # Define the methods that will be executed in parallel (now focused only on the API-heavy tasks)
93
- def process_voyage_reranking(product):
94
- if product not in embedding_results or not embedding_results[product]:
95
- return product, []
96
-
97
- candidates = embedding_results[product]
98
- candidate_ingredients = [c[0] for c in candidates]
99
- candidate_texts = [f"Ingredient: {c[0]}" for c in candidates]
100
-
101
  try:
102
- # Apply Voyage reranking to the candidates
103
- query = product # Use product directly as query
104
- reranking = voyage_client.rerank(
105
- query=query,
106
- documents=candidate_texts,
107
- model="rerank-2",
108
- top_k=final_top_n
109
- )
110
 
111
- # Process reranking results
112
- voyage_ingredients = []
113
- for result in reranking.results:
114
- # Find the ingredient for this result
115
- candidate_index = candidate_texts.index(result.document)
116
- ingredient = candidate_ingredients[candidate_index]
117
- score = float(result.relevance_score)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
- # Only include results above the confidence threshold
120
- if score >= confidence_threshold:
121
- voyage_ingredients.append((ingredient, score))
122
-
123
- return product, voyage_ingredients
124
  except Exception as e:
125
- print(f"Error during Voyage reranking for '{product}': {e}")
126
- # Fall back to embedding results
127
- return product, [(c[0], c[1]) for c in candidates[:final_top_n] if c[1] >= confidence_threshold]
128
-
129
- def process_openai(product):
130
- if product not in embedding_results or not embedding_results[product]:
131
  return product, []
132
-
133
- candidates = embedding_results[product]
134
- candidate_ingredients = [c[0] for c in candidates]
135
-
136
- try:
137
- # Use the shared utility function
138
- openai_ingredients = rank_ingredients_openai(
139
- product=product,
140
- candidates=candidate_ingredients,
141
- client=openai_client,
142
- model="gpt-4o-mini",
143
- max_results=final_top_n,
144
- confidence_threshold=confidence_threshold
145
- )
146
-
147
- return product, openai_ingredients
148
- except Exception as e:
149
- print(f"Error during OpenAI processing for '{product}': {e}")
150
- # Fall back to embedding results
151
- return product, [(c[0], c[1]) for c in candidates[:final_top_n] if c[1] >= confidence_threshold]
152
 
153
- # Process Voyage AI reranking in parallel - REPLACED WITH SHARED UTILITY
154
- progress_tracker(0.4, desc="Running Voyage AI reranking in parallel")
155
  voyage_results = process_in_parallel(
156
  items=products,
157
- processor_func=process_voyage_reranking,
158
  max_workers=min(20, len(products)),
159
  progress_tracker=progress_tracker,
160
  progress_start=0.4,
@@ -167,8 +199,44 @@ def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str,
167
  if product in comparison_results:
168
  comparison_results[product]["voyage"] = results
169
 
170
- # Process OpenAI queries in parallel - REPLACED WITH SHARED UTILITY
171
  progress_tracker(0.7, desc="Running OpenAI processing in parallel")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  openai_results = process_in_parallel(
173
  items=products,
174
  processor_func=process_openai,
@@ -184,20 +252,52 @@ def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str,
184
  if product in comparison_results:
185
  comparison_results[product]["openai"] = results
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  progress_tracker(1.0, desc="Comparison complete")
188
  return comparison_results
189
 
190
- def compare_ingredient_methods_ui(product_input, is_file=False, embedding_top_n=20,
191
- final_top_n=3, confidence_threshold=0.5, progress=None):
 
192
  """
193
  Compare multiple ingredient matching methods on the same products
194
 
195
  Args:
196
  product_input: Text input with product names or file path
197
- is_file: Whether the input is a file
198
  embedding_top_n: Number of top ingredients to retrieve using embeddings
199
  final_top_n: Number of final results to show for each method
200
  confidence_threshold: Minimum score threshold for final results
 
 
201
  progress: Optional progress tracking object
202
 
203
  Returns:
@@ -205,10 +305,9 @@ def compare_ingredient_methods_ui(product_input, is_file=False, embedding_top_n=
205
  """
206
  from utils import SafeProgress, load_embeddings
207
 
208
- progress_tracker = SafeProgress(progress, desc="Comparing ingredient matching methods")
209
  progress_tracker(0.1, desc="Processing input")
210
 
211
-
212
  # Split text input by lines and remove empty lines
213
  if not product_input:
214
  return "Please enter at least one product."
@@ -216,19 +315,37 @@ def compare_ingredient_methods_ui(product_input, is_file=False, embedding_top_n=
216
  if not product_names:
217
  return "Please enter at least one product."
218
 
219
- # Load ingredient embeddings
220
  try:
221
- progress_tracker(0.2, desc="Loading ingredient embeddings")
222
- ingredients_dict = load_embeddings("data/ingredient_embeddings_voyageai.pkl")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
 
224
  progress_tracker(0.3, desc="Comparing methods")
225
  comparison_results = compare_ingredient_methods(
226
  products=product_names,
227
- ingredients_dict=ingredients_dict,
228
  embedding_top_n=embedding_top_n,
229
  final_top_n=final_top_n,
230
  confidence_threshold=confidence_threshold,
231
- progress=progress_tracker
 
 
232
  )
233
  except Exception as e:
234
  import traceback
@@ -237,7 +354,6 @@ def compare_ingredient_methods_ui(product_input, is_file=False, embedding_top_n=
237
 
238
  # Format results as HTML using centralized formatters
239
  progress_tracker(0.9, desc="Formatting results")
240
-
241
  result_elements = []
242
  for product in product_names:
243
  if product in comparison_results:
@@ -245,7 +361,7 @@ def compare_ingredient_methods_ui(product_input, is_file=False, embedding_top_n=
245
 
246
  output_html = create_results_container(
247
  result_elements,
248
- header_text=f"Comparing {len(product_names)} products using multiple ingredient matching methods."
249
  )
250
 
251
  progress_tracker(1.0, desc="Complete")
 
1
  import json
2
  import numpy as np
3
  from typing import Dict, List, Tuple, Any
4
+
5
+ from category_matching import hybrid_category_matching
6
+ from similarity import hybrid_ingredient_matching
7
+
8
+ from api_utils import process_in_parallel, rank_ingredients_openai
9
  from ui_formatters import format_comparison_html, create_results_container
10
 
11
+ from utils import SafeProgress
12
+ from chicory_api import call_chicory_parser
13
+ from embeddings import create_product_embeddings
14
+ from similarity import compute_similarities
15
+
16
  def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str, Any],
17
  embedding_top_n: int = 20, final_top_n: int = 3,
18
+ confidence_threshold: float = 0.5, match_type="ingredients",
19
+ progress=None, expanded_descriptions=None) -> Dict[str, Dict[str, List[Tuple]]]:
20
  """
21
+ Compare multiple ingredient/category matching methods on the same products
 
 
 
 
22
 
23
  Args:
24
+ products: List of product names to process
25
+ ingredients_dict: Dictionary with ingredient embeddings
26
  embedding_top_n: Number of top ingredients to retrieve using embeddings
27
  final_top_n: Number of final results to show for each method
28
  confidence_threshold: Minimum score threshold for final results
29
+ match_type: Type of matching to perform ('ingredients' or 'categories')
30
  progress: Optional progress tracking object
31
 
32
  Returns:
33
+ Dictionary mapping products to methods and their results
34
  """
35
+
 
 
 
36
 
37
+ progress_tracker = SafeProgress(progress, desc="Comparing matching methods")
38
 
39
  # Step 1: Generate embeddings for all products (used by multiple methods)
40
  progress_tracker(0.1, desc="Generating product embeddings")
 
49
  for product, product_similarities in similarities.items():
50
  embedding_results[product] = product_similarities[:embedding_top_n]
51
 
52
+ # Step 3: Process with Chicory Parser
53
+ progress_tracker(0.3, desc="Running Chicory Parser")
54
+ # Import here to avoid circular imports
55
+ # from chicory_parser import parse_products
56
+
57
  chicory_results = call_chicory_parser(products, progress=progress_tracker)
58
 
59
+ # Initialize result structure
60
  comparison_results = {}
61
  for product in products:
62
+ comparison_results[product] = {
63
+ "base": [],
64
+ "voyage": [],
65
+ "chicory": [],
66
+ "openai": []
67
+ }
68
+
69
+ # Add basic embedding results
70
  if product in embedding_results:
71
+ base_results = []
72
+ for name, score in embedding_results[product]:
73
+ if score >= confidence_threshold:
74
+ base_results.append((name, score))
75
+ comparison_results[product]["base"] = base_results[:final_top_n]
 
 
 
 
76
 
77
+ # Process Chicory results
78
+ chicory_matches = []
79
+ if product in chicory_results:
80
+ chicory_data = chicory_results[product]
81
+ if isinstance(chicory_data, dict):
82
+ # Handle different response formats based on match type
83
+ if match_type == "ingredients":
84
  ingredient = chicory_data.get("ingredient", "")
85
  confidence = chicory_data.get("confidence", 0)
86
  if ingredient and confidence >= confidence_threshold:
87
  chicory_matches.append((ingredient, confidence))
88
+ else: # categories
89
+ category = chicory_data.get("category", "")
90
+ confidence = chicory_data.get("confidence", 0)
91
+ if category and confidence >= confidence_threshold:
92
+ chicory_matches.append((category, confidence))
93
+
94
+ comparison_results[product]["chicory"] = chicory_matches
95
+
96
+ # Step 4: Process with Voyage AI
97
+ progress_tracker(0.4, desc="Processing with Voyage AI")
98
+
99
+ # Define processing function for Voyage
100
def _normalize_voyage_entry(entry, threshold):
    """Turn one raw reranker entry into a ``(name, score)`` tuple, or ``None``.

    Accepted shapes:
      * dict carrying both ``"name"`` and ``"score"`` keys
      * 2-element pair ``(name, score)``
      * 3+-element sequence ``(id, description, score)``; the id is used
        as the name and the third element as the score

    Pairs may be tuples or lists. Entries whose score cannot be converted
    to ``float`` or whose shape is not recognized are reported on stdout
    and skipped; entries scoring below ``threshold`` are skipped silently.
    """
    if isinstance(entry, dict) and "name" in entry and "score" in entry:
        name, raw_score, label = entry["name"], entry["score"], "result"
    elif isinstance(entry, (tuple, list)) and len(entry) >= 2:
        name = entry[0]
        # Category matching yields (id, description, score); otherwise (name, score).
        raw_score = entry[2] if len(entry) >= 3 else entry[1]
        label = "tuple"
    else:
        # Previously dropped without a trace; surface it so format drift is visible.
        print(f"Unrecognized result format: {entry}")
        return None

    try:
        score = float(raw_score)
    except (ValueError, TypeError):
        print(f"Invalid score format in {label}: {entry}")
        return None

    return (name, score) if score >= threshold else None


def process_voyage(product):
    """Rerank one product's embedding candidates with the hybrid (Voyage) matcher.

    Reads ``embedding_results``, ``ingredients_dict``, ``match_type``,
    ``expanded_descriptions``, ``embedding_top_n``, ``final_top_n`` and
    ``confidence_threshold`` from the enclosing scope.

    Returns:
        ``(product, matches)`` where ``matches`` is a list of
        ``(name, score)`` tuples at or above ``confidence_threshold``.
        If reranking raises, the embedding candidates themselves are
        returned (top ``final_top_n``, threshold-filtered). If there are no
        candidates, or anything else fails, ``matches`` is empty.
    """
    try:
        candidates = embedding_results.get(product, [])
        if not candidates:
            print(f"No candidates found for product: {product}")
            return product, []

        try:
            if match_type == "ingredients":
                # Narrow the expanded descriptions to this single product.
                expanded_product_desc = None
                if expanded_descriptions and product in expanded_descriptions:
                    expanded_product_desc = {product: expanded_descriptions.get(product)}

                # Restrict the embedding table to this product's candidates.
                ingredient_dict = {
                    c[0]: ingredients_dict[c[0]]
                    for c in candidates
                    if c[0] in ingredients_dict
                }

                results = hybrid_ingredient_matching(
                    [product],  # the matcher takes a list of products
                    ingredient_dict,
                    expanded_descriptions=expanded_product_desc
                )
            else:
                # Category matcher takes a mapping; the candidate name doubles as its value.
                candidate_dict = {c[0]: c[0] for c in candidates}
                results = hybrid_category_matching(
                    products=[product],
                    categories=candidate_dict,
                    embedding_top_n=embedding_top_n,
                    final_top_n=final_top_n,
                    confidence_threshold=confidence_threshold,
                    expanded_descriptions=expanded_descriptions
                )

            # The matchers may return a dict keyed by product (their exact
            # return type is not visible here); unwrap it when they do.
            if isinstance(results, dict):
                results = results.get(product, [])

            formatted_results = []
            for entry in results[:final_top_n]:
                match = _normalize_voyage_entry(entry, confidence_threshold)
                if match is not None:
                    formatted_results.append(match)

            return product, formatted_results
        except Exception as e:
            print(f"Error in Voyage AI reranking for {product}: {str(e)}")
            # Fall back to embedding results
            return product, [(c[0], c[1]) for c in candidates[:final_top_n]
                             if c[1] >= confidence_threshold]

    except Exception as e:
        print(f"Error processing {product} with Voyage: {str(e)}")
        # Return an empty result as the ultimate fallback
        return product, []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
+ # Process all products with Voyage in parallel
 
187
  voyage_results = process_in_parallel(
188
  items=products,
189
+ processor_func=process_voyage,
190
  max_workers=min(20, len(products)),
191
  progress_tracker=progress_tracker,
192
  progress_start=0.4,
 
199
  if product in comparison_results:
200
  comparison_results[product]["voyage"] = results
201
 
202
+ # Step 5: Process with OpenAI
203
  progress_tracker(0.7, desc="Running OpenAI processing in parallel")
204
+
205
+ # Define processing function for OpenAI
206
def process_openai(product):
    """Rank one product's embedding candidates with the OpenAI helpers.

    Reads ``embedding_results``, ``match_type``, ``final_top_n`` and
    ``confidence_threshold`` from the enclosing scope.

    Returns:
        ``(product, matches)`` where ``matches`` holds at most
        ``final_top_n`` ``(name, score)`` tuples at or above
        ``confidence_threshold``. ``matches`` is empty when the product has
        no candidates or when ranking raises (the error is printed).
    """
    try:
        candidates = embedding_results.get(product, [])
        if not candidates:
            return product, []

        # The rankers only need the candidate names, not the embedding scores.
        candidate_names = [c[0] for c in candidates]

        if match_type == "ingredients":
            # Forward the caller's limits so the helper's own defaults cannot
            # cap or filter the list before the slice and threshold below.
            ranked_candidates = rank_ingredients_openai(
                product,
                candidate_names,
                max_results=final_top_n,
                confidence_threshold=confidence_threshold
            )
        else:
            # Category ranking takes a mapping; the name doubles as its value.
            from api_utils import rank_categories_openai

            categories_dict = {name: name for name in candidate_names}
            ranked_candidates = rank_categories_openai(product, categories_dict)

        return product, [(c[0], c[1]) for c in ranked_candidates[:final_top_n]
                         if c[1] >= confidence_threshold]
    except Exception as e:
        print(f"Error processing {product} with OpenAI: {str(e)}")
        return product, []
238
+
239
+ # Process all products with OpenAI in parallel
240
  openai_results = process_in_parallel(
241
  items=products,
242
  processor_func=process_openai,
 
252
  if product in comparison_results:
253
  comparison_results[product]["openai"] = results
254
 
255
+ # After processing with each method, ensure consistent format
256
+ for product, method_results in comparison_results.items():
257
+ # Ensure all results are in the same format
258
+ for method in method_results:
259
+ formatted_results = []
260
+ for item in method_results[method]:
261
+ # Convert all results to (name, score) tuples
262
+ if isinstance(item, tuple) and len(item) >= 2:
263
+ formatted_results.append((str(item[0]), float(item[1])))
264
+ elif isinstance(item, dict):
265
+ if "ingredient" in item:
266
+ name = item["ingredient"]
267
+ elif "category" in item:
268
+ name = item["category"]
269
+ else:
270
+ name = str(item)
271
+
272
+ if "relevance_score" in item:
273
+ score = float(item["relevance_score"])
274
+ elif "confidence" in item:
275
+ score = float(item["confidence"])
276
+ else:
277
+ score = 0.0
278
+
279
+ formatted_results.append((name, score))
280
+ else:
281
+ formatted_results.append((str(item), 0.0))
282
+
283
+ method_results[method] = formatted_results
284
+
285
  progress_tracker(1.0, desc="Comparison complete")
286
  return comparison_results
287
 
288
+ def compare_ingredient_methods_ui(product_input, embedding_top_n=20,
289
+ final_top_n=3, confidence_threshold=0.5,
290
+ match_type="categories", use_expansion=False, progress=None):
291
  """
292
  Compare multiple ingredient matching methods on the same products
293
 
294
  Args:
295
  product_input: Text input with product names or file path
 
296
  embedding_top_n: Number of top ingredients to retrieve using embeddings
297
  final_top_n: Number of final results to show for each method
298
  confidence_threshold: Minimum score threshold for final results
299
+ match_type: Type of matching to perform ('ingredients' or 'categories')
300
+ use_expansion: Whether to use description expansion
301
  progress: Optional progress tracking object
302
 
303
  Returns:
 
305
  """
306
  from utils import SafeProgress, load_embeddings
307
 
308
+ progress_tracker = SafeProgress(progress, desc="Comparing matching methods")
309
  progress_tracker(0.1, desc="Processing input")
310
 
 
311
  # Split text input by lines and remove empty lines
312
  if not product_input:
313
  return "Please enter at least one product."
 
315
  if not product_names:
316
  return "Please enter at least one product."
317
 
318
+ # Load appropriate embeddings based on match type
319
  try:
320
+ progress_tracker(0.2, desc="Loading embeddings")
321
+ if match_type == "ingredients":
322
+ embeddings_path = "data/ingredient_embeddings_voyageai.pkl"
323
+ embeddings_dict = load_embeddings(embeddings_path)
324
+ header_text = f"Comparing {len(product_names)} products using multiple ingredient matching methods."
325
+ else: # categories
326
+ embeddings_path = "data/category_embeddings.pickle"
327
+ embeddings_dict = load_embeddings(embeddings_path)
328
+ header_text = f"Comparing {len(product_names)} products using multiple category matching methods."
329
+
330
+ # Initialize expanded_products variable
331
+ expanded_products = None
332
+
333
+ # Expand descriptions if requested
334
+ if use_expansion:
335
+ from openai_expansion import expand_product_descriptions
336
+ progress_tracker(0.25, desc="Expanding product descriptions")
337
+ expanded_products = expand_product_descriptions(product_names, progress=progress_tracker)
338
 
339
  progress_tracker(0.3, desc="Comparing methods")
340
  comparison_results = compare_ingredient_methods(
341
  products=product_names,
342
+ ingredients_dict=embeddings_dict,
343
  embedding_top_n=embedding_top_n,
344
  final_top_n=final_top_n,
345
  confidence_threshold=confidence_threshold,
346
+ match_type=match_type,
347
+ progress=progress_tracker,
348
+ expanded_descriptions=expanded_products
349
  )
350
  except Exception as e:
351
  import traceback
 
354
 
355
  # Format results as HTML using centralized formatters
356
  progress_tracker(0.9, desc="Formatting results")
 
357
  result_elements = []
358
  for product in product_names:
359
  if product in comparison_results:
 
361
 
362
  output_html = create_results_container(
363
  result_elements,
364
+ header_text=header_text
365
  )
366
 
367
  progress_tracker(1.0, desc="Complete")
data/category_embeddings_voyageai.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c51642451d7f5853975e974b46d7466c1a4c238f9caaa302c7ad454111c4fed
3
+ size 1275461
ui.py CHANGED
@@ -149,6 +149,20 @@ def create_demo():
149
  label="Confidence threshold"
150
  )
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  compare_btn = gr.Button("Compare Methods", variant="primary")
153
  compare_examples_btn = gr.Button("Load Examples", variant="secondary")
154
 
@@ -160,10 +174,11 @@ def create_demo():
160
  fn=compare_ingredient_methods_ui,
161
  inputs=[
162
  compare_product_input,
163
- gr.State(False), # Always text input mode
164
  compare_embedding_top_n,
165
  compare_final_top_n,
166
- compare_confidence_threshold
 
 
167
  ],
168
  outputs=comparison_output
169
  )
 
149
  label="Confidence threshold"
150
  )
151
 
152
+ compare_match_type = gr.Radio(
153
+ choices=["categories", "ingredients"],
154
+ value="categories",
155
+ label="Match Type",
156
+ info="Choose whether to match against ingredients or categories"
157
+ )
158
+
159
+ # Add expansion checkbox
160
+ compare_expansion = gr.Checkbox(
161
+ value=False,
162
+ label="Use Description Expansion",
163
+ info="Expand product descriptions using AI before matching"
164
+ )
165
+
166
  compare_btn = gr.Button("Compare Methods", variant="primary")
167
  compare_examples_btn = gr.Button("Load Examples", variant="secondary")
168
 
 
174
  fn=compare_ingredient_methods_ui,
175
  inputs=[
176
  compare_product_input,
 
177
  compare_embedding_top_n,
178
  compare_final_top_n,
179
+ compare_confidence_threshold,
180
+ compare_match_type,
181
+ compare_expansion
182
  ],
183
  outputs=comparison_output
184
  )
ui_formatters.py CHANGED
@@ -37,7 +37,7 @@ STYLES = {
37
  "header": f"background-color: {COLORS['header_bg']}; padding: 12px 15px; border-bottom: 1px solid {COLORS['card_border']};",
38
  "header_text": f"margin: 0; font-size: 18px; color: {COLORS['header_text']};",
39
  "flex_container": "display: flex; flex-wrap: wrap;",
40
- "method_container": f"flex: 1; min-width: 200px; padding: 15px; border-right: 1px solid {COLORS['card_border']};",
41
  "method_title": f"margin-top: 0; color: {COLORS['text_primary']}; padding-bottom: 8px;",
42
  "item_list": "list-style-type: none; padding-left: 0;",
43
  "item": "margin-bottom: 8px; padding: 8px; border-radius: 4px;",
@@ -64,7 +64,8 @@ METHOD_NAMES = {
64
  "openai": "OpenAI",
65
  "expanded": "Expanded Description",
66
  "hybrid": "Hybrid Matching",
67
- "categories": "Category Matches"
 
68
  }
69
 
70
  def format_method_results(method_key, results, color_hex=None):
@@ -175,8 +176,8 @@ def format_comparison_html(product, method_results):
175
  Returns:
176
  HTML string
177
  """
178
- # Create the methods comparison content
179
- methods_html = f"<div class='methods-comparison' style='{STYLES['flex_container']}'>"
180
 
181
  # Add results for each method
182
  for method_key in ["base", "voyage", "chicory", "openai"]:
@@ -502,7 +503,7 @@ def set_theme(theme_name):
502
  "header": f"background-color: {COLORS['header_bg']}; padding: 12px 15px; border-bottom: 1px solid {COLORS['card_border']};",
503
  "header_text": f"margin: 0; font-size: 18px; color: {COLORS['header_text']};",
504
  "flex_container": "display: flex; flex-wrap: wrap;",
505
- "method_container": f"flex: 1; min-width: 200px; padding: 15px; border-right: 1px solid {COLORS['card_border']};",
506
  "method_title": f"margin-top: 0; color: {COLORS['text_primary']}; padding-bottom: 8px;",
507
  "item_list": "list-style-type: none; padding-left: 0;",
508
  "item": "margin-bottom: 8px; padding: 8px; border-radius: 4px;",
@@ -512,7 +513,7 @@ def set_theme(theme_name):
512
  return True
513
  return False
514
 
515
- def format_categories_html(product, categories, chicory_result=None, header_color=None, explanation=""):
516
  """
517
  Format category matching results as HTML
518
 
@@ -522,6 +523,7 @@ def format_categories_html(product, categories, chicory_result=None, header_colo
522
  chicory_result: Optional chicory parser result for the product
523
  header_color: Optional header background color
524
  explanation: Optional expanded description text
 
525
 
526
  Returns:
527
  HTML string
@@ -556,9 +558,9 @@ def format_categories_html(product, categories, chicory_result=None, header_colo
556
 
557
  # Add the category results
558
  content += format_method_results(
559
- method_key="categories",
560
  results=categories,
561
- color_hex=header_color or METHOD_COLORS.get("categories", "#1abc9c")
562
  )
563
 
564
  return format_result_card(title=product, content=content)
 
37
  "header": f"background-color: {COLORS['header_bg']}; padding: 12px 15px; border-bottom: 1px solid {COLORS['card_border']};",
38
  "header_text": f"margin: 0; font-size: 18px; color: {COLORS['header_text']};",
39
  "flex_container": "display: flex; flex-wrap: wrap;",
40
+ "method_container": f"flex: 1; width: 100%; padding: 15px; border-bottom: 1px solid {COLORS['card_border']};",
41
  "method_title": f"margin-top: 0; color: {COLORS['text_primary']}; padding-bottom: 8px;",
42
  "item_list": "list-style-type: none; padding-left: 0;",
43
  "item": "margin-bottom: 8px; padding: 8px; border-radius: 4px;",
 
64
  "openai": "OpenAI",
65
  "expanded": "Expanded Description",
66
  "hybrid": "Hybrid Matching",
67
+ "categories": "Category Matches",
68
+ "ingredients": "Ingredient Matches"
69
  }
70
 
71
  def format_method_results(method_key, results, color_hex=None):
 
176
  Returns:
177
  HTML string
178
  """
179
+ # Create the methods comparison content with column direction
180
+ methods_html = f"<div class='methods-comparison' style='{STYLES['flex_container']}; flex-direction: column;'>"
181
 
182
  # Add results for each method
183
  for method_key in ["base", "voyage", "chicory", "openai"]:
 
503
  "header": f"background-color: {COLORS['header_bg']}; padding: 12px 15px; border-bottom: 1px solid {COLORS['card_border']};",
504
  "header_text": f"margin: 0; font-size: 18px; color: {COLORS['header_text']};",
505
  "flex_container": "display: flex; flex-wrap: wrap;",
506
+ "method_container": f"flex: 1; width: 100%; padding: 15px; border-bottom: 1px solid {COLORS['card_border']};",
507
  "method_title": f"margin-top: 0; color: {COLORS['text_primary']}; padding-bottom: 8px;",
508
  "item_list": "list-style-type: none; padding-left: 0;",
509
  "item": "margin-bottom: 8px; padding: 8px; border-radius: 4px;",
 
513
  return True
514
  return False
515
 
516
+ def format_categories_html(product, categories, chicory_result=None, header_color=None, explanation="", match_type="categories"):
517
  """
518
  Format category matching results as HTML
519
 
 
523
  chicory_result: Optional chicory parser result for the product
524
  header_color: Optional header background color
525
  explanation: Optional expanded description text
526
+ match_type: Either "ingredients" or "categories"
527
 
528
  Returns:
529
  HTML string
 
558
 
559
  # Add the category results
560
  content += format_method_results(
561
+ method_key=match_type,
562
  results=categories,
563
+ color_hex=header_color or METHOD_COLORS.get(match_type, "#1abc9c")
564
  )
565
 
566
  return format_result_card(title=product, content=content)
ui_hybrid_matching.py CHANGED
@@ -50,7 +50,7 @@ def categorize_products_with_voyage_reranking(product_input, is_file=False, use_
50
 
51
  # Use hybrid approach for ingredients with optional expanded descriptions
52
  progress_tracker(0.5, desc="Finding and re-ranking ingredients...")
53
- match_results = hybrid_ingredient_matching_voyage(
54
  product_names, embeddings,
55
  embedding_top_n=int(embedding_top_n),
56
  final_top_n=int(final_top_n),
@@ -196,4 +196,93 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
196
  final_results[product] = candidates[:1]
197
 
198
  progress_tracker(1.0, desc="Voyage ingredient matching complete")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  return final_results
 
50
 
51
  # Use hybrid approach for ingredients with optional expanded descriptions
52
  progress_tracker(0.5, desc="Finding and re-ranking ingredients...")
53
+ match_results = hybrid_ingredient_matching(
54
  product_names, embeddings,
55
  embedding_top_n=int(embedding_top_n),
56
  final_top_n=int(final_top_n),
 
196
  final_results[product] = candidates[:1]
197
 
198
  progress_tracker(1.0, desc="Voyage ingredient matching complete")
199
+ return final_results
200
+
201
+ # Category matching: embedding shortlist followed by Voyage AI re-ranking
202
+
203
def hybrid_category_matching_voyage(products, categories_dict,
                                    embedding_top_n=20, final_top_n=5,
                                    confidence_threshold=0.5,
                                    expanded_descriptions=None,
                                    progress=None):
    """Match products to categories with an embedding shortlist re-ranked by Voyage AI.

    Stage 1 embeds every product (using its expanded description when one is
    supplied), scores it against ``categories_dict`` and keeps the first
    ``embedding_top_n`` candidates. Stage 2 sends those candidate names to the
    Voyage reranker and keeps the first ``final_top_n`` reranked entries.

    Args:
        products: Product names to categorize.
        categories_dict: Category data passed straight to
            ``compute_similarities`` (presumably category name -> embedding;
            verify against the caller).
        embedding_top_n: Number of embedding candidates kept per product.
        final_top_n: Number of reranked results kept per product.
        confidence_threshold: Accepted for signature parity but intentionally
            NOT applied here; threshold filtering is left to the caller.
        expanded_descriptions: Optional mapping of product name -> expanded
            description text, used for both embedding and the rerank query.
        progress: Optional progress object, wrapped in ``SafeProgress``.

    Returns:
        Dict mapping each product name to a list of ``(category, score)``
        tuples. A product with no embedding candidates maps to ``[]``. If the
        rerank call fails, the product maps to its first embedding candidate.
    """
    from utils import SafeProgress
    from embeddings import create_product_embeddings
    from similarity import compute_similarities

    def _field(obj, name):
        """Read ``name`` from a rerank response given as an object or a mapping."""
        # The Voyage client documents attribute access (response.results,
        # result.relevance_score); mapping access is kept as a fallback so a
        # dict-returning wrapper keeps working.
        try:
            return getattr(obj, name)
        except AttributeError:
            return obj[name]

    progress_tracker = SafeProgress(progress, desc="Voyage category matching")
    progress_tracker(0.1, desc="Stage 1: Finding candidate categories with embeddings")

    # Stage 1: embed products and shortlist candidate categories.
    if expanded_descriptions:
        # Embed the expanded text, but key the result by the original product
        # name so later lookups stay consistent.
        products_for_embedding = [expanded_descriptions.get(name, name) for name in products]
        temp_embeddings = create_product_embeddings(products_for_embedding, progress=progress_tracker)
        product_embeddings = {
            product_name: temp_embeddings[embedded_text]
            for product_name, embedded_text in zip(products, products_for_embedding)
            if embedded_text in temp_embeddings
        }
    else:
        product_embeddings = create_product_embeddings(products, progress=progress_tracker)

    similarities = compute_similarities(categories_dict, product_embeddings)

    # Keep only the leading candidates per product.
    # NOTE(review): assumes compute_similarities returns each list ordered
    # best-first - confirm.
    embedding_results = {
        product: product_similarities[:embedding_top_n]
        for product, product_similarities in similarities.items()
    }

    progress_tracker(0.4, desc="Stage 2: Re-ranking with Voyage AI")

    voyage_client = get_voyage_client()

    # Stage 2: re-rank each product's shortlist.
    final_results = {}
    for i, product in enumerate(products):
        progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")

        candidates = embedding_results.get(product)
        if not candidates:
            final_results[product] = []
            continue

        candidate_categories = [c[0] for c in candidates]

        # Prefer the expanded description as the query text when available.
        product_text = product
        if expanded_descriptions and product in expanded_descriptions:
            product_text = expanded_descriptions[product]

        try:
            reranked = voyage_client.rerank(
                query=f"Which food category best matches: {product_text}",
                documents=candidate_categories,
                model="rerank-2"
            )

            voyage_results = [
                (_field(result, "document"), _field(result, "relevance_score"))
                for result in _field(reranked, "results")
            ]

            # Limit to final_top_n; the confidence threshold is deliberately
            # not applied at this stage.
            final_results[product] = voyage_results[:final_top_n]

        except Exception as e:
            # Best-effort: a reranker failure must not abort the whole batch.
            print(f"Error during Voyage category reranking for '{product}': {e}")
            # Fall back to the top embedding candidate.
            final_results[product] = candidates[:1]

    progress_tracker(1.0, desc="Voyage category matching complete")
    return final_results
ui_ingredient_matching.py CHANGED
@@ -72,7 +72,8 @@ def categorize_products(product_input, is_file=False, use_expansion=False, top_n
72
  product,
73
  top_similarities,
74
  chicory_result=chicory_data,
75
- explanation=expansion_text
 
76
  )
77
  output_html += "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"
78
 
 
72
  product,
73
  top_similarities,
74
  chicory_result=chicory_data,
75
+ explanation=expansion_text,
76
+ match_type="ingredients",
77
  )
78
  output_html += "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"
79