esilver committed on
Commit
9a56a50
·
1 Parent(s): 8f1969e

some consolidation

Browse files
Files changed (6) hide show
  1. api_utils.py +2 -2
  2. category_matching.py +1 -1
  3. ui.py +54 -101
  4. ui_expanded_matching.py +50 -268
  5. ui_formatters.py +241 -122
  6. ui_hybrid_matching.py +50 -36
api_utils.py CHANGED
@@ -112,7 +112,7 @@ def openai_structured_query(
112
  prompt: str,
113
  system_message: str = "You are a helpful assistant.",
114
  schema: dict = None,
115
- model: str = "o3-mini",
116
  client=None,
117
  schema_name: str = "structured_output"
118
  ) -> dict:
@@ -233,7 +233,7 @@ def rank_ingredients_openai(
233
  # Make the API call directly for more control
234
  response = client.responses.create(
235
  model=model,
236
- reasoning={"effort": "low"},
237
  input=[
238
  {"role": "system", "content": f"You are a food ingredient matching expert. Rank the top {max_results} ingredient based on how well they match the given product. Only include ingredients with relevance score >= {confidence_threshold}."},
239
  {"role": "user", "content": prompt}
 
112
  prompt: str,
113
  system_message: str = "You are a helpful assistant.",
114
  schema: dict = None,
115
+ model: str = "gpt-4o-mini",
116
  client=None,
117
  schema_name: str = "structured_output"
118
  ) -> dict:
 
233
  # Make the API call directly for more control
234
  response = client.responses.create(
235
  model=model,
236
+ # reasoning={"effort": "low"},
237
  input=[
238
  {"role": "system", "content": f"You are a food ingredient matching expert. Rank the top {max_results} ingredient based on how well they match the given product. Only include ingredients with relevance score >= {confidence_threshold}."},
239
  {"role": "user", "content": prompt}
category_matching.py CHANGED
@@ -220,7 +220,7 @@ def hybrid_category_matching(products: List[str], categories: Dict[str, str],
220
 
221
  # Extract just the category descriptions for re-ranking
222
  candidate_ids = [c[0] for c in candidates]
223
- candidate_texts = [f"Category: {c[1]}" for c in candidates]
224
 
225
  try:
226
  # Apply re-ranking to the candidates
 
220
 
221
  # Extract just the category descriptions for re-ranking
222
  candidate_ids = [c[0] for c in candidates]
223
+ candidate_texts = [f"{c[1]}" for c in candidates]
224
 
225
  try:
226
  # Apply re-ranking to the candidates
ui.py CHANGED
@@ -1,16 +1,12 @@
1
  import gradio as gr
2
  from comparison import compare_ingredient_methods_ui
3
 
4
- # Import from our new UI modules
5
  from ui_core import embeddings, get_css, load_examples
6
  from ui_ingredient_matching import categorize_products
7
  from ui_category_matching import categorize_products_by_category
8
-
9
  from ui_hybrid_matching import categorize_products_with_voyage_reranking
10
-
11
-
12
  from ui_expanded_matching import categorize_products_with_openai_reranking
13
- from ui_formatters import get_formatted_css
14
 
15
  def create_demo():
16
  """Create the Gradio interface"""
@@ -19,7 +15,7 @@ def create_demo():
19
 
20
  with gr.Tabs() as tabs:
21
  # Original Ingredient Matching Tab
22
- with gr.TabItem("Ingredient Matching"):
23
  with gr.Row():
24
  with gr.Column(scale=1):
25
  # Input section
@@ -43,7 +39,7 @@ def create_demo():
43
 
44
 
45
  # New Category Matching Tab
46
- with gr.TabItem("Category Matching"):
47
  with gr.Row():
48
  with gr.Column(scale=1):
49
  # Input section
@@ -65,73 +61,59 @@ def create_demo():
65
  # Results section
66
  category_output = gr.HTML(label="Category Matching Results", elem_id="results-container")
67
 
68
- # Replace the "Hybrid Category Matching" tab
69
- with gr.TabItem("Voyage AI Reranking"):
70
- with gr.Row():
71
- with gr.Column(scale=1):
72
- # Input section
73
- voyage_text_input = gr.Textbox(
74
- lines=10,
75
- placeholder="Enter product names, one per line",
76
- label="Product Names"
77
- )
78
- voyage_input_controls = gr.Row()
79
- with voyage_input_controls:
80
- voyage_expansion_switch = gr.Checkbox(value=False, label="Use Description Expansion",
81
- info="Expand product descriptions using AI before matching")
82
-
83
- voyage_embedding_top_n = gr.Slider(1, 50, 20, step=1, label="Embedding Top N Results")
84
- voyage_final_top_n = gr.Slider(1, 10, 5, step=1, label="Final Top N Categories")
85
- voyage_confidence = gr.Slider(0.1, 0.9, 0.5, label="Matching Threshold")
86
-
87
- # Add this to the Voyage AI tab, similar to the OpenAI tab:
88
- voyage_match_type = gr.Radio(
89
- choices=["ingredients", "categories"],
90
- value="categories", # Default to categories since that was the original focus
91
- label="Match Type",
92
- info="Choose whether to match against ingredients or categories"
93
- )
 
 
 
 
 
94
 
95
- with gr.Row():
96
- voyage_examples_btn = gr.Button("Load Examples", variant="secondary")
97
- voyage_match_btn = gr.Button("Match using Voyage Reranking", variant="primary")
98
 
99
- with gr.Column(scale=1):
100
- # Results section
101
- voyage_output = gr.HTML(label="Voyage Reranking Results", elem_id="results-container")
102
-
103
- # Replace the "Expanded Description Matching" tab
104
- with gr.TabItem("OpenAI Reranking"):
105
- with gr.Row():
106
- with gr.Column(scale=1):
107
- # Input section
108
- openai_text_input = gr.Textbox(
109
- lines=10,
110
- placeholder="Enter product names, one per line",
111
- label="Product Names"
112
- )
113
- openai_input_controls = gr.Row()
114
- with openai_input_controls:
115
- openai_expansion_switch = gr.Checkbox(value=False, label="Use Description Expansion",
116
- info="Expand product descriptions using AI before matching")
117
- openai_top_n = gr.Slider(1, 20, 10, step=1, label="Top N Results")
118
- openai_confidence = gr.Slider(0.1, 0.9, 0.5, label="Matching Threshold")
119
-
120
- # Add toggle here for matching type
121
- openai_match_type = gr.Radio(
122
- choices=["ingredients", "categories"],
123
- value="ingredients",
124
- label="Match Type",
125
- info="Choose whether to match against ingredients or categories"
126
- )
127
-
128
- with gr.Row():
129
- openai_match_btn = gr.Button("Match with OpenAI Reranking", variant="primary")
130
- openai_examples_btn = gr.Button("Load Examples")
131
 
132
- with gr.Column(scale=1):
133
- # Results section
134
- openai_output = gr.HTML(label="OpenAI Reranking Results", elem_id="results-container")
 
 
 
 
 
 
135
 
136
  # New Comparison Tab
137
  with gr.TabItem("Compare Methods"):
@@ -196,37 +178,8 @@ def create_demo():
196
  inputs=[category_text_input, gr.State(False), category_top_n, category_confidence],
197
  outputs=[category_output],
198
  )
199
-
200
-
201
- # Connect buttons for Voyage reranking (previously hybrid matching)
202
- voyage_match_btn.click(
203
- fn=categorize_products_with_voyage_reranking, # New function to create
204
- inputs=[voyage_text_input, gr.State(False), voyage_expansion_switch, voyage_embedding_top_n,
205
- voyage_final_top_n, voyage_confidence, voyage_match_type],
206
- outputs=[voyage_output],
207
- )
208
-
209
- voyage_examples_btn.click(
210
- fn=load_examples,
211
- inputs=[],
212
- outputs=voyage_text_input
213
- )
214
-
215
- # Connect buttons for OpenAI reranking (previously expanded description matching)
216
- openai_match_btn.click(
217
- fn=categorize_products_with_openai_reranking, # New function to create
218
- inputs=[openai_text_input, gr.State(False), openai_expansion_switch,
219
- openai_top_n, openai_confidence, openai_match_type],
220
- outputs=[openai_output],
221
- )
222
-
223
- openai_examples_btn.click(
224
- fn=load_examples,
225
- inputs=[],
226
- outputs=openai_text_input
227
- )
228
 
229
- # Examples buttons
230
  examples_btn.click(
231
  fn=load_examples,
232
  inputs=[],
 
1
  import gradio as gr
2
  from comparison import compare_ingredient_methods_ui
3
 
4
+ # Import from our UI modules
5
  from ui_core import embeddings, get_css, load_examples
6
  from ui_ingredient_matching import categorize_products
7
  from ui_category_matching import categorize_products_by_category
 
8
  from ui_hybrid_matching import categorize_products_with_voyage_reranking
 
 
9
  from ui_expanded_matching import categorize_products_with_openai_reranking
 
10
 
11
  def create_demo():
12
  """Create the Gradio interface"""
 
15
 
16
  with gr.Tabs() as tabs:
17
  # Original Ingredient Matching Tab
18
+ with gr.TabItem("Ingredient Embeddings"):
19
  with gr.Row():
20
  with gr.Column(scale=1):
21
  # Input section
 
39
 
40
 
41
  # New Category Matching Tab
42
+ with gr.TabItem("Category Embeddings"):
43
  with gr.Row():
44
  with gr.Column(scale=1):
45
  # Input section
 
61
  # Results section
62
  category_output = gr.HTML(label="Category Matching Results", elem_id="results-container")
63
 
64
+ # Common function to create reranking UI tabs
65
+ def create_reranking_tab(tab_name, fn_name, default_match="ingredients"):
66
+ with gr.TabItem(tab_name):
67
+ with gr.Row():
68
+ with gr.Column(scale=1):
69
+ # Input section
70
+ tab_input = gr.Textbox(
71
+ lines=10,
72
+ placeholder="Enter product names, one per line",
73
+ label="Product Names"
74
+ )
75
+ with gr.Row():
76
+ tab_expansion = gr.Checkbox(
77
+ value=False,
78
+ label="Use Description Expansion",
79
+ info="Expand product descriptions using AI before matching"
80
+ )
81
+ tab_emb_top_n = gr.Slider(1, 50, 20, step=1, label="Embedding Top N Results")
82
+ tab_top_n = gr.Slider(1, 10, 5, step=1, label="Final Top N Results")
83
+ tab_confidence = gr.Slider(0.1, 0.9, 0.5, label="Matching Threshold")
84
+
85
+ tab_match_type = gr.Radio(
86
+ choices=["ingredients", "categories"],
87
+ value=default_match,
88
+ label="Match Type",
89
+ info="Choose whether to match against ingredients or categories"
90
+ )
91
+
92
+ with gr.Row():
93
+ tab_examples_btn = gr.Button("Load Examples", variant="secondary")
94
+ tab_match_btn = gr.Button(f"Match using {tab_name}", variant="primary")
95
 
96
+ with gr.Column(scale=1):
97
+ # Results section
98
+ tab_output = gr.HTML(label=f"{tab_name} Results", elem_id="results-container")
99
 
100
+ # Connect button events
101
+ tab_match_btn.click(
102
+ fn=fn_name,
103
+ inputs=[tab_input, gr.State(False), tab_expansion, tab_emb_top_n,
104
+ tab_top_n, tab_confidence, tab_match_type],
105
+ outputs=[tab_output],
106
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
+ tab_examples_btn.click(
109
+ fn=load_examples,
110
+ inputs=[],
111
+ outputs=tab_input
112
+ )
113
+
114
+ # Create the reranking tabs using the shared function
115
+ create_reranking_tab("Voyage AI Reranking", categorize_products_with_voyage_reranking, "categories")
116
+ create_reranking_tab("OpenAI Reranking", categorize_products_with_openai_reranking, "ingredients")
117
 
118
  # New Comparison Tab
119
  with gr.TabItem("Compare Methods"):
 
178
  inputs=[category_text_input, gr.State(False), category_top_n, category_confidence],
179
  outputs=[category_output],
180
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
 
182
+ # Examples buttons for the first two tabs
183
  examples_btn.click(
184
  fn=load_examples,
185
  inputs=[],
ui_expanded_matching.py CHANGED
@@ -4,248 +4,19 @@ from embeddings import create_product_embeddings
4
  from similarity import compute_similarities
5
  from openai_expansion import expand_product_descriptions
6
  from ui_core import embeddings, parse_input, CATEGORY_EMBEDDINGS_PATH
7
- from ui_formatters import format_expanded_results_html, create_results_container
8
  from api_utils import get_openai_client, process_in_parallel, rank_ingredients_openai, rank_categories_openai
9
  from category_matching import load_categories, load_category_embeddings
10
  import json
11
- import os
12
 
13
-
14
- def categorize_products_with_expansion(product_input, is_file=False, top_n=10, confidence_threshold=0.5, match_type="ingredients", progress=gr.Progress()):
15
- """
16
- Categorize products using expanded descriptions from OpenAI
17
-
18
- Args:
19
- product_input: Text input with product names
20
- is_file: Whether the input is a file
21
- top_n: Number of top results to show
22
- confidence_threshold: Confidence threshold for matches
23
- match_type: Either "ingredients" or "categories"
24
- progress: Progress tracking object
25
-
26
- Returns:
27
- HTML formatted results
28
- """
29
- progress_tracker = SafeProgress(progress)
30
- progress_tracker(0, desc="Starting...")
31
-
32
- # Parse input
33
- product_names, error = parse_input(product_input, is_file)
34
- if error:
35
- return error
36
-
37
- # Validate embeddings are loaded if doing ingredient matching
38
- if match_type == "ingredients" and not embeddings:
39
- return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"
40
-
41
- # Expand product descriptions
42
- progress_tracker(0.2, desc="Expanding product descriptions...")
43
- expanded_descriptions = expand_product_descriptions(product_names, progress=progress)
44
-
45
- if not expanded_descriptions:
46
- return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to expand product descriptions. Please try again or check your OpenAI API key.</div>"
47
-
48
- # Get shared OpenAI client
49
- openai_client = get_openai_client()
50
-
51
- if match_type == "ingredients":
52
- # Generate product embeddings
53
- progress_tracker(0.4, desc="Generating product embeddings...")
54
- product_embeddings = create_product_embeddings(product_names, progress=progress)
55
-
56
- # Compute embedding similarities for ingredients
57
- progress_tracker(0.6, desc="Computing ingredient similarities...")
58
- all_similarities = compute_similarities(embeddings, product_embeddings)
59
-
60
- if not all_similarities:
61
- return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No similarities found. Please try different product names.</div>"
62
-
63
- # Setup for OpenAI reranking
64
- embedding_top_n = 20 # Number of candidates to consider from embeddings
65
-
66
- progress_tracker(0.7, desc="Re-ranking with expanded descriptions...")
67
-
68
- # Function for processing each product
69
- def process_reranking(product):
70
- if product not in all_similarities:
71
- return product, []
72
-
73
- candidates = all_similarities[product][:embedding_top_n]
74
- if not candidates:
75
- return product, []
76
-
77
- candidate_ingredients = [c[0] for c in candidates]
78
- expanded_text = expanded_descriptions.get(product, "")
79
-
80
- try:
81
- # Use the shared utility function
82
- reranked_ingredients = rank_ingredients_openai(
83
- product=product,
84
- candidates=candidate_ingredients,
85
- expanded_description=expanded_text,
86
- client=openai_client,
87
- model="o3-mini",
88
- max_results=top_n,
89
- confidence_threshold=confidence_threshold,
90
- debug=True
91
- )
92
-
93
- return product, reranked_ingredients
94
-
95
- except Exception as e:
96
- print(f"Error reranking {product}: {e}")
97
- # Fall back to top embedding match
98
- return product, candidates[:1] if candidates[0][1] >= confidence_threshold else []
99
-
100
- # Process all products in parallel
101
- final_results = process_in_parallel(
102
- items=product_names,
103
- processor_func=process_reranking,
104
- max_workers=min(10, len(product_names)),
105
- progress_tracker=progress_tracker,
106
- progress_start=0.7,
107
- progress_end=0.9,
108
- progress_desc="Re-ranking"
109
- )
110
-
111
- else: # categories
112
- # Load category embeddings instead of JSON categories
113
- progress_tracker(0.5, desc="Loading category embeddings...")
114
- category_embeddings = load_category_embeddings()
115
-
116
- if not category_embeddings:
117
- return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No category embeddings found. Please check that the embeddings file exists at data/category_embeddings.pickle.</div>"
118
-
119
- # Generate product embeddings
120
- progress_tracker(0.6, desc="Generating product embeddings...")
121
- product_embeddings = create_product_embeddings(product_names, progress=progress)
122
-
123
- # Compute embedding similarities for categories
124
- progress_tracker(0.7, desc="Computing category similarities...")
125
- all_similarities = compute_similarities(category_embeddings, product_embeddings)
126
- print(f'All similarities: {all_similarities}')
127
- if not all_similarities:
128
- return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No category similarities found. Please try different product names.</div>"
129
-
130
- embedding_top_n = min(20, top_n * 2) # Number of candidates to consider from embeddings
131
-
132
- # Collect all needed category IDs first
133
- needed_category_ids = set()
134
- for product, similarities in all_similarities.items():
135
- for category_id, score in similarities[:embedding_top_n]:
136
- if score >= confidence_threshold:
137
- needed_category_ids.add(category_id)
138
-
139
- # Load only the needed categories from JSON
140
- progress_tracker(0.75, desc="Loading category descriptions...")
141
- category_descriptions = {}
142
- if needed_category_ids:
143
- try:
144
- with open("categories.json", 'r') as f:
145
- categories_list = json.load(f)
146
- for item in categories_list:
147
- if item["id"] in needed_category_ids:
148
- category_descriptions[item["id"]] = item["text"]
149
- except Exception as e:
150
- print(f"Error loading category descriptions: {e}")
151
-
152
- # Function to process each product
153
- def process_category_matching(product):
154
- if product not in all_similarities:
155
- return product, []
156
-
157
- # candidates = all_similarities[product][:embedding_top_n]
158
- candidates = all_similarities[product][:embedding_top_n]
159
- print(f'candidates: {candidates}')
160
- if not candidates:
161
- return product, []
162
-
163
- # Get the expanded description
164
- expanded_text = expanded_descriptions.get(product, "")
165
-
166
- try:
167
- # Use rank_categories_openai instead of match_products_to_categories_with_description
168
- category_matches = rank_categories_openai(
169
- product=product,
170
- categories=category_descriptions,
171
- expanded_description=expanded_text,
172
- client=openai_client,
173
- # model="o3-mini",
174
- model="gpt-4o-mini",
175
- # model="gpt-4o",
176
- max_results=top_n,
177
- confidence_threshold=confidence_threshold,
178
- debug=True
179
- )
180
-
181
- # Format results with category descriptions if needed
182
- formatted_matches = []
183
- for category_id, score in category_matches:
184
- category_text = category_descriptions.get(category_id, "Unknown category")
185
- formatted_matches.append((category_id, category_text, score))
186
-
187
- return product, formatted_matches
188
- except Exception as e:
189
- print(f"Error matching {product} to categories: {e}")
190
- return product, []
191
-
192
- # Process all products in parallel
193
- final_results = process_in_parallel(
194
- items=product_names,
195
- processor_func=process_category_matching,
196
- max_workers=min(10, len(product_names)),
197
- progress_tracker=progress_tracker,
198
- progress_start=0.7,
199
- progress_end=0.9,
200
- progress_desc="Category matching"
201
- )
202
-
203
- # Format results
204
- progress_tracker(0.9, desc="Formatting results...")
205
-
206
- result_elements = []
207
- for product, matches in final_results.items():
208
- result_elements.append(
209
- format_expanded_results_html(
210
- product=product,
211
- results=matches,
212
- expanded_description=expanded_descriptions.get(product, ""),
213
- match_type=match_type
214
- )
215
- )
216
-
217
- output_html = create_results_container(
218
- result_elements,
219
- header_text=f"Matched {len(product_names)} products to {match_type} using expanded descriptions."
220
- )
221
-
222
- if not final_results:
223
- output_html = "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"
224
-
225
- progress_tracker(1.0, desc="Done!")
226
- return output_html
227
-
228
- def categorize_products_with_openai_reranking(product_input, is_file=False, expansion_strength=0.0,
229
- top_n=10, confidence_threshold=0.5, match_type="ingredients",
230
- progress=gr.Progress()):
231
  """
232
  Categorize products using OpenAI reranking with optional description expansion
233
-
234
- Args:
235
- product_input: Text input with product names
236
- is_file: Whether the input is a file
237
- expansion_strength: 0.0-1.0 slider value for description expansion (0=none, 1=full)
238
- top_n: Number of top results to show
239
- confidence_threshold: Confidence threshold for matches
240
- match_type: Either "ingredients" or "categories"
241
- progress: Progress tracking object
242
-
243
- Returns:
244
- HTML formatted results
245
  """
246
  progress_tracker = SafeProgress(progress)
247
  progress_tracker(0, desc="Starting OpenAI reranking...")
248
-
249
  # Parse input
250
  product_names, error = parse_input(product_input, is_file)
251
  if error:
@@ -254,15 +25,11 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, expa
254
  # Validate embeddings are loaded if doing ingredient matching
255
  if match_type == "ingredients" and not embeddings:
256
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"
257
-
258
  # Optional description expansion
259
  expanded_descriptions = {}
260
- if expansion_strength > 0:
261
  progress_tracker(0.2, desc="Expanding product descriptions...")
262
  expanded_descriptions = expand_product_descriptions(product_names, progress=progress)
263
- else:
264
- # If no expansion, use product names as is (minimal descriptions)
265
- expanded_descriptions = {product: product for product in product_names}
266
 
267
  # Get shared OpenAI client
268
  openai_client = get_openai_client()
@@ -279,9 +46,6 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, expa
279
  if not all_similarities:
280
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No similarities found. Please try different product names.</div>"
281
 
282
- # Setup for OpenAI reranking
283
- embedding_top_n = 20 # Number of candidates to consider from embeddings
284
-
285
  progress_tracker(0.7, desc="Re-ranking with OpenAI...")
286
 
287
  # Function for processing each product
@@ -294,10 +58,11 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, expa
294
  return product, []
295
 
296
  candidate_ingredients = [c[0] for c in candidates]
297
- expanded_text = expanded_descriptions.get(product, product)
298
 
299
  try:
300
- # Use the shared utility function
 
301
  reranked_ingredients = rank_ingredients_openai(
302
  product=product,
303
  candidates=candidate_ingredients,
@@ -305,7 +70,7 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, expa
305
  client=openai_client,
306
  model="o3-mini",
307
  max_results=top_n,
308
- confidence_threshold=confidence_threshold,
309
  debug=True
310
  )
311
 
@@ -314,7 +79,7 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, expa
314
  except Exception as e:
315
  print(f"Error reranking {product}: {e}")
316
  # Fall back to top embedding match
317
- return product, candidates[:1] if candidates[0][1] >= confidence_threshold else []
318
 
319
  # Process all products in parallel
320
  final_results = process_in_parallel(
@@ -346,14 +111,11 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, expa
346
  if not all_similarities:
347
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No category similarities found. Please try different product names.</div>"
348
 
349
- embedding_top_n = min(20, top_n * 2) # Number of candidates to consider from embeddings
350
-
351
- # Collect all needed category IDs first
352
  needed_category_ids = set()
353
  for product, similarities in all_similarities.items():
354
  for category_id, score in similarities[:embedding_top_n]:
355
- if score >= confidence_threshold:
356
- needed_category_ids.add(category_id)
357
 
358
  # Load only the needed categories from JSON
359
  progress_tracker(0.75, desc="Loading category descriptions...")
@@ -378,9 +140,10 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, expa
378
  return product, []
379
 
380
  # Get the expanded description or use product name if no expansion
381
- expanded_text = expanded_descriptions.get(product, product)
382
 
383
  try:
 
384
  category_matches = rank_categories_openai(
385
  product=product,
386
  categories=category_descriptions,
@@ -388,7 +151,7 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, expa
388
  client=openai_client,
389
  model="gpt-4o-mini",
390
  max_results=top_n,
391
- confidence_threshold=confidence_threshold,
392
  debug=True
393
  )
394
 
@@ -417,24 +180,43 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, expa
417
  # Format results
418
  progress_tracker(0.9, desc="Formatting results...")
419
 
420
- result_elements = []
 
 
421
  for product, matches in final_results.items():
422
- result_elements.append(
423
- format_expanded_results_html(
424
- product=product,
425
- results=matches,
426
- expanded_description=expanded_descriptions.get(product, ""),
427
- match_type=match_type
428
- )
429
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
430
 
431
- output_html = create_results_container(
432
- result_elements,
433
- header_text=f"Matched {len(product_names)} products to {match_type} using OpenAI reranking."
434
- )
435
 
436
- if not final_results:
437
- output_html = "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"
 
 
 
 
 
 
438
 
439
  progress_tracker(1.0, desc="Done!")
440
- return output_html
 
4
  from similarity import compute_similarities
5
  from openai_expansion import expand_product_descriptions
6
  from ui_core import embeddings, parse_input, CATEGORY_EMBEDDINGS_PATH
7
+ from ui_formatters import format_reranking_results_html
8
  from api_utils import get_openai_client, process_in_parallel, rank_ingredients_openai, rank_categories_openai
9
  from category_matching import load_categories, load_category_embeddings
10
  import json
 
11
 
12
+ def categorize_products_with_openai_reranking(product_input, is_file=False, use_expansion=False,
13
+ embedding_top_n=20, top_n=10, confidence_threshold=0.5,
14
+ match_type="ingredients", progress=gr.Progress()):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  """
16
  Categorize products using OpenAI reranking with optional description expansion
 
 
 
 
 
 
 
 
 
 
 
 
17
  """
18
  progress_tracker = SafeProgress(progress)
19
  progress_tracker(0, desc="Starting OpenAI reranking...")
 
20
  # Parse input
21
  product_names, error = parse_input(product_input, is_file)
22
  if error:
 
25
  # Validate embeddings are loaded if doing ingredient matching
26
  if match_type == "ingredients" and not embeddings:
27
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"
 
28
  # Optional description expansion
29
  expanded_descriptions = {}
30
+ if use_expansion:
31
  progress_tracker(0.2, desc="Expanding product descriptions...")
32
  expanded_descriptions = expand_product_descriptions(product_names, progress=progress)
 
 
 
33
 
34
  # Get shared OpenAI client
35
  openai_client = get_openai_client()
 
46
  if not all_similarities:
47
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No similarities found. Please try different product names.</div>"
48
 
 
 
 
49
  progress_tracker(0.7, desc="Re-ranking with OpenAI...")
50
 
51
  # Function for processing each product
 
58
  return product, []
59
 
60
  candidate_ingredients = [c[0] for c in candidates]
61
+ expanded_text = expanded_descriptions.get(product, product) if use_expansion else product
62
 
63
  try:
64
+ # Use the shared utility function - now passing 0.0 as threshold to get all results
65
+ # We'll apply the threshold at display time
66
  reranked_ingredients = rank_ingredients_openai(
67
  product=product,
68
  candidates=candidate_ingredients,
 
70
  client=openai_client,
71
  model="o3-mini",
72
  max_results=top_n,
73
+ confidence_threshold=0.0, # Don't filter here, do it at display time
74
  debug=True
75
  )
76
 
 
79
  except Exception as e:
80
  print(f"Error reranking {product}: {e}")
81
  # Fall back to top embedding match
82
+ return product, candidates[:1] # Don't filter here
83
 
84
  # Process all products in parallel
85
  final_results = process_in_parallel(
 
111
  if not all_similarities:
112
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No category similarities found. Please try different product names.</div>"
113
 
114
+ # Collect all needed category IDs first - don't filter by threshold here
 
 
115
  needed_category_ids = set()
116
  for product, similarities in all_similarities.items():
117
  for category_id, score in similarities[:embedding_top_n]:
118
+ needed_category_ids.add(category_id)
 
119
 
120
  # Load only the needed categories from JSON
121
  progress_tracker(0.75, desc="Loading category descriptions...")
 
140
  return product, []
141
 
142
  # Get the expanded description or use product name if no expansion
143
+ expanded_text = expanded_descriptions.get(product, product) if use_expansion else product
144
 
145
  try:
146
+ # Pass 0.0 as threshold to get all results - apply threshold at display time
147
  category_matches = rank_categories_openai(
148
  product=product,
149
  categories=category_descriptions,
 
151
  client=openai_client,
152
  model="gpt-4o-mini",
153
  max_results=top_n,
154
+ confidence_threshold=0.0, # Don't filter here
155
  debug=True
156
  )
157
 
 
180
  # Format results
181
  progress_tracker(0.9, desc="Formatting results...")
182
 
183
+ # Create a list of result dictionaries in consistent format
184
+ formatted_results = []
185
+
186
  for product, matches in final_results.items():
187
+ # Include all products, even with no matches
188
+ formatted_result = {
189
+ "product_name": product,
190
+ "confidence": max([item[-1] for item in matches]) if matches else 0,
191
+ "matching_items": [],
192
+ "item_scores": [], # Add item_scores to align with Voyage implementation
193
+ "explanation": expanded_descriptions.get(product, "") if use_expansion else ""
194
+ }
195
+
196
+ # Format matching items based on match type
197
+ if match_type == "ingredients":
198
+ formatted_result["matching_items"] = [item for item, score in matches]
199
+ formatted_result["item_scores"] = [score for item, score in matches]
200
+ else: # categories
201
+ for cat_id, cat_desc, score in matches:
202
+ formatted_result["matching_items"].append(
203
+ f"{cat_id}: {cat_desc}" if cat_desc else f"{cat_id}"
204
+ )
205
+ formatted_result["item_scores"].append(score)
206
+
207
+ formatted_results.append(formatted_result)
208
 
209
+ if not formatted_results:
210
+ return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"
 
 
211
 
212
+ result_html = format_reranking_results_html(
213
+ results=formatted_results,
214
+ match_type=match_type,
215
+ show_scores=True,
216
+ include_explanation=use_expansion,
217
+ method="openai",
218
+ confidence_threshold=confidence_threshold # Pass the threshold to the formatter
219
+ )
220
 
221
  progress_tracker(1.0, desc="Done!")
222
+ return result_html
ui_formatters.py CHANGED
@@ -190,34 +190,92 @@ def format_comparison_html(product, method_results):
190
  # Create the full card with the methods content
191
  return format_result_card(title=product, content=methods_html)
192
 
193
- def format_expanded_results_html(product, results, expanded_description, match_type="ingredients"):
194
- """Format results using expanded descriptions"""
195
- content = ""
 
 
196
 
197
- # Add expanded description section using shared function
198
- content += format_info_panel("Expanded Description", expanded_description)
 
 
 
 
 
 
 
 
 
 
 
199
 
200
- # Add results section using shared function
201
- title_text = "Ingredients" if match_type == "ingredients" else "Categories"
202
- content += format_results_section(results, title_text, match_type)
203
 
204
- return format_result_card(title=product, content=content)
205
-
206
- def format_hybrid_results_html(product, results, summary, expanded_description=""):
207
- """Format the hybrid matching results as HTML."""
208
- content = ""
209
-
210
- # Add expanded description if provided
211
- if expanded_description:
212
- content += format_info_panel("Expanded Description", expanded_description)
213
 
214
- # Add summary
215
- content += f"<p>{summary}</p>"
216
 
217
- # Add results section using shared function
218
- content += format_results_section(results, "Results", "hybrid")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
 
220
- return format_result_card(title=product, content=content)
 
221
 
222
  def create_results_container(html_elements, header_text=None):
223
  """
@@ -240,46 +298,136 @@ def create_results_container(html_elements, header_text=None):
240
 
241
  return container
242
 
243
- def format_categories_html(product, categories, chicory_result=None, header_color=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
  """
245
- Format category matching results as HTML
246
 
247
  Args:
248
  product: Product name
249
- categories: List of (category, score) tuples
250
- chicory_result: Optional chicory parser result for the product
251
- header_color: Optional header background color
 
252
 
253
  Returns:
254
- HTML string
255
  """
256
  content = ""
257
 
258
- # Add Chicory results if available
259
- if chicory_result:
260
  content += f"<div style='{STYLES['info_panel']}'>"
261
- content += "<h4 style='margin-top: 0; border-bottom: 1px solid rgba(0,0,0,0.1); padding-bottom: 8px;'>Chicory Parser Results</h4>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262
 
263
- if isinstance(chicory_result, dict):
264
- ingredient = chicory_result.get("ingredient", "Not found")
265
- confidence = chicory_result.get("confidence", 0)
266
- confidence_percent = int(confidence * 100)
 
267
 
268
- content += f"<div style='display: flex; justify-content: space-between; align-items: center; padding: 8px; border-radius: 4px;'>"
269
- content += f"<span style='font-weight: bold;'>{ingredient}</span>"
270
- content += f"<span style='background-color: {get_confidence_bg_color(confidence)}; border: 1px solid {get_confidence_color(confidence)}; color: #000; font-weight: 600; padding: 2px 6px; border-radius: 4px; min-width: 70px; text-align: center;'>Confidence: {confidence_percent}%</span>"
271
- content += "</div>"
272
- else:
273
- content += f"<p style='{STYLES['empty_message']}'>No Chicory results available</p>"
 
274
 
 
275
  content += "</div>"
276
-
277
- # Add the category results
278
- content += format_method_results(
279
- method_key="categories",
280
- results=categories,
281
- color_hex=header_color or METHOD_COLORS.get("categories", "#1abc9c")
282
- )
283
 
284
  return format_result_card(title=product, content=content)
285
 
@@ -288,7 +436,7 @@ def get_formatted_css():
288
  Generate CSS for the UI based on current theme
289
 
290
  Returns:
291
- CSS string ready to use in Gradio
292
  """
293
  return f"""
294
  .gradio-container .prose {{
@@ -336,102 +484,73 @@ def get_formatted_css():
336
 
337
  def set_theme(theme_name):
338
  """
339
- Set the UI theme (light or dark)
340
 
341
  Args:
342
- theme_name: 'light' or 'dark'
343
 
344
  Returns:
345
- None - updates global variables
346
  """
347
  global THEME, COLORS, STYLES
348
-
349
  if theme_name in THEMES:
350
  THEME = theme_name
351
  COLORS = THEMES[THEME]
352
-
353
- # Update styles with new theme colors
354
- STYLES.update({
355
  "card": f"margin-bottom: 20px; border: 1px solid {COLORS['card_border']}; border-radius: 8px; overflow: hidden; background-color: {COLORS['card_bg']};",
356
  "header": f"background-color: {COLORS['header_bg']}; padding: 12px 15px; border-bottom: 1px solid {COLORS['card_border']};",
357
  "header_text": f"margin: 0; font-size: 18px; color: {COLORS['header_text']};",
 
358
  "method_container": f"flex: 1; min-width: 200px; padding: 15px; border-right: 1px solid {COLORS['card_border']};",
359
  "method_title": f"margin-top: 0; color: {COLORS['text_primary']}; padding-bottom: 8px;",
 
 
 
360
  "info_panel": f"padding: 10px; background-color: {COLORS['section_bg']}; margin-bottom: 10px; border-radius: 4px;"
361
- })
 
 
362
 
363
- def format_result_item(result):
364
- """Format a single result item with confidence badge.
365
-
366
- Args:
367
- result: Tuple containing (name, score) or (id, name, score)
368
-
369
- Returns:
370
- HTML string for the result item or None if invalid format
371
  """
372
- # Handle both 2-tuple and 3-tuple formats
373
- if len(result) == 3:
374
- category_id, category_name, score = result
375
- display_text = f"<strong>{category_id}</strong>: {category_name}"
376
- elif len(result) == 2:
377
- display_text, score = result
378
- else:
379
- return None # Skip any invalid formats
380
-
381
- confidence_percent = int(score * 100)
382
- confidence_color = get_confidence_color(score)
383
- bg_color = get_confidence_bg_color(score)
384
-
385
- item_html = f"<li style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 4px;'>"
386
- item_html += f"<span style='font-weight: 500; flex: 1;'>{display_text}</span>"
387
- item_html += f"<span style='background-color: {bg_color}; border: 1px solid {confidence_color}; color: #000; font-weight: 600; padding: 2px 6px; border-radius: 4px; min-width: 70px; text-align: center; margin-left: 8px;'>Confidence: {confidence_percent}%</span>"
388
- item_html += "</li>"
389
-
390
- return item_html
391
-
392
- def format_info_panel(title, content):
393
- """Format an information panel with title and content.
394
 
395
  Args:
396
- title: Panel title
397
- content: Panel content (HTML or text)
 
 
398
 
399
  Returns:
400
- HTML string for the info panel
401
  """
402
- panel_html = f"<div style='{STYLES['info_panel']}'>"
403
- panel_html += f"<h4 style='margin-top: 0; border-bottom: 1px solid rgba(0,0,0,0.1); padding-bottom: 8px;'>{title}</h4>"
404
- panel_html += f"<p style='margin-bottom: 8px;'>{content}</p>"
405
- panel_html += "</div>"
406
-
407
- return panel_html
408
-
409
- def format_results_section(results, section_title, match_type="ingredients"):
410
- """Format a results section with title and result items.
411
 
412
- Args:
413
- results: List of result tuples
414
- section_title: Title for the results section
415
- match_type: Type of matching used (for color styling)
416
 
417
- Returns:
418
- HTML string for the results section
419
- """
420
- color_hex = METHOD_COLORS.get(match_type, "#1abc9c")
421
-
422
- section_html = f"<div class='method-results' style='margin-top: 15px; border-left: 3px solid {color_hex}; padding-left: 15px;'>"
423
- section_html += f"<h4 style='margin-top: 0; color: {color_hex};'>{section_title}</h4>"
 
 
 
 
 
 
424
 
425
- if results:
426
- section_html += "<ul style='margin-top: 5px; padding-left: 20px;'>"
427
- for result in results:
428
- item_html = format_result_item(result)
429
- if item_html:
430
- section_html += item_html
431
- section_html += "</ul>"
432
- else:
433
- section_html += "<p style='color: #777; font-style: italic; margin: 5px 0;'>No matches found above confidence threshold.</p>"
434
-
435
- section_html += "</div>"
436
 
437
- return section_html
 
190
  # Create the full card with the methods content
191
  return format_result_card(title=product, content=methods_html)
192
 
193
def format_reranking_results_html(results, match_type="ingredients", show_scores=True, include_explanation=False,
                                  method="voyage", confidence_threshold=0.0):
    """
    Render reranking results (Voyage or OpenAI) as one HTML results container.

    Each result dictionary becomes one card. Products whose "matching_items"
    list is empty are skipped, although they still count towards the number
    shown in the header.

    Args:
        results: List of result dictionaries. The keys read here are
            "product_name", "matching_items", "item_scores", "confidence"
            and "explanation".
        match_type: Either "ingredients" or "categories"
        show_scores: Accepted for interface compatibility; not consulted here
        include_explanation: Whether to render each result's "explanation"
            as an expanded description
        method: Method used for ranking ("voyage" or "openai")
        confidence_threshold: Threshold forwarded to the per-product formatter
            for filtering individual items (default 0.0 shows all)

    Returns:
        HTML string for displaying results
    """
    if not results:
        return f"No {match_type.lower()} matches found."

    method_name = METHOD_NAMES.get(method, method.capitalize())
    header_text = f"Matched {len(results)} products to {match_type} using {method_name}"

    html_elements = []
    for result in results:
        product_name = result.get("product_name", "")
        matching_items = result.get("matching_items", [])
        item_scores = result.get("item_scores", [])
        explanation = result.get("explanation", "") if include_explanation else ""

        # If the scores do not line up with the items, fall back to the
        # product-level confidence for every item.
        if len(item_scores) != len(matching_items):
            item_scores = [result.get("confidence", 0.5)] * len(matching_items)

        # Rebuild the tuples expected by the per-product formatters.
        formatted_matches = []
        for item, score in zip(matching_items, item_scores):
            if match_type == "categories" and ":" in item:
                # Category entries arrive as "id: description"
                cat_id, _, cat_text = item.partition(":")
                formatted_matches.append((cat_id.strip(), cat_text.strip(), score))
            else:
                formatted_matches.append((item, score))

        # Only skip if there are no matches at all
        if not formatted_matches:
            continue

        if include_explanation:
            element_html = format_expanded_results_html(
                product=product_name,
                results=formatted_matches,
                expanded_description=explanation,
                match_type=match_type,
                confidence_threshold=confidence_threshold
            )
        else:
            summary_text = f"{match_type.capitalize()} matches using {method_name}."
            element_html = format_hybrid_results_html(
                product=product_name,
                results=formatted_matches,
                summary=summary_text,
                expanded_description="",
                confidence_threshold=confidence_threshold
            )

        html_elements.append(element_html)

    # Combine all elements into a container
    return create_results_container(html_elements, header_text=header_text)
279
 
280
  def create_results_container(html_elements, header_text=None):
281
  """
 
298
 
299
  return container
300
 
301
def filter_results_by_threshold(results, confidence_threshold=0.0):
    """
    Keep only the results whose score reaches the confidence threshold.

    Args:
        results: Iterable of result rows, either (match, score) or
            (id, text, score). Rows may be tuples or lists; the score is
            always the last element. Anything else (or a row with fewer
            than two elements) is treated as having a score of 0.0.
            None is treated as an empty list.
        confidence_threshold: Minimum score (inclusive) a row must have

    Returns:
        New list with the rows that passed, in their original order
    """
    def _score(item):
        # The score sits in the last slot of both the 2- and 3-value shapes
        if isinstance(item, (tuple, list)) and len(item) >= 2:
            return item[-1]
        return 0.0

    return [item for item in (results or []) if _score(item) >= confidence_threshold]
311
+
312
def parse_result_item(item):
    """
    Split a result row into the text to display and its score.

    Args:
        item: A (match, score) or (id, text, score) row, as a tuple or list.
            Any other value is rendered with str() and given a score of 0.0.

    Returns:
        Tuple of (display_text, score). For three-value rows the text is
        "id: text", or just the id when the text is empty.
    """
    if isinstance(item, (tuple, list)):
        if len(item) == 2:
            match, score = item
            return match, score
        if len(item) == 3:
            cat_id, cat_text, score = item
            return (f"{cat_id}: {cat_text}" if cat_text else cat_id), score

    # Unknown shape: show it verbatim and give it no confidence
    return str(item), 0.0
329
+
330
def format_expanded_results_html(product, results, expanded_description, match_type="ingredients", confidence_threshold=0.0):
    """
    Format one product's matches together with its expanded description.

    The expanded description and every match label are HTML-escaped before
    being embedded, because they are free text rather than markup. The
    product name is handed to format_result_card unchanged.

    Args:
        product: Product name, used as the card title
        results: List of result tuples (name, score) or (id, name, score)
        expanded_description: Text shown in the "Expanded Description" panel
        match_type: "ingredients" selects the "Ingredients" heading; any
            other value selects "Categories". Also used to look up the
            accent colour in METHOD_COLORS.
        confidence_threshold: Threshold for filtering individual items

    Returns:
        HTML string for the product card
    """
    from html import escape  # local import: the block needs nothing else from the module header

    color_hex = METHOD_COLORS.get(match_type, "#1abc9c")
    title_text = "Ingredients" if match_type == "ingredients" else "Categories"

    parts = [
        # Expanded description panel
        f"<div style='{STYLES['info_panel']}'>",
        "<h4 style='margin-top: 0; border-bottom: 1px solid rgba(0,0,0,0.1); padding-bottom: 8px;'>Expanded Description</h4>",
        f"<p style='margin-bottom: 8px;'>{escape(str(expanded_description))}</p>",
        "</div>",
        # Results section with its coloured heading
        f"<div class='method-results' style='margin-top: 15px; border-left: 3px solid {color_hex}; padding-left: 15px;'>",
        f"<h4 style='margin-top: 0; color: {color_hex};'>{title_text}</h4>",
    ]

    filtered_results = filter_results_by_threshold(results, confidence_threshold)

    if filtered_results:
        parts.append("<ul style='margin-top: 5px; padding-left: 20px;'>")
        for item in filtered_results:
            display_text, score = parse_result_item(item)
            # round() rather than int(): int(0.57 * 100) would show 56%
            confidence_percent = int(round(score * 100))
            confidence_color = get_confidence_color(score)
            bg_color = get_confidence_bg_color(score)
            parts.append("<li style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 4px;'>")
            parts.append(f"<span style='font-weight: 500; flex: 1;'>{escape(str(display_text))}</span>")
            parts.append(f"<span style='background-color: {bg_color}; border: 1px solid {confidence_color}; color: #000; font-weight: 600; padding: 2px 6px; border-radius: 4px; min-width: 70px; text-align: center; margin-left: 8px;'>Confidence: {confidence_percent}%</span>")
            parts.append("</li>")
        parts.append("</ul>")
    else:
        parts.append("<p style='color: #777; font-style: italic; margin: 5px 0;'>No matches found above confidence threshold.</p>")

    parts.append("</div>")

    return format_result_card(title=product, content="".join(parts))
370
+
371
def format_hybrid_results_html(product, results, summary, expanded_description="", confidence_threshold=0.0):
    """
    Format results for hybrid matching as a card containing a results table.

    The expanded description and every match label are HTML-escaped before
    being embedded. The summary is inserted as given, and the product name
    is handed to format_result_card unchanged.

    Args:
        product: Product name, used as the card title
        results: List of result tuples (name, score) or (id, name, score)
        summary: Summary text to display; omitted when empty
        expanded_description: Optional expanded description; omitted when empty
        confidence_threshold: Threshold for filtering individual items

    Returns:
        HTML string for displaying results
    """
    from html import escape  # local import: the block needs nothing else from the module header

    parts = []

    # Summary panel
    if summary:
        parts.append(f"<div style='{STYLES['info_panel']}'>")
        parts.append(f"<p style='margin: 0;'>{summary}</p>")
        parts.append("</div>")

    # Expanded description panel
    if expanded_description:
        parts.append(f"<div style='{STYLES['info_panel']}'>")
        parts.append("<h4 style='margin-top: 0; margin-bottom: 8px; border-bottom: 1px solid rgba(0,0,0,0.1); padding-bottom: 5px;'>Expanded Description</h4>")
        parts.append(f"<p style='margin: 0;'>{escape(str(expanded_description))}</p>")
        parts.append("</div>")

    filtered_results = filter_results_by_threshold(results, confidence_threshold)

    if filtered_results:
        parts.append("<div style='padding: 10px;'>")
        parts.append("<table style='width: 100%; border-collapse: collapse;'>")
        parts.append("<thead><tr>")
        parts.append("<th style='text-align: left; padding: 8px; border-bottom: 2px solid #ddd;'>Match</th>")
        parts.append("<th style='text-align: right; padding: 8px; border-bottom: 2px solid #ddd; width: 100px;'>Confidence</th>")
        parts.append("</tr></thead>")
        parts.append("<tbody>")

        for item in filtered_results:
            display_text, score = parse_result_item(item)
            # round() rather than int(): int(0.57 * 100) would show 56%
            confidence_percent = int(round(score * 100))
            confidence_color = get_confidence_color(score)
            bg_color = get_confidence_bg_color(score)

            parts.append("<tr>")
            parts.append(f"<td style='text-align: left; padding: 8px; border-bottom: 1px solid #ddd;'>{escape(str(display_text))}</td>")
            parts.append("<td style='text-align: center; padding: 8px; border-bottom: 1px solid #ddd;'>")
            parts.append(f"<span style='background-color: {bg_color}; border: 1px solid {confidence_color}; color: #000;")
            parts.append("font-weight: 600; padding: 2px 6px; border-radius: 4px; display: inline-block; width: 70px;'>")
            parts.append(f"{confidence_percent}%</span></td>")
            parts.append("</tr>")

        parts.append("</tbody></table>")
        parts.append("</div>")
    else:
        parts.append("<p style='color: #777; font-style: italic; padding: 10px; margin: 0;'>No matches found above confidence threshold.</p>")

    return format_result_card(title=product, content="".join(parts))
433
 
 
436
  Generate CSS for the UI based on current theme
437
 
438
  Returns:
439
+ CSS string for styling the UI
440
  """
441
  return f"""
442
  .gradio-container .prose {{
 
484
 
485
def set_theme(theme_name):
    """
    Update the global theme setting

    THEME and COLORS are rebound to the selected theme. STYLES is updated
    in place rather than rebound, so code that obtained the dictionary with
    ``from ... import STYLES`` keeps seeing the current styles; keys not
    listed below are left untouched.

    Args:
        theme_name: Theme name to set; must be a key of THEMES

    Returns:
        Boolean indicating success (False when the theme is unknown, in
        which case nothing is changed)
    """
    global THEME, COLORS
    if theme_name not in THEMES:
        return False

    THEME = theme_name
    COLORS = THEMES[THEME]
    # Update styles with new colors
    STYLES.update({
        "card": f"margin-bottom: 20px; border: 1px solid {COLORS['card_border']}; border-radius: 8px; overflow: hidden; background-color: {COLORS['card_bg']};",
        "header": f"background-color: {COLORS['header_bg']}; padding: 12px 15px; border-bottom: 1px solid {COLORS['card_border']};",
        "header_text": f"margin: 0; font-size: 18px; color: {COLORS['header_text']};",
        "flex_container": "display: flex; flex-wrap: wrap;",
        "method_container": f"flex: 1; min-width: 200px; padding: 15px; border-right: 1px solid {COLORS['card_border']};",
        "method_title": f"margin-top: 0; color: {COLORS['text_primary']}; padding-bottom: 8px;",
        "item_list": "list-style-type: none; padding-left: 0;",
        "item": "margin-bottom: 8px; padding: 8px; border-radius: 4px;",
        "empty_message": "color: #7f8c8d; font-style: italic;",
        "info_panel": f"padding: 10px; background-color: {COLORS['section_bg']}; margin-bottom: 10px; border-radius: 4px;",
    })
    return True
514
 
515
def format_categories_html(product, categories, chicory_result=None, header_color=None):
    """
    Format category matching results as HTML

    When a Chicory result is supplied it is shown in its own panel above
    the categories; the ingredient name is HTML-escaped before embedding.

    Args:
        product: Product name, used as the card title
        categories: List of (category, score) tuples
        chicory_result: Optional chicory parser result for the product. A
            dict is read for its "ingredient" and "confidence" keys; any
            other truthy value produces a "No Chicory results available"
            note.
        header_color: Optional colour for the category section; defaults to
            the "categories" entry of METHOD_COLORS

    Returns:
        HTML string
    """
    from html import escape  # local import: the block needs nothing else from the module header

    parts = []

    # Add Chicory results if available
    if chicory_result:
        parts.append(f"<div style='{STYLES['info_panel']}'>")
        parts.append("<h4 style='margin-top: 0; border-bottom: 1px solid rgba(0,0,0,0.1); padding-bottom: 8px;'>Chicory Parser Results</h4>")

        if isinstance(chicory_result, dict):
            ingredient = chicory_result.get("ingredient", "Not found")
            # A present-but-None confidence is treated as 0 instead of crashing
            confidence = chicory_result.get("confidence", 0) or 0
            # round() rather than int(): int(0.57 * 100) would show 56%
            confidence_percent = int(round(confidence * 100))

            parts.append("<div style='display: flex; justify-content: space-between; align-items: center; padding: 8px; border-radius: 4px;'>")
            parts.append(f"<span style='font-weight: bold;'>{escape(str(ingredient))}</span>")
            parts.append(f"<span style='background-color: {get_confidence_bg_color(confidence)}; border: 1px solid {get_confidence_color(confidence)}; color: #000; font-weight: 600; padding: 2px 6px; border-radius: 4px; min-width: 70px; text-align: center;'>Confidence: {confidence_percent}%</span>")
            parts.append("</div>")
        else:
            parts.append(f"<p style='{STYLES['empty_message']}'>No Chicory results available</p>")

        parts.append("</div>")

    # Add the category results
    parts.append(format_method_results(
        method_key="categories",
        results=categories,
        color_hex=header_color or METHOD_COLORS.get("categories", "#1abc9c")
    ))

    return format_result_card(title=product, content="".join(parts))
ui_hybrid_matching.py CHANGED
@@ -3,7 +3,7 @@ from utils import SafeProgress
3
  from category_matching import load_categories, hybrid_category_matching
4
  from similarity import hybrid_ingredient_matching, compute_similarities
5
  from ui_core import embeddings, parse_input
6
- from ui_formatters import format_hybrid_results_html, create_results_container
7
  from openai_expansion import expand_product_descriptions
8
  from api_utils import get_voyage_client
9
 
@@ -12,16 +12,6 @@ def categorize_products_with_voyage_reranking(product_input, is_file=False, use_
12
  match_type="categories", progress=gr.Progress()):
13
  """
14
  Categorize products using Voyage reranking with optional description expansion
15
-
16
- Args:
17
- product_input: Text input with product names
18
- is_file: Whether the input is a file
19
- use_expansion: Whether to use AI description expansion (boolean switch)
20
- embedding_top_n: Number of embedding candidates to consider
21
- final_top_n: Final number of results to return
22
- confidence_threshold: Minimum confidence threshold
23
- match_type: Either "ingredients" or "categories"
24
- progress: Progress tracking object
25
  """
26
  progress_tracker = SafeProgress(progress)
27
  progress_tracker(0, desc=f"Starting Voyage reranking for {match_type}...")
@@ -49,7 +39,7 @@ def categorize_products_with_voyage_reranking(product_input, is_file=False, use_
49
  product_names, categories,
50
  embedding_top_n=int(embedding_top_n),
51
  final_top_n=int(final_top_n),
52
- confidence_threshold=confidence_threshold,
53
  expanded_descriptions=expanded_descriptions if use_expansion else None,
54
  progress=progress
55
  )
@@ -64,7 +54,7 @@ def categorize_products_with_voyage_reranking(product_input, is_file=False, use_
64
  product_names, embeddings,
65
  embedding_top_n=int(embedding_top_n),
66
  final_top_n=int(final_top_n),
67
- confidence_threshold=confidence_threshold,
68
  expanded_descriptions=expanded_descriptions if use_expansion else None,
69
  progress=progress
70
  )
@@ -72,31 +62,55 @@ def categorize_products_with_voyage_reranking(product_input, is_file=False, use_
72
  # Format results
73
  progress_tracker(0.9, desc="Formatting results...")
74
 
75
- result_elements = []
 
76
  for product, matches in match_results.items():
77
- # Include the expanded description in the results if used
78
- expanded_text = expanded_descriptions.get(product, "") if use_expansion else ""
79
- result_elements.append(
80
- format_hybrid_results_html(
81
- product=product,
82
- results=matches,
83
- summary=f"{match_type.capitalize()} matches using Voyage AI reranking.",
84
- expanded_description=expanded_text
85
- )
86
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
 
88
- output_html = create_results_container(
89
- result_elements,
90
- header_text=f"Matched {len(product_names)} products to {match_type} using Voyage AI reranking."
 
 
 
 
91
  )
92
 
93
- if not match_results:
94
- output_html = "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"
95
-
96
  progress_tracker(1.0, desc="Done!")
97
- return output_html
98
 
99
- # Add this function for Voyage reranking
100
  def hybrid_ingredient_matching_voyage(products, ingredients_dict,
101
  embedding_top_n=20, final_top_n=5,
102
  confidence_threshold=0.5,
@@ -152,13 +166,13 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
152
  model="rerank-2"
153
  )
154
 
155
- # Process results
156
  voyage_results = []
157
  for result in reranked["results"]:
158
  score = result["relevance_score"]
159
- if score >= confidence_threshold:
160
- voyage_results.append((result["document"]["text"], score))
161
 
 
162
  final_results[product] = voyage_results[:final_top_n]
163
 
164
  except Exception as e:
@@ -167,4 +181,4 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
167
  final_results[product] = candidates[:1]
168
 
169
  progress_tracker(1.0, desc="Voyage ingredient matching complete")
170
- return final_results
 
3
  from category_matching import load_categories, hybrid_category_matching
4
  from similarity import hybrid_ingredient_matching, compute_similarities
5
  from ui_core import embeddings, parse_input
6
+ from ui_formatters import format_hybrid_results_html, create_results_container, format_reranking_results_html
7
  from openai_expansion import expand_product_descriptions
8
  from api_utils import get_voyage_client
9
 
 
12
  match_type="categories", progress=gr.Progress()):
13
  """
14
  Categorize products using Voyage reranking with optional description expansion
 
 
 
 
 
 
 
 
 
 
15
  """
16
  progress_tracker = SafeProgress(progress)
17
  progress_tracker(0, desc=f"Starting Voyage reranking for {match_type}...")
 
39
  product_names, categories,
40
  embedding_top_n=int(embedding_top_n),
41
  final_top_n=int(final_top_n),
42
+ confidence_threshold=0.0, # Don't apply threshold here - do it in display
43
  expanded_descriptions=expanded_descriptions if use_expansion else None,
44
  progress=progress
45
  )
 
54
  product_names, embeddings,
55
  embedding_top_n=int(embedding_top_n),
56
  final_top_n=int(final_top_n),
57
+ confidence_threshold=0.0, # Don't apply threshold here - do it in display
58
  expanded_descriptions=expanded_descriptions if use_expansion else None,
59
  progress=progress
60
  )
 
62
  # Format results
63
  progress_tracker(0.9, desc="Formatting results...")
64
 
65
+ # Convert to unified format for formatter
66
+ formatted_results = []
67
  for product, matches in match_results.items():
68
+ # Include all products, even with no matches
69
+ formatted_result = {
70
+ "product_name": product,
71
+ "confidence": max([item[-1] for item in matches]) if matches else 0,
72
+ "matching_items": [],
73
+ "item_scores": [],
74
+ "explanation": expanded_descriptions.get(product, "") if use_expansion else ""
75
+ }
76
+
77
+ # Format matching items based on match type
78
+ if match_type == "ingredients":
79
+ # Extract ingredient names and scores
80
+ formatted_result["matching_items"] = [item[0] for item in matches]
81
+ formatted_result["item_scores"] = [item[1] for item in matches]
82
+ else: # categories
83
+ for match in matches:
84
+ if len(match) >= 2:
85
+ cat_id = match[0]
86
+ # Some category matches might include a text description
87
+ cat_text = match[1] if len(match) > 2 else ""
88
+ score = match[-1]
89
+ if isinstance(cat_text, (int, float)): # This is not text but a score
90
+ cat_text = ""
91
+ formatted_result["matching_items"].append(
92
+ f"{cat_id}: {cat_text}" if cat_text else f"{cat_id}"
93
+ )
94
+ formatted_result["item_scores"].append(score)
95
+
96
+ formatted_results.append(formatted_result)
97
+
98
+ if not formatted_results:
99
+ return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"
100
 
101
+ result_html = format_reranking_results_html(
102
+ results=formatted_results,
103
+ match_type=match_type,
104
+ show_scores=True,
105
+ include_explanation=use_expansion,
106
+ method="voyage",
107
+ confidence_threshold=confidence_threshold # Pass the threshold to the formatter
108
  )
109
 
 
 
 
110
  progress_tracker(1.0, desc="Done!")
111
+ return result_html
112
 
113
+ # Ingredient matching that reranks embedding candidates with Voyage AI
114
  def hybrid_ingredient_matching_voyage(products, ingredients_dict,
115
  embedding_top_n=20, final_top_n=5,
116
  confidence_threshold=0.5,
 
166
  model="rerank-2"
167
  )
168
 
169
+ # Process results - include all results but keep the threshold for later filtering
170
  voyage_results = []
171
  for result in reranked["results"]:
172
  score = result["relevance_score"]
173
+ voyage_results.append((result["document"]["text"], score))
 
174
 
175
+ # Still limit to final_top_n but don't filter by threshold here
176
  final_results[product] = voyage_results[:final_top_n]
177
 
178
  except Exception as e:
 
181
  final_results[product] = candidates[:1]
182
 
183
  progress_tracker(1.0, desc="Voyage ingredient matching complete")
184
+ return final_results