esilver committed on
Commit 5e72e96 · 1 Parent(s): c15c118
Files changed (2)
  1. ui_expanded_matching.py +123 -163
  2. ui_ingredient_matching.py +12 -11
ui_expanded_matching.py CHANGED
@@ -8,9 +8,10 @@ from ui_formatters import format_reranking_results_html
  from api_utils import get_openai_client, process_in_parallel, rank_ingredients_openai, rank_categories_openai
  from category_matching import load_categories, load_category_embeddings
  import json

  def categorize_products_with_openai_reranking(product_input, is_file=False, use_expansion=False,
-                                               embedding_top_n=20, top_n=10, confidence_threshold=0.5,
                                                match_type="ingredients"):  # Removed progress parameter
      """
      Categorize products using OpenAI reranking with optional description expansion
@@ -22,133 +23,119 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, use_
      product_names, error = parse_input(product_input, is_file)
      if error:
          return error
-
      # Validate embeddings are loaded if doing ingredient matching
      if match_type == "ingredients" and not embeddings:
          return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"
      # Optional description expansion
      expanded_descriptions = {}
      if use_expansion:
          # progress_tracker(0.2, desc="Expanding product descriptions...") # Removed progress
-         expanded_descriptions = expand_product_descriptions(product_names)  # Removed progress argument
-
      # Get shared OpenAI client
      openai_client = get_openai_client()
-
-     products_for_embedding = ''

-     if match_type == "ingredients":
-         # Generate product embeddings
-         # progress_tracker(0.4, desc="Generating product embeddings...") # Removed progress
-         if use_expansion and expanded_descriptions:
-             # Use expanded descriptions for embedding creation when available
-             products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
-             # Map expanded descriptions back to original product names for consistent keys
-             product_embeddings = {}
-             temp_embeddings = create_product_embeddings(products_for_embedding, original_products=product_names)  # Removed progress, pass original names
-
-             # Ensure we use original product names as keys
-             for i, product_name in enumerate(product_names):
-                 if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
-                     product_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
-         else:
-             # Standard embedding creation with just product names
-             product_embeddings = create_product_embeddings(product_names)  # Removed progress
-
-         # Compute embedding similarities for ingredients
-         # progress_tracker(0.6, desc="Computing ingredient similarities...") # Removed progress
-         all_similarities = compute_similarities(embeddings, product_embeddings)
-
-         print(f"product_names: {product_names}")
-         print(f"products_for_embedding: {products_for_embedding}")
-         # print(f"all_similarities: {all_similarities}")
          if not all_similarities:
              return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No similarities found. Please try different product names.</div>"
-
-         # progress_tracker(0.7, desc="Re-ranking with OpenAI...") # Removed progress
-
-         # Function for processing each product
-         def process_reranking(product):
-             if product not in all_similarities:
-                 return product, []
-
              candidates = all_similarities[product][:embedding_top_n]
-             if not candidates:
-                 return product, []
-
              candidate_ingredients = [c[0] for c in candidates]
              expanded_text = expanded_descriptions.get(product, product) if use_expansion else product
-
              try:
-                 # Use the shared utility function - now passing 0.0 as threshold to get all results
-                 # We'll apply the threshold at display time
                  reranked_ingredients = rank_ingredients_openai(
-                     product=product,
-                     candidates=candidate_ingredients,
-                     expanded_description=expanded_text,
-                     client=openai_client,
-                     model="gpt-4o-mini",
-                     max_results=top_n,
-                     confidence_threshold=0.0,  # Don't filter here, do it at display time
-                     debug=True
                  )
-
                  return product, reranked_ingredients
-
              except Exception as e:
-                 print(f"Error reranking {product}: {e}")
-                 # Fall back to top embedding match
-                 return product, candidates[:1]  # Don't filter here
-
          # Process all products in parallel
          final_results = process_in_parallel(
-             items=product_names,
-             processor_func=process_reranking,
-             max_workers=min(10, len(product_names))  # Moved max_workers inside
-             # Removed progress tracking arguments
-         )  # Corrected closing parenthesis
-
-     else:  # categories
-         # Load category embeddings instead of JSON categories
-         # progress_tracker(0.5, desc="Loading category embeddings...") # Removed progress
-         category_embeddings = load_category_embeddings()
-
-         if not category_embeddings:
-             return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No category embeddings found. Please check that the embeddings file exists at data/category_embeddings.pickle.</div>"
-
-         # Generate product embeddings
-         # progress_tracker(0.6, desc="Generating product embeddings...") # Removed progress
-         if use_expansion and expanded_descriptions:
-             # Use expanded descriptions for embedding creation when available
-             products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
-             # Map expanded descriptions back to original product names for consistent keys
-             product_embeddings = {}
-             temp_embeddings = create_product_embeddings(products_for_embedding, original_products=product_names)  # Removed progress, pass original names
-
-             # Ensure we use original product names as keys
-             for i, product_name in enumerate(product_names):
-                 if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
-                     product_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
-         else:
-             # Standard embedding creation with just product names
-             product_embeddings = create_product_embeddings(product_names)  # Removed progress
-
-         # Compute embedding similarities for categories
-         # progress_tracker(0.7, desc="Computing category similarities...") # Removed progress
-         all_similarities = compute_similarities(category_embeddings, product_embeddings)
-
-         if not all_similarities:
-             return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No category similarities found. Please try different product names.</div>"
-
-         # Collect all needed category IDs first - don't filter by threshold here
          needed_category_ids = set()
          for product, similarities in all_similarities.items():
              for category_id, score in similarities[:embedding_top_n]:
                  needed_category_ids.add(category_id)
-
-         # Load only the needed categories from JSON
-         # progress_tracker(0.75, desc="Loading category descriptions...") # Removed progress
          category_descriptions = {}
          if needed_category_ids:
              try:
@@ -158,101 +145,74 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, use_
                      if item["id"] in needed_category_ids:
                          category_descriptions[item["id"]] = item["text"]
              except Exception as e:
-                 print(f"Error loading category descriptions: {e}")
-
-         # Function to process each product
-         def process_category_matching(product):
-             if product not in all_similarities:
-                 return product, []
-
-             candidates = all_similarities[product][:embedding_top_n]
-             print(f"candidates: {candidates}")
-
-             if not candidates:
-                 return product, []
-
-             # Get the expanded description or use product name if no expansion
              expanded_text = expanded_descriptions.get(product, product) if use_expansion else product
-
              try:
-                 # FIXED: Filter categories to only include those in the current product's candidates
-                 product_category_ids = [cat_id for cat_id, _ in candidates]
-                 filtered_categories = {cat_id: category_descriptions[cat_id]
-                                        for cat_id in product_category_ids
-                                        if cat_id in category_descriptions}
-
-                 # Pass 0.0 as threshold to get all results - apply threshold at display time
                  category_matches = rank_categories_openai(
-                     product=product,
-                     categories=filtered_categories,  # Pass only this product's relevant categories
-                     expanded_description=expanded_text,
-                     client=openai_client,
-                     model="gpt-4o-mini",
-                     max_results=top_n,
-                     confidence_threshold=0.0,  # Don't filter here
-                     debug=True
                  )
-
-                 # Format results with category descriptions if needed
                  formatted_matches = []
                  for category_id, score in category_matches:
                      category_text = category_descriptions.get(category_id, "Unknown category")
                      formatted_matches.append((category_id, category_text, score))
-
                  return product, formatted_matches
              except Exception as e:
-                 print(f"Error matching {product} to categories: {e}")
-                 return product, []
-
          # Process all products in parallel
          final_results = process_in_parallel(
-             items=product_names,
-             processor_func=process_category_matching,
-             max_workers=min(10, len(product_names))  # Restored max_workers inside the call
-             # Removed progress tracking arguments
-         )  # Correctly placed closing parenthesis
-
-     # Format results
-     # progress_tracker(0.9, desc="Formatting results...") # Removed progress
-
-     # Create a list of result dictionaries in consistent format
      formatted_results = []
-
      for product, matches in final_results.items():
-         # Include all products, even with no matches
          formatted_result = {
              "product_name": product,
              "confidence": max([item[-1] for item in matches]) if matches else 0,
              "matching_items": [],
-             "item_scores": [],  # Add item_scores to align with Voyage implementation
              "explanation": expanded_descriptions.get(product, "") if use_expansion else ""
          }
-
-         # Format matching items based on match type
          if match_type == "ingredients":
              formatted_result["matching_items"] = [item for item, score in matches]
              formatted_result["item_scores"] = [score for item, score in matches]
-         else:  # categories
              for cat_id, cat_desc, score in matches:
-                 formatted_result["matching_items"].append(
-                     f"{cat_id}: {cat_desc}" if cat_desc else f"{cat_id}"
-                 )
                  formatted_result["item_scores"].append(score)
-
          formatted_results.append(formatted_result)
-
      if not formatted_results:
-         return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"
-
      result_html = format_reranking_results_html(
          results=formatted_results,
          match_type=match_type,
          show_scores=True,
          include_explanation=use_expansion,
          method="openai",
-         confidence_threshold=confidence_threshold  # Pass the threshold to the formatter
      )
-
-     # progress_tracker(1.0, desc="Done!") # Removed progress
      return result_html
  from api_utils import get_openai_client, process_in_parallel, rank_ingredients_openai, rank_categories_openai
  from category_matching import load_categories, load_category_embeddings
  import json
+ import traceback  # Import traceback for detailed error logging

  def categorize_products_with_openai_reranking(product_input, is_file=False, use_expansion=False,
+                                               embedding_top_n=20, top_n=10, confidence_threshold=0.5,
                                                match_type="ingredients"):  # Removed progress parameter
      """
      Categorize products using OpenAI reranking with optional description expansion

      product_names, error = parse_input(product_input, is_file)
      if error:
          return error
+
      # Validate embeddings are loaded if doing ingredient matching
      if match_type == "ingredients" and not embeddings:
          return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"
+
      # Optional description expansion
      expanded_descriptions = {}
      if use_expansion:
          # progress_tracker(0.2, desc="Expanding product descriptions...") # Removed progress
+         try:
+             expanded_descriptions = expand_product_descriptions(product_names)  # Removed progress argument
+         except Exception as e:
+             print(f"ERROR during description expansion: {e}")
+             print(traceback.format_exc())
+             return f"<div style='color: red;'>Error during description expansion: {e}</div>"
+
+
      # Get shared OpenAI client
      openai_client = get_openai_client()

+     product_embeddings = {}  # Initialize here for broader scope
+     all_similarities = {}  # Initialize here
+
+     try:  # Wrap embedding generation and similarity computation
+         if match_type == "ingredients":
+             # --- Ingredient Matching Logic ---
+             # Generate product embeddings
+             if use_expansion and expanded_descriptions:
+                 products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
+                 temp_embeddings = create_product_embeddings(products_for_embedding, original_products=product_names)
+                 # Correctly map using original product names as keys
+                 for product_name in product_names:
+                     if product_name in temp_embeddings:
+                         product_embeddings[product_name] = temp_embeddings[product_name]
+             else:
+                 product_embeddings = create_product_embeddings(product_names)
+
+             # Check if embeddings were successfully generated/mapped
+             if not product_embeddings:
+                 return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings for ingredients. Please try again.</div>"

+             # Compute embedding similarities for ingredients
+             all_similarities = compute_similarities(embeddings, product_embeddings)
+
+         else:  # categories
+             # --- Category Matching Logic ---
+             category_embeddings = load_category_embeddings()
+             if not category_embeddings:
+                 return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No category embeddings found. Please check 'data/category_embeddings.pickle'.</div>"
+
+             # Generate product embeddings
+             if use_expansion and expanded_descriptions:
+                 products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
+                 temp_embeddings = create_product_embeddings(products_for_embedding, original_products=product_names)
+                 # Correctly map using original product names as keys
+                 for product_name in product_names:
+                     if product_name in temp_embeddings:
+                         product_embeddings[product_name] = temp_embeddings[product_name]
+             else:
+                 product_embeddings = create_product_embeddings(product_names)
+
+             # Check if embeddings were successfully generated/mapped
+             if not product_embeddings:
+                 return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings for categories. Please try again.</div>"
+
+             # Compute embedding similarities for categories
+             all_similarities = compute_similarities(category_embeddings, product_embeddings)
+
+         # --- Common Logic Post Similarity ---
          if not all_similarities:
+             # This check might be redundant if product_embeddings check catches the issue earlier, but keep for safety
              return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No similarities found. Please try different product names.</div>"
+
+     except Exception as e:  # Catch errors during embedding/similarity
+         print(f"ERROR during embedding generation or similarity computation: {e}")
+         print(traceback.format_exc())
+         return f"<div style='color: red;'>Error during processing: {e}<br><pre>{traceback.format_exc()}</pre></div>"
+
+
+     # --- Reranking Logic ---
+     final_results = {}
+     if match_type == "ingredients":
+         # Function for processing each product (Ingredients)
+         def process_reranking_ingredients(product):
+             if product not in all_similarities: return product, []
              candidates = all_similarities[product][:embedding_top_n]
+             if not candidates: return product, []
              candidate_ingredients = [c[0] for c in candidates]
              expanded_text = expanded_descriptions.get(product, product) if use_expansion else product
              try:
                  reranked_ingredients = rank_ingredients_openai(
+                     product=product, candidates=candidate_ingredients, expanded_description=expanded_text,
+                     client=openai_client, model="gpt-4o-mini", max_results=top_n,
+                     confidence_threshold=0.0, debug=True
                  )
                  return product, reranked_ingredients
              except Exception as e:
+                 print(f"Error reranking ingredients for {product}: {e}")
+                 return product, candidates[:1]  # Fallback
+
          # Process all products in parallel
          final_results = process_in_parallel(
+             items=product_names, processor_func=process_reranking_ingredients,
+             max_workers=min(10, len(product_names))
+         )
+
+     else:  # categories
+         # Load category descriptions needed for reranking
          needed_category_ids = set()
          for product, similarities in all_similarities.items():
              for category_id, score in similarities[:embedding_top_n]:
                  needed_category_ids.add(category_id)
+
          category_descriptions = {}
          if needed_category_ids:
              try:
                      if item["id"] in needed_category_ids:
                          category_descriptions[item["id"]] = item["text"]
              except Exception as e:
+                 print(f"Error loading category descriptions: {e}")  # Non-fatal, continue without descriptions

+         # Function to process each product (Categories)
+         def process_reranking_categories(product):
+             if product not in all_similarities: return product, []
+             candidates = all_similarities[product][:embedding_top_n]
+             if not candidates: return product, []
+             product_category_ids = [cat_id for cat_id, _ in candidates]
+             filtered_categories = {cat_id: category_descriptions.get(cat_id, f"Category {cat_id}")  # Use get with fallback
+                                    for cat_id in product_category_ids}
              expanded_text = expanded_descriptions.get(product, product) if use_expansion else product
              try:
                  category_matches = rank_categories_openai(
+                     product=product, categories=filtered_categories, expanded_description=expanded_text,
+                     client=openai_client, model="gpt-4o-mini", max_results=top_n,
+                     confidence_threshold=0.0, debug=True
                  )
+                 # Format results with category descriptions
                  formatted_matches = []
                  for category_id, score in category_matches:
                      category_text = category_descriptions.get(category_id, "Unknown category")
                      formatted_matches.append((category_id, category_text, score))
                  return product, formatted_matches
              except Exception as e:
+                 print(f"Error reranking categories for {product}: {e}")
+                 # Fallback: Format top embedding candidates (without reranking score)
+                 fallback_matches = []
+                 for cat_id, score in candidates[:1]:  # Take top 1 embedding match as fallback
+                     category_text = category_descriptions.get(cat_id, "Unknown category")
+                     fallback_matches.append((cat_id, category_text, score))  # Use embedding score
+                 return product, fallback_matches
+
+
          # Process all products in parallel
          final_results = process_in_parallel(
+             items=product_names, processor_func=process_reranking_categories,
+             max_workers=min(10, len(product_names))
+         )
+
+     # --- Format final results ---
      formatted_results = []
      for product, matches in final_results.items():
          formatted_result = {
              "product_name": product,
              "confidence": max([item[-1] for item in matches]) if matches else 0,
              "matching_items": [],
+             "item_scores": [],
              "explanation": expanded_descriptions.get(product, "") if use_expansion else ""
          }
          if match_type == "ingredients":
              formatted_result["matching_items"] = [item for item, score in matches]
              formatted_result["item_scores"] = [score for item, score in matches]
+         else:  # categories
              for cat_id, cat_desc, score in matches:
+                 formatted_result["matching_items"].append(f"{cat_id}: {cat_desc}")
                  formatted_result["item_scores"].append(score)
          formatted_results.append(formatted_result)
+
      if not formatted_results:
+         return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found after processing.</div>"
+
      result_html = format_reranking_results_html(
          results=formatted_results,
          match_type=match_type,
          show_scores=True,
          include_explanation=use_expansion,
          method="openai",
+         confidence_threshold=confidence_threshold
      )
+
      return result_html
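
The central change in this file is the key-mapping fix: when expanded descriptions are embedded, results are now looked up by the original product name instead of by the expanded text, so downstream dictionaries stay keyed consistently. A minimal sketch of that mapping step in isolation, assuming (as the new code does) that create_product_embeddings(texts, original_products=...) returns a dict keyed by the original names:

```python
# Illustrative sketch only; create_product_embeddings is assumed to key its
# result by the names passed via original_products, as the new code expects.
def map_embeddings_to_original_names(product_names, expanded_descriptions, create_product_embeddings):
    """Embed expanded descriptions but keep original product names as dict keys."""
    products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
    temp_embeddings = create_product_embeddings(products_for_embedding,
                                                original_products=product_names)
    # Keep only entries that came back keyed by an original name.
    return {name: temp_embeddings[name] for name in product_names if name in temp_embeddings}
```

The same keying fix is applied in ui_ingredient_matching.py below.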
ui_ingredient_matching.py CHANGED
@@ -35,15 +35,17 @@ def categorize_products(product_input, is_file=False, use_expansion=False, top_n
          # Map expanded descriptions back to original product names for consistent keys
          products_embeddings = {}
          temp_embeddings = create_product_embeddings(products_for_embedding, original_products=product_names)  # Removed progress, pass original names for keys
-
          # Ensure we use original product names as keys
-         for i, product_name in enumerate(product_names):
-             if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
-                 products_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
      else:
          # Standard embedding creation with just product names
          products_embeddings = create_product_embeddings(product_names)  # Removed progress
-
      if not products_embeddings:
          return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"

@@ -58,19 +60,19 @@ def categorize_products(product_input, is_file=False, use_expansion=False, top_n
      # Format results
      progress_tracker(0.9, desc="Formatting results...")
      output_html = f"<p style='color: #555;'>Processing {len(product_names)} products.</p>"
-
      for product, similarities in all_similarities.items():
          filtered_similarities = [(ingredient, score) for ingredient, score in similarities if score >= confidence_threshold]
          top_similarities = filtered_similarities[:int(top_n)]
-
          # Add expansion explanation if available
          expansion_text = expanded_descriptions.get(product, "") if use_expansion else ""
-
          # Debug info for Chicory results
          chicory_data = chicory_results.get(product, [])
          output_html += format_categories_html(
-             product,
-             top_similarities,
              chicory_result=chicory_data,
              explanation=expansion_text,
              match_type="ingredients",
@@ -83,4 +85,3 @@

      progress_tracker(1.0, desc="Done!")
      return output_html  # Return the generated HTML directly
-
          # Map expanded descriptions back to original product names for consistent keys
          products_embeddings = {}
          temp_embeddings = create_product_embeddings(products_for_embedding, original_products=product_names)  # Removed progress, pass original names for keys
+
          # Ensure we use original product names as keys
+         # Corrected loop: Iterate through original names and use them as keys
+         for product_name in product_names:
+             # Check if the original product name exists as a key in the returned embeddings
+             if product_name in temp_embeddings:
+                 products_embeddings[product_name] = temp_embeddings[product_name]
      else:
          # Standard embedding creation with just product names
          products_embeddings = create_product_embeddings(product_names)  # Removed progress
+
      if not products_embeddings:
          return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"

      # Format results
      progress_tracker(0.9, desc="Formatting results...")
      output_html = f"<p style='color: #555;'>Processing {len(product_names)} products.</p>"
+
      for product, similarities in all_similarities.items():
          filtered_similarities = [(ingredient, score) for ingredient, score in similarities if score >= confidence_threshold]
          top_similarities = filtered_similarities[:int(top_n)]
+
          # Add expansion explanation if available
          expansion_text = expanded_descriptions.get(product, "") if use_expansion else ""
+
          # Debug info for Chicory results
          chicory_data = chicory_results.get(product, [])
          output_html += format_categories_html(
+             product,
+             top_similarities,
              chicory_result=chicory_data,
              explanation=expansion_text,
              match_type="ingredients",

      progress_tracker(1.0, desc="Done!")
      return output_html  # Return the generated HTML directly
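
Both files fan the per-product work out through process_in_parallel(items=..., processor_func=..., max_workers=...) from api_utils, where each processor returns a (product, matches) pair. The actual helper is not shown in this commit; a hypothetical sketch of a helper with that calling convention, using ThreadPoolExecutor (the real api_utils implementation may differ):

```python
# Hypothetical sketch matching the call sites above (items=, processor_func=,
# max_workers=); not the actual api_utils implementation.
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_in_parallel(items, processor_func, max_workers=10):
    """Run processor_func(item) -> (key, value) across items and collect a dict."""
    results = {}
    if not items:
        return results
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(processor_func, item): item for item in items}
        for future in as_completed(futures):
            key, value = future.result()  # each processor returns (product, matches)
            results[key] = value
    return results
```

Because each processor function already catches its own exceptions and returns a fallback, a helper of this shape can collect results without extra error handling at the pool level.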