esilver committed on
Commit
1e737a1
·
1 Parent(s): 9a56a50

Added expansion for embeddings

Browse files
api_utils.py CHANGED
@@ -161,7 +161,7 @@ def rank_ingredients_openai(
161
  candidates: List[str],
162
  expanded_description: str = None,
163
  client=None,
164
- model: str = "o3-mini",
165
  max_results: int = 3,
166
  confidence_threshold: float = 0.5,
167
  debug: bool = False
@@ -273,7 +273,7 @@ def rank_categories_openai(
273
  categories: dict,
274
  expanded_description: str = None,
275
  client=None,
276
- model: str = "o3-mini",
277
  max_results: int = 5,
278
  confidence_threshold: float = 0.5,
279
  debug: bool = False
 
161
  candidates: List[str],
162
  expanded_description: str = None,
163
  client=None,
164
+ model: str = "gpt-4o-mini",
165
  max_results: int = 3,
166
  confidence_threshold: float = 0.5,
167
  debug: bool = False
 
273
  categories: dict,
274
  expanded_description: str = None,
275
  client=None,
276
+ model: str = "gpt-4o-mini",
277
  max_results: int = 5,
278
  confidence_threshold: float = 0.5,
279
  debug: bool = False
comparison.py CHANGED
@@ -139,7 +139,7 @@ def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str,
139
  product=product,
140
  candidates=candidate_ingredients,
141
  client=openai_client,
142
- model="o3-mini",
143
  max_results=final_top_n,
144
  confidence_threshold=confidence_threshold
145
  )
 
139
  product=product,
140
  candidates=candidate_ingredients,
141
  client=openai_client,
142
+ model="gpt-4o-mini",
143
  max_results=final_top_n,
144
  confidence_threshold=confidence_threshold
145
  )
similarity.py CHANGED
@@ -205,7 +205,7 @@ def hybrid_ingredient_matching(products: List[str], ingredients_dict: Dict[str,
205
 
206
  # Apply re-ranking using OpenAI's structured output
207
  response = openai_client.responses.create(
208
- model="o3-mini",
209
  # reasoning={"effort": "low"},
210
  input=[
211
  {"role": "system", "content": "You are a food ingredient matching expert. Select the single best ingredient that matches the given product."},
 
205
 
206
  # Apply re-ranking using OpenAI's structured output
207
  response = openai_client.responses.create(
208
+ model="gpt-4o-mini",
209
  # reasoning={"effort": "low"},
210
  input=[
211
  {"role": "system", "content": "You are a food ingredient matching expert. Select the single best ingredient that matches the given product."},
ui.py CHANGED
@@ -26,6 +26,11 @@ def create_demo():
26
  )
27
  input_controls = gr.Row()
28
  with input_controls:
 
 
 
 
 
29
  top_n = gr.Slider(1, 25, 10, step=1, label="Top N Results")
30
  confidence = gr.Slider(0.1, 0.9, 0.5, label="Similarity Threshold")
31
 
@@ -50,6 +55,11 @@ def create_demo():
50
  )
51
  category_input_controls = gr.Row()
52
  with category_input_controls:
 
 
 
 
 
53
  category_top_n = gr.Slider(1, 10, 5, step=1, label="Top N Categories")
54
  category_confidence = gr.Slider(0.1, 0.9, 0.5, label="Matching Threshold")
55
 
@@ -168,14 +178,14 @@ def create_demo():
168
  # Connect buttons for ingredient matching
169
  categorize_btn.click(
170
  fn=categorize_products,
171
- inputs=[text_input, gr.State(False), top_n, confidence],
172
  outputs=[text_output],
173
  )
174
 
175
  # Connect buttons for category matching
176
  match_categories_btn.click(
177
  fn=categorize_products_by_category,
178
- inputs=[category_text_input, gr.State(False), category_top_n, category_confidence],
179
  outputs=[category_output],
180
  )
181
 
 
26
  )
27
  input_controls = gr.Row()
28
  with input_controls:
29
+ use_expansion = gr.Checkbox(
30
+ value=False,
31
+ label="Use Description Expansion",
32
+ info="Expand product descriptions using AI before matching"
33
+ )
34
  top_n = gr.Slider(1, 25, 10, step=1, label="Top N Results")
35
  confidence = gr.Slider(0.1, 0.9, 0.5, label="Similarity Threshold")
36
 
 
55
  )
56
  category_input_controls = gr.Row()
57
  with category_input_controls:
58
+ category_use_expansion = gr.Checkbox(
59
+ value=False,
60
+ label="Use Description Expansion",
61
+ info="Expand product descriptions using AI before matching"
62
+ )
63
  category_top_n = gr.Slider(1, 10, 5, step=1, label="Top N Categories")
64
  category_confidence = gr.Slider(0.1, 0.9, 0.5, label="Matching Threshold")
65
 
 
178
  # Connect buttons for ingredient matching
179
  categorize_btn.click(
180
  fn=categorize_products,
181
+ inputs=[text_input, gr.State(False), use_expansion, top_n, confidence],
182
  outputs=[text_output],
183
  )
184
 
185
  # Connect buttons for category matching
186
  match_categories_btn.click(
187
  fn=categorize_products_by_category,
188
+ inputs=[category_text_input, gr.State(False), category_use_expansion, category_top_n, category_confidence],
189
  outputs=[category_output],
190
  )
191
 
ui_category_matching.py CHANGED
@@ -3,10 +3,12 @@ from utils import SafeProgress
3
  from category_matching import load_categories, match_products_to_categories
4
  from ui_core import parse_input
5
  from ui_formatters import format_categories_html
 
 
 
6
 
7
- def categorize_products_by_category(product_input, is_file=False, top_n=5, confidence_threshold=0.5, progress=gr.Progress()):
8
  """Categorize products by matching them to predefined categories"""
9
- progress_tracker = SafeProgress(progress)
10
  progress_tracker(0, desc="Starting categorization...")
11
 
12
  # Parse input
@@ -14,6 +16,16 @@ def categorize_products_by_category(product_input, is_file=False, top_n=5, confi
14
  if error:
15
  return error
16
 
 
 
 
 
 
 
 
 
 
 
17
  # Load categories
18
  progress_tracker(0.2, desc="Loading categories...")
19
  categories = load_categories()
@@ -21,11 +33,11 @@ def categorize_products_by_category(product_input, is_file=False, top_n=5, confi
21
  # Match products to categories
22
  progress_tracker(0.3, desc="Matching products to categories...")
23
  match_results = match_products_to_categories(
24
- product_names,
25
  categories,
26
  top_n=int(top_n),
27
  confidence_threshold=confidence_threshold,
28
- progress=progress
29
  )
30
 
31
  # Format results
@@ -33,8 +45,17 @@ def categorize_products_by_category(product_input, is_file=False, top_n=5, confi
33
  output_html = "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>"
34
  output_html += f"<p style='color: #555;'>Matched {len(product_names)} products to categories.</p>"
35
 
36
- for product, categories in match_results.items():
37
- output_html += format_categories_html(product, categories)
 
 
 
 
 
 
 
 
 
38
  output_html += "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"
39
 
40
  output_html += "</div>"
 
3
  from category_matching import load_categories, match_products_to_categories
4
  from ui_core import parse_input
5
  from ui_formatters import format_categories_html
6
+ from openai_expansion import expand_product_descriptions
7
+
8
+ def categorize_products_by_category(product_input, is_file=False, use_expansion=False, top_n=10, confidence_threshold=0.5):
9
 
 
10
  """Categorize products by matching them to predefined categories"""
11
+ progress_tracker = SafeProgress(gr.Progress())
12
  progress_tracker(0, desc="Starting categorization...")
13
 
14
  # Parse input
 
16
  if error:
17
  return error
18
 
19
+ # Optional description expansion
20
+ expanded_descriptions = {}
21
+ if use_expansion:
22
+ progress_tracker(0.1, desc="Expanding product descriptions...")
23
+ expanded_descriptions = expand_product_descriptions(product_names, progress=progress_tracker)
24
+ # Use expanded descriptions for matching if available
25
+ products_to_match = [expanded_descriptions.get(p, p) for p in product_names]
26
+ else:
27
+ products_to_match = product_names
28
+
29
  # Load categories
30
  progress_tracker(0.2, desc="Loading categories...")
31
  categories = load_categories()
 
33
  # Match products to categories
34
  progress_tracker(0.3, desc="Matching products to categories...")
35
  match_results = match_products_to_categories(
36
+ products_to_match,
37
  categories,
38
  top_n=int(top_n),
39
  confidence_threshold=confidence_threshold,
40
+ progress=progress_tracker
41
  )
42
 
43
  # Format results
 
45
  output_html = "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>"
46
  output_html += f"<p style='color: #555;'>Matched {len(product_names)} products to categories.</p>"
47
 
48
+ for i, product in enumerate(product_names):
49
+ categories = match_results.get(products_to_match[i], [])
50
+ expansion_text = ""
51
+ if use_expansion and product in expanded_descriptions:
52
+ expansion_text = f"<div style='color: #666; font-style: italic; margin: 5px 0;'>Expanded description: {expanded_descriptions[product]}</div>"
53
+
54
+ output_html += format_categories_html(
55
+ product,
56
+ categories,
57
+ explanation=expansion_text
58
+ )
59
  output_html += "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"
60
 
61
  output_html += "</div>"
ui_formatters.py CHANGED
@@ -61,7 +61,7 @@ METHOD_NAMES = {
61
  "base": "Base Embeddings",
62
  "voyage": "Voyage AI Reranker",
63
  "chicory": "Chicory Parser",
64
- "openai": "OpenAI o3-mini",
65
  "expanded": "Expanded Description",
66
  "hybrid": "Hybrid Matching",
67
  "categories": "Category Matches"
@@ -512,7 +512,7 @@ def set_theme(theme_name):
512
  return True
513
  return False
514
 
515
- def format_categories_html(product, categories, chicory_result=None, header_color=None):
516
  """
517
  Format category matching results as HTML
518
 
@@ -521,12 +521,20 @@ def format_categories_html(product, categories, chicory_result=None, header_colo
521
  categories: List of (category, score) tuples
522
  chicory_result: Optional chicory parser result for the product
523
  header_color: Optional header background color
 
524
 
525
  Returns:
526
  HTML string
527
  """
528
  content = ""
529
 
 
 
 
 
 
 
 
530
  # Add Chicory results if available
531
  if chicory_result:
532
  content += f"<div style='{STYLES['info_panel']}'>"
@@ -554,3 +562,4 @@ def format_categories_html(product, categories, chicory_result=None, header_colo
554
  )
555
 
556
  return format_result_card(title=product, content=content)
 
 
61
  "base": "Base Embeddings",
62
  "voyage": "Voyage AI Reranker",
63
  "chicory": "Chicory Parser",
64
+ "openai": "OpenAI",
65
  "expanded": "Expanded Description",
66
  "hybrid": "Hybrid Matching",
67
  "categories": "Category Matches"
 
512
  return True
513
  return False
514
 
515
+ def format_categories_html(product, categories, chicory_result=None, header_color=None, explanation=""):
516
  """
517
  Format category matching results as HTML
518
 
 
521
  categories: List of (category, score) tuples
522
  chicory_result: Optional chicory parser result for the product
523
  header_color: Optional header background color
524
+ explanation: Optional expanded description text
525
 
526
  Returns:
527
  HTML string
528
  """
529
  content = ""
530
 
531
+ # Add expanded description if available
532
+ if explanation:
533
+ content += f"<div style='{STYLES['info_panel']}'>"
534
+ content += "<h4 style='margin-top: 0; border-bottom: 1px solid rgba(0,0,0,0.1); padding-bottom: 8px;'>Expanded Description</h4>"
535
+ content += f"<p style='margin-bottom: 8px;'>{explanation}</p>"
536
+ content += "</div>"
537
+
538
  # Add Chicory results if available
539
  if chicory_result:
540
  content += f"<div style='{STYLES['info_panel']}'>"
 
562
  )
563
 
564
  return format_result_card(title=product, content=content)
565
+
ui_ingredient_matching.py CHANGED
@@ -5,10 +5,11 @@ from similarity import compute_similarities
5
  from chicory_api import call_chicory_parser
6
  from ui_core import embeddings, parse_input
7
  from ui_formatters import format_categories_html, create_results_container
 
8
 
9
- def categorize_products(product_input, is_file=False, top_n=10, confidence_threshold=0.5, progress=gr.Progress()):
10
  """Categorize products from text input or file"""
11
- progress_tracker = SafeProgress(progress)
12
  progress_tracker(0, desc="Starting...")
13
 
14
  # Parse input
@@ -20,40 +21,52 @@ def categorize_products(product_input, is_file=False, top_n=10, confidence_thres
20
  if not embeddings:
21
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"
22
 
 
 
 
 
 
 
23
  # Create embeddings
24
- progress_tracker(0.2, desc="Generating product embeddings...")
25
- products_embeddings = create_product_embeddings(product_names, progress=progress)
26
 
27
  if not products_embeddings:
28
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"
29
 
30
  # Call Chicory Parser API
31
- progress_tracker(0.5, desc="Calling Chicory Parser API...")
32
- chicory_results = call_chicory_parser(product_names, progress=progress)
33
 
34
  # Compute similarities
35
- progress_tracker(0.7, desc="Computing similarities...")
36
  all_similarities = compute_similarities(embeddings, products_embeddings)
37
 
38
  # Format results
39
  progress_tracker(0.9, desc="Formatting results...")
40
- output_html = "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>"
41
- output_html += f"<p style='color: #555;'>Processing {len(product_names)} products.</p>"
42
 
43
  for product, similarities in all_similarities.items():
44
  filtered_similarities = [(ingredient, score) for ingredient, score in similarities if score >= confidence_threshold]
45
  top_similarities = filtered_similarities[:int(top_n)]
46
 
 
 
 
47
  # Debug info for Chicory results
48
  chicory_data = chicory_results.get(product, [])
49
-
50
- output_html += format_categories_html(product, top_similarities, chicory_result=chicory_data)
 
 
 
 
51
  output_html += "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"
52
-
53
- output_html += "</div>"
54
 
 
55
  if not all_similarities:
56
  output_html = "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"
57
 
58
  progress_tracker(1.0, desc="Done!")
59
- return output_html
 
 
5
  from chicory_api import call_chicory_parser
6
  from ui_core import embeddings, parse_input
7
  from ui_formatters import format_categories_html, create_results_container
8
+ from openai_expansion import expand_product_descriptions
9
 
10
+ def categorize_products(product_input, is_file=False, use_expansion=False, top_n=10, confidence_threshold=0.5):
11
  """Categorize products from text input or file"""
12
+ progress_tracker = SafeProgress(gr.Progress())
13
  progress_tracker(0, desc="Starting...")
14
 
15
  # Parse input
 
21
  if not embeddings:
22
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"
23
 
24
+ # Optional description expansion
25
+ expanded_descriptions = {}
26
+ if use_expansion:
27
+ progress_tracker(0.2, desc="Expanding product descriptions...")
28
+ expanded_descriptions = expand_product_descriptions(product_names, progress=gr.Progress())
29
+
30
  # Create embeddings
31
+ progress_tracker(0.4, desc="Generating product embeddings...")
32
+ products_embeddings = create_product_embeddings(product_names, progress=gr.Progress())
33
 
34
  if not products_embeddings:
35
  return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"
36
 
37
  # Call Chicory Parser API
38
+ progress_tracker(0.6, desc="Calling Chicory Parser API...")
39
+ chicory_results = call_chicory_parser(product_names, progress=gr.Progress())
40
 
41
  # Compute similarities
42
+ progress_tracker(0.8, desc="Computing similarities...")
43
  all_similarities = compute_similarities(embeddings, products_embeddings)
44
 
45
  # Format results
46
  progress_tracker(0.9, desc="Formatting results...")
47
+ output_html = f"<p style='color: #555;'>Processing {len(product_names)} products.</p>"
 
48
 
49
  for product, similarities in all_similarities.items():
50
  filtered_similarities = [(ingredient, score) for ingredient, score in similarities if score >= confidence_threshold]
51
  top_similarities = filtered_similarities[:int(top_n)]
52
 
53
+ # Add expansion explanation if available
54
+ expansion_text = expanded_descriptions.get(product, "") if use_expansion else ""
55
+
56
  # Debug info for Chicory results
57
  chicory_data = chicory_results.get(product, [])
58
+ output_html += format_categories_html(
59
+ product,
60
+ top_similarities,
61
+ chicory_result=chicory_data,
62
+ explanation=expansion_text
63
+ )
64
  output_html += "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"
 
 
65
 
66
+ output_html += "</div>"
67
  if not all_similarities:
68
  output_html = "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"
69
 
70
  progress_tracker(1.0, desc="Done!")
71
+ return create_results_container(output_html)
72
+