Spaces:
Sleeping
Sleeping
Added expansion for embeddings
Browse files
- api_utils.py +2 -2
- comparison.py +1 -1
- similarity.py +1 -1
- ui.py +12 -2
- ui_category_matching.py +27 -6
- ui_formatters.py +11 -2
- ui_ingredient_matching.py +27 -14
api_utils.py
CHANGED
@@ -161,7 +161,7 @@ def rank_ingredients_openai(
|
|
161 |
candidates: List[str],
|
162 |
expanded_description: str = None,
|
163 |
client=None,
|
164 |
-
model: str = "
|
165 |
max_results: int = 3,
|
166 |
confidence_threshold: float = 0.5,
|
167 |
debug: bool = False
|
@@ -273,7 +273,7 @@ def rank_categories_openai(
|
|
273 |
categories: dict,
|
274 |
expanded_description: str = None,
|
275 |
client=None,
|
276 |
-
model: str = "
|
277 |
max_results: int = 5,
|
278 |
confidence_threshold: float = 0.5,
|
279 |
debug: bool = False
|
|
|
161 |
candidates: List[str],
|
162 |
expanded_description: str = None,
|
163 |
client=None,
|
164 |
+
model: str = "gpt-4o-mini",
|
165 |
max_results: int = 3,
|
166 |
confidence_threshold: float = 0.5,
|
167 |
debug: bool = False
|
|
|
273 |
categories: dict,
|
274 |
expanded_description: str = None,
|
275 |
client=None,
|
276 |
+
model: str = "gpt-4o-mini",
|
277 |
max_results: int = 5,
|
278 |
confidence_threshold: float = 0.5,
|
279 |
debug: bool = False
|
comparison.py
CHANGED
@@ -139,7 +139,7 @@ def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str,
|
|
139 |
product=product,
|
140 |
candidates=candidate_ingredients,
|
141 |
client=openai_client,
|
142 |
-
model="
|
143 |
max_results=final_top_n,
|
144 |
confidence_threshold=confidence_threshold
|
145 |
)
|
|
|
139 |
product=product,
|
140 |
candidates=candidate_ingredients,
|
141 |
client=openai_client,
|
142 |
+
model="gpt-4o-mini",
|
143 |
max_results=final_top_n,
|
144 |
confidence_threshold=confidence_threshold
|
145 |
)
|
similarity.py
CHANGED
@@ -205,7 +205,7 @@ def hybrid_ingredient_matching(products: List[str], ingredients_dict: Dict[str,
|
|
205 |
|
206 |
# Apply re-ranking using OpenAI's structured output
|
207 |
response = openai_client.responses.create(
|
208 |
-
model="
|
209 |
# reasoning={"effort": "low"},
|
210 |
input=[
|
211 |
{"role": "system", "content": "You are a food ingredient matching expert. Select the single best ingredient that matches the given product."},
|
|
|
205 |
|
206 |
# Apply re-ranking using OpenAI's structured output
|
207 |
response = openai_client.responses.create(
|
208 |
+
model="gpt-4o-mini",
|
209 |
# reasoning={"effort": "low"},
|
210 |
input=[
|
211 |
{"role": "system", "content": "You are a food ingredient matching expert. Select the single best ingredient that matches the given product."},
|
ui.py
CHANGED
@@ -26,6 +26,11 @@ def create_demo():
|
|
26 |
)
|
27 |
input_controls = gr.Row()
|
28 |
with input_controls:
|
|
|
|
|
|
|
|
|
|
|
29 |
top_n = gr.Slider(1, 25, 10, step=1, label="Top N Results")
|
30 |
confidence = gr.Slider(0.1, 0.9, 0.5, label="Similarity Threshold")
|
31 |
|
@@ -50,6 +55,11 @@ def create_demo():
|
|
50 |
)
|
51 |
category_input_controls = gr.Row()
|
52 |
with category_input_controls:
|
|
|
|
|
|
|
|
|
|
|
53 |
category_top_n = gr.Slider(1, 10, 5, step=1, label="Top N Categories")
|
54 |
category_confidence = gr.Slider(0.1, 0.9, 0.5, label="Matching Threshold")
|
55 |
|
@@ -168,14 +178,14 @@ def create_demo():
|
|
168 |
# Connect buttons for ingredient matching
|
169 |
categorize_btn.click(
|
170 |
fn=categorize_products,
|
171 |
-
inputs=[text_input, gr.State(False), top_n, confidence],
|
172 |
outputs=[text_output],
|
173 |
)
|
174 |
|
175 |
# Connect buttons for category matching
|
176 |
match_categories_btn.click(
|
177 |
fn=categorize_products_by_category,
|
178 |
-
inputs=[category_text_input, gr.State(False), category_top_n, category_confidence],
|
179 |
outputs=[category_output],
|
180 |
)
|
181 |
|
|
|
26 |
)
|
27 |
input_controls = gr.Row()
|
28 |
with input_controls:
|
29 |
+
use_expansion = gr.Checkbox(
|
30 |
+
value=False,
|
31 |
+
label="Use Description Expansion",
|
32 |
+
info="Expand product descriptions using AI before matching"
|
33 |
+
)
|
34 |
top_n = gr.Slider(1, 25, 10, step=1, label="Top N Results")
|
35 |
confidence = gr.Slider(0.1, 0.9, 0.5, label="Similarity Threshold")
|
36 |
|
|
|
55 |
)
|
56 |
category_input_controls = gr.Row()
|
57 |
with category_input_controls:
|
58 |
+
category_use_expansion = gr.Checkbox(
|
59 |
+
value=False,
|
60 |
+
label="Use Description Expansion",
|
61 |
+
info="Expand product descriptions using AI before matching"
|
62 |
+
)
|
63 |
category_top_n = gr.Slider(1, 10, 5, step=1, label="Top N Categories")
|
64 |
category_confidence = gr.Slider(0.1, 0.9, 0.5, label="Matching Threshold")
|
65 |
|
|
|
178 |
# Connect buttons for ingredient matching
|
179 |
categorize_btn.click(
|
180 |
fn=categorize_products,
|
181 |
+
inputs=[text_input, gr.State(False), use_expansion, top_n, confidence],
|
182 |
outputs=[text_output],
|
183 |
)
|
184 |
|
185 |
# Connect buttons for category matching
|
186 |
match_categories_btn.click(
|
187 |
fn=categorize_products_by_category,
|
188 |
+
inputs=[category_text_input, gr.State(False), category_use_expansion, category_top_n, category_confidence],
|
189 |
outputs=[category_output],
|
190 |
)
|
191 |
|
ui_category_matching.py
CHANGED
@@ -3,10 +3,12 @@ from utils import SafeProgress
|
|
3 |
from category_matching import load_categories, match_products_to_categories
|
4 |
from ui_core import parse_input
|
5 |
from ui_formatters import format_categories_html
|
|
|
|
|
|
|
6 |
|
7 |
-
def categorize_products_by_category(product_input, is_file=False, top_n=5, confidence_threshold=0.5, progress=gr.Progress()):
|
8 |
"""Categorize products by matching them to predefined categories"""
|
9 |
-
progress_tracker = SafeProgress(
|
10 |
progress_tracker(0, desc="Starting categorization...")
|
11 |
|
12 |
# Parse input
|
@@ -14,6 +16,16 @@ def categorize_products_by_category(product_input, is_file=False, top_n=5, confi
|
|
14 |
if error:
|
15 |
return error
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
# Load categories
|
18 |
progress_tracker(0.2, desc="Loading categories...")
|
19 |
categories = load_categories()
|
@@ -21,11 +33,11 @@ def categorize_products_by_category(product_input, is_file=False, top_n=5, confi
|
|
21 |
# Match products to categories
|
22 |
progress_tracker(0.3, desc="Matching products to categories...")
|
23 |
match_results = match_products_to_categories(
|
24 |
-
|
25 |
categories,
|
26 |
top_n=int(top_n),
|
27 |
confidence_threshold=confidence_threshold,
|
28 |
-
progress=
|
29 |
)
|
30 |
|
31 |
# Format results
|
@@ -33,8 +45,17 @@ def categorize_products_by_category(product_input, is_file=False, top_n=5, confi
|
|
33 |
output_html = "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>"
|
34 |
output_html += f"<p style='color: #555;'>Matched {len(product_names)} products to categories.</p>"
|
35 |
|
36 |
-
for
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
output_html += "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"
|
39 |
|
40 |
output_html += "</div>"
|
|
|
3 |
from category_matching import load_categories, match_products_to_categories
|
4 |
from ui_core import parse_input
|
5 |
from ui_formatters import format_categories_html
|
6 |
+
from openai_expansion import expand_product_descriptions
|
7 |
+
|
8 |
+
def categorize_products_by_category(product_input, is_file=False, use_expansion=False, top_n=10, confidence_threshold=0.5):
|
9 |
|
|
|
10 |
"""Categorize products by matching them to predefined categories"""
|
11 |
+
progress_tracker = SafeProgress(gr.Progress())
|
12 |
progress_tracker(0, desc="Starting categorization...")
|
13 |
|
14 |
# Parse input
|
|
|
16 |
if error:
|
17 |
return error
|
18 |
|
19 |
+
# Optional description expansion
|
20 |
+
expanded_descriptions = {}
|
21 |
+
if use_expansion:
|
22 |
+
progress_tracker(0.1, desc="Expanding product descriptions...")
|
23 |
+
expanded_descriptions = expand_product_descriptions(product_names, progress=progress_tracker)
|
24 |
+
# Use expanded descriptions for matching if available
|
25 |
+
products_to_match = [expanded_descriptions.get(p, p) for p in product_names]
|
26 |
+
else:
|
27 |
+
products_to_match = product_names
|
28 |
+
|
29 |
# Load categories
|
30 |
progress_tracker(0.2, desc="Loading categories...")
|
31 |
categories = load_categories()
|
|
|
33 |
# Match products to categories
|
34 |
progress_tracker(0.3, desc="Matching products to categories...")
|
35 |
match_results = match_products_to_categories(
|
36 |
+
products_to_match,
|
37 |
categories,
|
38 |
top_n=int(top_n),
|
39 |
confidence_threshold=confidence_threshold,
|
40 |
+
progress=progress_tracker
|
41 |
)
|
42 |
|
43 |
# Format results
|
|
|
45 |
output_html = "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>"
|
46 |
output_html += f"<p style='color: #555;'>Matched {len(product_names)} products to categories.</p>"
|
47 |
|
48 |
+
for i, product in enumerate(product_names):
|
49 |
+
categories = match_results.get(products_to_match[i], [])
|
50 |
+
expansion_text = ""
|
51 |
+
if use_expansion and product in expanded_descriptions:
|
52 |
+
expansion_text = f"<div style='color: #666; font-style: italic; margin: 5px 0;'>Expanded description: {expanded_descriptions[product]}</div>"
|
53 |
+
|
54 |
+
output_html += format_categories_html(
|
55 |
+
product,
|
56 |
+
categories,
|
57 |
+
explanation=expansion_text
|
58 |
+
)
|
59 |
output_html += "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"
|
60 |
|
61 |
output_html += "</div>"
|
ui_formatters.py
CHANGED
@@ -61,7 +61,7 @@ METHOD_NAMES = {
|
|
61 |
"base": "Base Embeddings",
|
62 |
"voyage": "Voyage AI Reranker",
|
63 |
"chicory": "Chicory Parser",
|
64 |
-
"openai": "OpenAI
|
65 |
"expanded": "Expanded Description",
|
66 |
"hybrid": "Hybrid Matching",
|
67 |
"categories": "Category Matches"
|
@@ -512,7 +512,7 @@ def set_theme(theme_name):
|
|
512 |
return True
|
513 |
return False
|
514 |
|
515 |
-
def format_categories_html(product, categories, chicory_result=None, header_color=None):
|
516 |
"""
|
517 |
Format category matching results as HTML
|
518 |
|
@@ -521,12 +521,20 @@ def format_categories_html(product, categories, chicory_result=None, header_colo
|
|
521 |
categories: List of (category, score) tuples
|
522 |
chicory_result: Optional chicory parser result for the product
|
523 |
header_color: Optional header background color
|
|
|
524 |
|
525 |
Returns:
|
526 |
HTML string
|
527 |
"""
|
528 |
content = ""
|
529 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
530 |
# Add Chicory results if available
|
531 |
if chicory_result:
|
532 |
content += f"<div style='{STYLES['info_panel']}'>"
|
@@ -554,3 +562,4 @@ def format_categories_html(product, categories, chicory_result=None, header_colo
|
|
554 |
)
|
555 |
|
556 |
return format_result_card(title=product, content=content)
|
|
|
|
61 |
"base": "Base Embeddings",
|
62 |
"voyage": "Voyage AI Reranker",
|
63 |
"chicory": "Chicory Parser",
|
64 |
+
"openai": "OpenAI",
|
65 |
"expanded": "Expanded Description",
|
66 |
"hybrid": "Hybrid Matching",
|
67 |
"categories": "Category Matches"
|
|
|
512 |
return True
|
513 |
return False
|
514 |
|
515 |
+
def format_categories_html(product, categories, chicory_result=None, header_color=None, explanation=""):
|
516 |
"""
|
517 |
Format category matching results as HTML
|
518 |
|
|
|
521 |
categories: List of (category, score) tuples
|
522 |
chicory_result: Optional chicory parser result for the product
|
523 |
header_color: Optional header background color
|
524 |
+
explanation: Optional expanded description text
|
525 |
|
526 |
Returns:
|
527 |
HTML string
|
528 |
"""
|
529 |
content = ""
|
530 |
|
531 |
+
# Add expanded description if available
|
532 |
+
if explanation:
|
533 |
+
content += f"<div style='{STYLES['info_panel']}'>"
|
534 |
+
content += "<h4 style='margin-top: 0; border-bottom: 1px solid rgba(0,0,0,0.1); padding-bottom: 8px;'>Expanded Description</h4>"
|
535 |
+
content += f"<p style='margin-bottom: 8px;'>{explanation}</p>"
|
536 |
+
content += "</div>"
|
537 |
+
|
538 |
# Add Chicory results if available
|
539 |
if chicory_result:
|
540 |
content += f"<div style='{STYLES['info_panel']}'>"
|
|
|
562 |
)
|
563 |
|
564 |
return format_result_card(title=product, content=content)
|
565 |
+
|
ui_ingredient_matching.py
CHANGED
@@ -5,10 +5,11 @@ from similarity import compute_similarities
|
|
5 |
from chicory_api import call_chicory_parser
|
6 |
from ui_core import embeddings, parse_input
|
7 |
from ui_formatters import format_categories_html, create_results_container
|
|
|
8 |
|
9 |
-
def categorize_products(product_input, is_file=False, top_n=10, confidence_threshold=0.5
|
10 |
"""Categorize products from text input or file"""
|
11 |
-
progress_tracker = SafeProgress(
|
12 |
progress_tracker(0, desc="Starting...")
|
13 |
|
14 |
# Parse input
|
@@ -20,40 +21,52 @@ def categorize_products(product_input, is_file=False, top_n=10, confidence_thres
|
|
20 |
if not embeddings:
|
21 |
return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
# Create embeddings
|
24 |
-
progress_tracker(0.
|
25 |
-
products_embeddings = create_product_embeddings(product_names, progress=
|
26 |
|
27 |
if not products_embeddings:
|
28 |
return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"
|
29 |
|
30 |
# Call Chicory Parser API
|
31 |
-
progress_tracker(0.
|
32 |
-
chicory_results = call_chicory_parser(product_names, progress=
|
33 |
|
34 |
# Compute similarities
|
35 |
-
progress_tracker(0.
|
36 |
all_similarities = compute_similarities(embeddings, products_embeddings)
|
37 |
|
38 |
# Format results
|
39 |
progress_tracker(0.9, desc="Formatting results...")
|
40 |
-
output_html = "<
|
41 |
-
output_html += f"<p style='color: #555;'>Processing {len(product_names)} products.</p>"
|
42 |
|
43 |
for product, similarities in all_similarities.items():
|
44 |
filtered_similarities = [(ingredient, score) for ingredient, score in similarities if score >= confidence_threshold]
|
45 |
top_similarities = filtered_similarities[:int(top_n)]
|
46 |
|
|
|
|
|
|
|
47 |
# Debug info for Chicory results
|
48 |
chicory_data = chicory_results.get(product, [])
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
51 |
output_html += "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"
|
52 |
-
|
53 |
-
output_html += "</div>"
|
54 |
|
|
|
55 |
if not all_similarities:
|
56 |
output_html = "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"
|
57 |
|
58 |
progress_tracker(1.0, desc="Done!")
|
59 |
-
return output_html
|
|
|
|
5 |
from chicory_api import call_chicory_parser
|
6 |
from ui_core import embeddings, parse_input
|
7 |
from ui_formatters import format_categories_html, create_results_container
|
8 |
+
from openai_expansion import expand_product_descriptions
|
9 |
|
10 |
+
def categorize_products(product_input, is_file=False, use_expansion=False, top_n=10, confidence_threshold=0.5):
|
11 |
"""Categorize products from text input or file"""
|
12 |
+
progress_tracker = SafeProgress(gr.Progress())
|
13 |
progress_tracker(0, desc="Starting...")
|
14 |
|
15 |
# Parse input
|
|
|
21 |
if not embeddings:
|
22 |
return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"
|
23 |
|
24 |
+
# Optional description expansion
|
25 |
+
expanded_descriptions = {}
|
26 |
+
if use_expansion:
|
27 |
+
progress_tracker(0.2, desc="Expanding product descriptions...")
|
28 |
+
expanded_descriptions = expand_product_descriptions(product_names, progress=gr.Progress())
|
29 |
+
|
30 |
# Create embeddings
|
31 |
+
progress_tracker(0.4, desc="Generating product embeddings...")
|
32 |
+
products_embeddings = create_product_embeddings(product_names, progress=gr.Progress())
|
33 |
|
34 |
if not products_embeddings:
|
35 |
return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"
|
36 |
|
37 |
# Call Chicory Parser API
|
38 |
+
progress_tracker(0.6, desc="Calling Chicory Parser API...")
|
39 |
+
chicory_results = call_chicory_parser(product_names, progress=gr.Progress())
|
40 |
|
41 |
# Compute similarities
|
42 |
+
progress_tracker(0.8, desc="Computing similarities...")
|
43 |
all_similarities = compute_similarities(embeddings, products_embeddings)
|
44 |
|
45 |
# Format results
|
46 |
progress_tracker(0.9, desc="Formatting results...")
|
47 |
+
output_html = f"<p style='color: #555;'>Processing {len(product_names)} products.</p>"
|
|
|
48 |
|
49 |
for product, similarities in all_similarities.items():
|
50 |
filtered_similarities = [(ingredient, score) for ingredient, score in similarities if score >= confidence_threshold]
|
51 |
top_similarities = filtered_similarities[:int(top_n)]
|
52 |
|
53 |
+
# Add expansion explanation if available
|
54 |
+
expansion_text = expanded_descriptions.get(product, "") if use_expansion else ""
|
55 |
+
|
56 |
# Debug info for Chicory results
|
57 |
chicory_data = chicory_results.get(product, [])
|
58 |
+
output_html += format_categories_html(
|
59 |
+
product,
|
60 |
+
top_similarities,
|
61 |
+
chicory_result=chicory_data,
|
62 |
+
explanation=expansion_text
|
63 |
+
)
|
64 |
output_html += "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"
|
|
|
|
|
65 |
|
66 |
+
output_html += "</div>"
|
67 |
if not all_similarities:
|
68 |
output_html = "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"
|
69 |
|
70 |
progress_tracker(1.0, desc="Done!")
|
71 |
+
return create_results_container(output_html)
|
72 |
+
|