Spaces:
Sleeping
Sleeping
Fixed pipeline bugs
Browse files
- api_utils.py +25 -2
- openai_expansion.py +4 -2
- ui_category_matching.py +16 -4
- ui_expanded_matching.py +35 -3
- ui_hybrid_matching.py +20 -5
- ui_ingredient_matching.py +15 -2
api_utils.py
CHANGED
@@ -235,7 +235,28 @@ def rank_ingredients_openai(
|
|
235 |
model=model,
|
236 |
# reasoning={"effort": "low"},
|
237 |
input=[
|
238 |
-
{"role": "system", "content": f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
{"role": "user", "content": prompt}
|
240 |
],
|
241 |
text={
|
@@ -367,7 +388,9 @@ def rank_categories_openai(
|
|
367 |
model=model,
|
368 |
# reasoning={"effort": "low"},
|
369 |
input=[
|
370 |
-
{"role": "system", "content": f"You are a food
|
|
|
|
|
371 |
{"role": "user", "content": prompt}
|
372 |
],
|
373 |
text={
|
|
|
235 |
model=model,
|
236 |
# reasoning={"effort": "low"},
|
237 |
input=[
|
238 |
+
{"role": "system", "content": f"""
|
239 |
+
You are a product categorization expert. Your task is to match product descriptions to the most relevant categories from the PROVIDED LIST ONLY.
|
240 |
+
|
241 |
+
CRITICAL RULES:
|
242 |
+
1. You MUST ONLY select from the exact items listed in "Potential ingredients" - DO NOT create or invent new categories
|
243 |
+
2. Do not combine items from the list or add any words to them
|
244 |
+
3. Choose the items from the list that best match what the product IS or CONTAINS
|
245 |
+
4. If none of the items perfectly match, choose the closest matches from the provided list
|
246 |
+
|
247 |
+
For the rankings:
|
248 |
+
- Select ONLY from the exact items in the "Potential ingredients" list
|
249 |
+
- Assign relevance scores from 0.0 to 1.0
|
250 |
+
- Rank the top {max_results} matching ingredients.
|
251 |
+
- Provide brief explanations for why each item is relevant
|
252 |
+
- Do not suggest alternatives outside the provided list
|
253 |
+
|
254 |
+
Aim to identify the specific product category a consumer would look for when shopping for this exact item.
|
255 |
+
Only include ingredients with relevance score >= {confidence_threshold}.
|
256 |
+
|
257 |
+
Remember: Your ONLY options are the exact items listed in "Potential ingredients" - no additions, modifications, or combinations.
|
258 |
+
|
259 |
+
"""},
|
260 |
{"role": "user", "content": prompt}
|
261 |
],
|
262 |
text={
|
|
|
388 |
model=model,
|
389 |
# reasoning={"effort": "low"},
|
390 |
input=[
|
391 |
+
{"role": "system", "content": f"""You are a food ingredient matching expert. Rank the top 5 ingredients that are CONTAINED WITHIN the given product itself - not items related to or used with the product.
|
392 |
+
Only include ingredients with relevance score >= 0.0. Higher scores should be given to primary ingredients that make up the majority of the product's composition.
|
393 |
+
Only include categories with relevance score >= {confidence_threshold}."""},
|
394 |
{"role": "user", "content": prompt}
|
395 |
],
|
396 |
text={
|
openai_expansion.py
CHANGED
@@ -34,8 +34,10 @@ def expand_product_descriptions(products: List[str],
|
|
34 |
# max_output_tokens=100,
|
35 |
# reasoning={"effort": "low"},
|
36 |
input=[
|
37 |
-
{"role": "system", "content": """You are a product description expert. Your task is to
|
38 |
-
|
|
|
|
|
39 |
"""},
|
40 |
{"role": "user", "content": f'Describe "{product}" to an embedding model categorizing products'}
|
41 |
],
|
|
|
34 |
# max_output_tokens=100,
|
35 |
# reasoning={"effort": "low"},
|
36 |
input=[
|
37 |
+
{"role": "system", "content": """You are a product description expert. Your task is to convert brand names or abbreviated product mentions into clear product descriptions that help
|
38 |
+
categorization models understand WHAT the item is, not just who makes it. Focus on the product category, form, and usage rather
|
39 |
+
than brand attributes. When brands are mentioned (like Green Mountain), identify the actual product type they represent (coffee pods)
|
40 |
+
rather than describing the brand itself. Always provide the most common product interpretation and avoid mentioning brands in your description when possible.
|
41 |
"""},
|
42 |
{"role": "user", "content": f'Describe "{product}" to an embedding model categorizing products'}
|
43 |
],
|
ui_category_matching.py
CHANGED
@@ -30,6 +30,12 @@ def categorize_products_by_category(product_input, is_file=False, use_expansion=
|
|
30 |
progress_tracker(0.2, desc="Loading categories...")
|
31 |
categories = load_categories()
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
# Match products to categories
|
34 |
progress_tracker(0.3, desc="Matching products to categories...")
|
35 |
match_results = match_products_to_categories(
|
@@ -39,14 +45,20 @@ def categorize_products_by_category(product_input, is_file=False, use_expansion=
|
|
39 |
confidence_threshold=confidence_threshold,
|
40 |
progress=progress_tracker
|
41 |
)
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
# Format results
|
44 |
progress_tracker(0.9, desc="Formatting results...")
|
45 |
output_html = "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>"
|
46 |
output_html += f"<p style='color: #555;'>Matched {len(product_names)} products to categories.</p>"
|
47 |
-
|
48 |
-
for
|
49 |
-
categories =
|
50 |
expansion_text = ""
|
51 |
if use_expansion and product in expanded_descriptions:
|
52 |
expansion_text = f"<div style='color: #666; font-style: italic; margin: 5px 0;'>Expanded description: {expanded_descriptions[product]}</div>"
|
|
|
30 |
progress_tracker(0.2, desc="Loading categories...")
|
31 |
categories = load_categories()
|
32 |
|
33 |
+
# Create a mapping from original product names to expanded versions
|
34 |
+
product_to_expanded = {}
|
35 |
+
for i, product in enumerate(product_names):
|
36 |
+
if i < len(products_to_match):
|
37 |
+
product_to_expanded[product] = products_to_match[i]
|
38 |
+
|
39 |
# Match products to categories
|
40 |
progress_tracker(0.3, desc="Matching products to categories...")
|
41 |
match_results = match_products_to_categories(
|
|
|
45 |
confidence_threshold=confidence_threshold,
|
46 |
progress=progress_tracker
|
47 |
)
|
48 |
+
|
49 |
+
# Create a new dictionary mapping original product names to their results
|
50 |
+
original_product_results = {}
|
51 |
+
for product, expanded in product_to_expanded.items():
|
52 |
+
if expanded in match_results:
|
53 |
+
original_product_results[product] = match_results[expanded]
|
54 |
+
|
55 |
# Format results
|
56 |
progress_tracker(0.9, desc="Formatting results...")
|
57 |
output_html = "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>"
|
58 |
output_html += f"<p style='color: #555;'>Matched {len(product_names)} products to categories.</p>"
|
59 |
+
|
60 |
+
for product in product_names:
|
61 |
+
categories = original_product_results.get(product, [])
|
62 |
expansion_text = ""
|
63 |
if use_expansion and product in expanded_descriptions:
|
64 |
expansion_text = f"<div style='color: #666; font-style: italic; margin: 5px 0;'>Expanded description: {expanded_descriptions[product]}</div>"
|
ui_expanded_matching.py
CHANGED
@@ -34,15 +34,34 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, use_
|
|
34 |
# Get shared OpenAI client
|
35 |
openai_client = get_openai_client()
|
36 |
|
|
|
|
|
37 |
if match_type == "ingredients":
|
38 |
# Generate product embeddings
|
39 |
progress_tracker(0.4, desc="Generating product embeddings...")
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
|
42 |
# Compute embedding similarities for ingredients
|
43 |
progress_tracker(0.6, desc="Computing ingredient similarities...")
|
44 |
all_similarities = compute_similarities(embeddings, product_embeddings)
|
45 |
|
|
|
|
|
|
|
|
|
46 |
if not all_similarities:
|
47 |
return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No similarities found. Please try different product names.</div>"
|
48 |
|
@@ -68,7 +87,7 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, use_
|
|
68 |
candidates=candidate_ingredients,
|
69 |
expanded_description=expanded_text,
|
70 |
client=openai_client,
|
71 |
-
model="
|
72 |
max_results=top_n,
|
73 |
confidence_threshold=0.0, # Don't filter here, do it at display time
|
74 |
debug=True
|
@@ -102,7 +121,20 @@ def categorize_products_with_openai_reranking(product_input, is_file=False, use_
|
|
102 |
|
103 |
# Generate product embeddings
|
104 |
progress_tracker(0.6, desc="Generating product embeddings...")
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
106 |
|
107 |
# Compute embedding similarities for categories
|
108 |
progress_tracker(0.7, desc="Computing category similarities...")
|
|
|
34 |
# Get shared OpenAI client
|
35 |
openai_client = get_openai_client()
|
36 |
|
37 |
+
products_for_embedding = ''
|
38 |
+
|
39 |
if match_type == "ingredients":
|
40 |
# Generate product embeddings
|
41 |
progress_tracker(0.4, desc="Generating product embeddings...")
|
42 |
+
if use_expansion and expanded_descriptions:
|
43 |
+
# Use expanded descriptions for embedding creation when available
|
44 |
+
products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
|
45 |
+
# Map expanded descriptions back to original product names for consistent keys
|
46 |
+
product_embeddings = {}
|
47 |
+
temp_embeddings = create_product_embeddings(products_for_embedding, progress=progress)
|
48 |
+
|
49 |
+
# Ensure we use original product names as keys
|
50 |
+
for i, product_name in enumerate(product_names):
|
51 |
+
if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
|
52 |
+
product_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
|
53 |
+
else:
|
54 |
+
# Standard embedding creation with just product names
|
55 |
+
product_embeddings = create_product_embeddings(product_names, progress=progress)
|
56 |
|
57 |
# Compute embedding similarities for ingredients
|
58 |
progress_tracker(0.6, desc="Computing ingredient similarities...")
|
59 |
all_similarities = compute_similarities(embeddings, product_embeddings)
|
60 |
|
61 |
+
print(f"product_names: {product_names}")
|
62 |
+
print(f"products_for_embedding: {products_for_embedding}")
|
63 |
+
# print(f"all_similarities: {all_similarities}")
|
64 |
+
|
65 |
if not all_similarities:
|
66 |
return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No similarities found. Please try different product names.</div>"
|
67 |
|
|
|
87 |
candidates=candidate_ingredients,
|
88 |
expanded_description=expanded_text,
|
89 |
client=openai_client,
|
90 |
+
model="gpt-4o-mini",
|
91 |
max_results=top_n,
|
92 |
confidence_threshold=0.0, # Don't filter here, do it at display time
|
93 |
debug=True
|
|
|
121 |
|
122 |
# Generate product embeddings
|
123 |
progress_tracker(0.6, desc="Generating product embeddings...")
|
124 |
+
if use_expansion and expanded_descriptions:
|
125 |
+
# Use expanded descriptions for embedding creation when available
|
126 |
+
products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
|
127 |
+
# Map expanded descriptions back to original product names for consistent keys
|
128 |
+
product_embeddings = {}
|
129 |
+
temp_embeddings = create_product_embeddings(products_for_embedding, progress=progress)
|
130 |
+
|
131 |
+
# Ensure we use original product names as keys
|
132 |
+
for i, product_name in enumerate(product_names):
|
133 |
+
if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
|
134 |
+
product_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
|
135 |
+
else:
|
136 |
+
# Standard embedding creation with just product names
|
137 |
+
product_embeddings = create_product_embeddings(product_names, progress=progress)
|
138 |
|
139 |
# Compute embedding similarities for categories
|
140 |
progress_tracker(0.7, desc="Computing category similarities...")
|
ui_hybrid_matching.py
CHANGED
@@ -124,7 +124,21 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
|
|
124 |
progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")
|
125 |
|
126 |
# Stage 1: Same as before - use embeddings to find candidates
|
127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
similarities = compute_similarities(ingredients_dict, product_embeddings)
|
129 |
|
130 |
# Filter to top N candidates per product
|
@@ -139,7 +153,7 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
|
|
139 |
|
140 |
# Stage 2: Re-rank using Voyage instead of OpenAI
|
141 |
final_results = {}
|
142 |
-
|
143 |
for i, product in enumerate(products):
|
144 |
progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")
|
145 |
|
@@ -156,8 +170,8 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
|
|
156 |
if expanded_descriptions and product in expanded_descriptions:
|
157 |
product_text = expanded_descriptions[product]
|
158 |
|
159 |
-
#
|
160 |
-
documents =
|
161 |
|
162 |
# Use Voyage reranking
|
163 |
reranked = voyage_client.rerank(
|
@@ -170,7 +184,8 @@ def hybrid_ingredient_matching_voyage(products, ingredients_dict,
|
|
170 |
voyage_results = []
|
171 |
for result in reranked["results"]:
|
172 |
score = result["relevance_score"]
|
173 |
-
|
|
|
174 |
|
175 |
# Still limit to final_top_n but don't filter by threshold here
|
176 |
final_results[product] = voyage_results[:final_top_n]
|
|
|
124 |
progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")
|
125 |
|
126 |
# Stage 1: Same as before - use embeddings to find candidates
|
127 |
+
if expanded_descriptions:
|
128 |
+
# Use expanded descriptions for embedding creation when available
|
129 |
+
products_for_embedding = [expanded_descriptions.get(name, name) for name in products]
|
130 |
+
# Map expanded descriptions back to original product names for consistent keys
|
131 |
+
product_embeddings = {}
|
132 |
+
temp_embeddings = create_product_embeddings(products_for_embedding, progress=progress_tracker)
|
133 |
+
|
134 |
+
# Ensure we use original product names as keys
|
135 |
+
for i, product_name in enumerate(products):
|
136 |
+
if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
|
137 |
+
product_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
|
138 |
+
else:
|
139 |
+
# Standard embedding creation with just product names
|
140 |
+
product_embeddings = create_product_embeddings(products, progress=progress_tracker)
|
141 |
+
|
142 |
similarities = compute_similarities(ingredients_dict, product_embeddings)
|
143 |
|
144 |
# Filter to top N candidates per product
|
|
|
153 |
|
154 |
# Stage 2: Re-rank using Voyage instead of OpenAI
|
155 |
final_results = {}
|
156 |
+
|
157 |
for i, product in enumerate(products):
|
158 |
progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")
|
159 |
|
|
|
170 |
if expanded_descriptions and product in expanded_descriptions:
|
171 |
product_text = expanded_descriptions[product]
|
172 |
|
173 |
+
# Use plain strings for the documents instead of objects with text property
|
174 |
+
documents = candidate_ingredients # Simply use the list of strings directly
|
175 |
|
176 |
# Use Voyage reranking
|
177 |
reranked = voyage_client.rerank(
|
|
|
184 |
voyage_results = []
|
185 |
for result in reranked["results"]:
|
186 |
score = result["relevance_score"]
|
187 |
+
text = result["document"] # Now this is the direct string, not an object
|
188 |
+
voyage_results.append((text, score))
|
189 |
|
190 |
# Still limit to final_top_n but don't filter by threshold here
|
191 |
final_results[product] = voyage_results[:final_top_n]
|
ui_ingredient_matching.py
CHANGED
@@ -29,7 +29,20 @@ def categorize_products(product_input, is_file=False, use_expansion=False, top_n
|
|
29 |
|
30 |
# Create embeddings
|
31 |
progress_tracker(0.4, desc="Generating product embeddings...")
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
if not products_embeddings:
|
35 |
return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"
|
@@ -69,4 +82,4 @@ def categorize_products(product_input, is_file=False, use_expansion=False, top_n
|
|
69 |
|
70 |
progress_tracker(1.0, desc="Done!")
|
71 |
return create_results_container(output_html)
|
72 |
-
|
|
|
29 |
|
30 |
# Create embeddings
|
31 |
progress_tracker(0.4, desc="Generating product embeddings...")
|
32 |
+
if use_expansion and expanded_descriptions:
|
33 |
+
# Use expanded descriptions for embedding creation when available
|
34 |
+
products_for_embedding = [expanded_descriptions.get(name, name) for name in product_names]
|
35 |
+
# Map expanded descriptions back to original product names for consistent keys
|
36 |
+
products_embeddings = {}
|
37 |
+
temp_embeddings = create_product_embeddings(products_for_embedding, progress=gr.Progress())
|
38 |
+
|
39 |
+
# Ensure we use original product names as keys
|
40 |
+
for i, product_name in enumerate(product_names):
|
41 |
+
if i < len(products_for_embedding) and products_for_embedding[i] in temp_embeddings:
|
42 |
+
products_embeddings[product_name] = temp_embeddings[products_for_embedding[i]]
|
43 |
+
else:
|
44 |
+
# Standard embedding creation with just product names
|
45 |
+
products_embeddings = create_product_embeddings(product_names, progress=gr.Progress())
|
46 |
|
47 |
if not products_embeddings:
|
48 |
return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"
|
|
|
82 |
|
83 |
progress_tracker(1.0, desc="Done!")
|
84 |
return create_results_container(output_html)
|
85 |
+
|