esilver committed on
Commit
a318724
·
1 Parent(s): a198898

refactored

Browse files
Files changed (10) hide show
  1. .DS_Store +0 -0
  2. app.py +0 -369
  3. embeddings.py +72 -0
  4. ingredient_embeddings_voyageai.pkl +0 -3
  5. main.py +39 -0
  6. run_app.sh +1 -1
  7. similarity.py +53 -0
  8. spaces.py +0 -198
  9. ui.py +266 -0
  10. utils.py +75 -0
.DS_Store ADDED
Binary file (8.2 kB). View file
 
app.py DELETED
@@ -1,369 +0,0 @@
1
- import gradio as gr
2
- import pickle
3
- import os
4
- import json
5
- import numpy as np
6
- import voyageai
7
- import time
8
- import sys
9
- from concurrent.futures import ThreadPoolExecutor
10
-
11
- # Set Voyage AI API key directly (using the free version key from your code)
12
- voyageai.api_key = "pa-DvIuCX_5TrCyxS6y74sUYpyWWGd4gN0Kf52y642y6k0"
13
-
14
- # Force unbuffered output
15
- os.environ['PYTHONUNBUFFERED'] = '1'
16
-
17
- # ===== Embedding Generation Functions =====
18
- def get_embeddings_batch(texts, model="voyage-3-large", batch_size=100):
19
- """Get embeddings for a list of texts in batches"""
20
- all_embeddings = []
21
- total_texts = len(texts)
22
-
23
- # Pre-process all texts to replace newlines
24
- texts = [text.replace("\n", " ") for text in texts]
25
-
26
- for i in range(0, len(texts), batch_size):
27
- batch = texts[i:i+batch_size]
28
-
29
- try:
30
- response = voyageai.Embedding.create(input=batch, model=model)
31
- batch_embeddings = [item['embedding'] for item in response['data']]
32
- all_embeddings.extend(batch_embeddings)
33
-
34
- # Sleep briefly to avoid rate limits
35
- if i + batch_size < len(texts):
36
- time.sleep(0.5)
37
-
38
- except Exception as e:
39
- print(f"Error in batch {i//batch_size + 1}: {e}")
40
- # Add empty embeddings for failed batch
41
- all_embeddings.extend([None] * len(batch))
42
-
43
- return all_embeddings
44
-
45
- def create_product_embeddings_voyageai(products, batch_size=100):
46
- """Create embeddings for products using batch processing with deduplication"""
47
- # De-duplication step
48
- unique_products = []
49
- product_to_index = {}
50
- index_map = {} # Maps original index to index in unique_products
51
-
52
- for i, product in enumerate(products):
53
- if product in product_to_index:
54
- # Product already seen, just store the mapping
55
- index_map[i] = product_to_index[product]
56
- else:
57
- # New unique product
58
- product_to_index[product] = len(unique_products)
59
- index_map[i] = len(unique_products)
60
- unique_products.append(product)
61
-
62
- print(f"Found {len(unique_products)} unique products out of {len(products)} total")
63
-
64
- if len(unique_products) == 0:
65
- return {}
66
-
67
- # Process only unique products
68
- print(f"Processing {len(unique_products)} unique products")
69
-
70
- # Get embeddings for unique products
71
- unique_embeddings = get_embeddings_batch(unique_products, batch_size=batch_size)
72
-
73
- # Map embeddings back to all products
74
- all_products_dict = {}
75
- for i, product in enumerate(products):
76
- unique_idx = index_map[i]
77
- if unique_idx < len(unique_embeddings) and unique_embeddings[unique_idx] is not None:
78
- all_products_dict[product] = unique_embeddings[unique_idx]
79
-
80
- print(f"Created embeddings for {len(all_products_dict)} products")
81
-
82
- return all_products_dict
83
-
84
- # ===== Similarity Computation Functions =====
85
- def compute_similarities(ingredients_dict, products_dict):
86
- """Compute similarities between all products and ingredients using NumPy"""
87
- # Filter valid ingredients (with non-None embeddings)
88
- ingredient_names = []
89
- ingredient_embeddings_list = []
90
- for ing, emb in ingredients_dict.items():
91
- if emb is not None:
92
- ingredient_names.append(ing)
93
- ingredient_embeddings_list.append(emb)
94
-
95
- # Convert ingredient embeddings to numpy array
96
- ingredient_embeddings = np.array(ingredient_embeddings_list, dtype=np.float32)
97
-
98
- # Normalize ingredient embeddings for cosine similarity
99
- ingredient_norms = np.linalg.norm(ingredient_embeddings, axis=1, keepdims=True)
100
- normalized_ingredients = ingredient_embeddings / ingredient_norms
101
-
102
- # Process all products
103
- all_similarities = {}
104
- valid_products = []
105
- valid_embeddings = []
106
-
107
- for product, embedding in products_dict.items():
108
- if embedding is not None:
109
- valid_products.append(product)
110
- valid_embeddings.append(embedding)
111
-
112
- if not valid_products:
113
- return {}
114
-
115
- # Convert product embeddings to numpy array
116
- product_embeddings = np.array(valid_embeddings, dtype=np.float32)
117
-
118
- # Normalize product embeddings
119
- product_norms = np.linalg.norm(product_embeddings, axis=1, keepdims=True)
120
- normalized_products = product_embeddings / product_norms
121
-
122
- # Compute all similarities at once using matrix multiplication
123
- # (dot product of normalized vectors = cosine similarity)
124
- similarity_matrix = np.dot(normalized_products, normalized_ingredients.T)
125
-
126
- # Process and store results
127
- for p_idx, product in enumerate(valid_products):
128
- product_similarities = [(ingredient_names[i_idx], float(similarity_matrix[p_idx, i_idx]))
129
- for i_idx in range(len(ingredient_names))]
130
-
131
- # Sort by similarity score (descending)
132
- product_similarities.sort(key=lambda x: x[1], reverse=True)
133
- all_similarities[product] = product_similarities
134
-
135
- return all_similarities
136
-
137
- # ===== Main Application Functions =====
138
- def load_embeddings(embeddings_path):
139
- """Load ingredient embeddings from pickle file"""
140
- print(f"Loading ingredient embeddings from {embeddings_path}")
141
- with open(embeddings_path, "rb") as f:
142
- ingredients_embeddings = pickle.load(f)
143
- print(f"Loaded {len(ingredients_embeddings)} ingredient embeddings")
144
- return ingredients_embeddings
145
-
146
- # Define a safe progress tracker that handles None
147
- class SafeProgress:
148
- def __init__(self, progress_obj=None):
149
- self.progress = progress_obj
150
-
151
- def __call__(self, value, desc=""):
152
- if self.progress is not None:
153
- try:
154
- self.progress(value, desc=desc)
155
- except:
156
- print(f"Progress {value}: {desc}")
157
- else:
158
- print(f"Progress {value}: {desc}")
159
-
160
- def categorize_products_from_text(product_text, top_n=5, confidence_threshold=0.5, progress=None):
161
- """Categorize products from text input (one product per line)"""
162
- # Create a safe progress tracker
163
- progress_tracker = SafeProgress(progress)
164
- progress_tracker(0, desc="Starting...")
165
-
166
- # Parse input text to get product names
167
- product_names = [line.strip() for line in product_text.split("\n") if line.strip()]
168
-
169
- if not product_names:
170
- return "No product names provided."
171
-
172
- # Create product embeddings
173
- progress_tracker(0.1, desc="Generating product embeddings...")
174
- products_embeddings = create_product_embeddings_voyageai(product_names)
175
-
176
- # Compute similarities
177
- progress_tracker(0.6, desc="Computing similarities...")
178
- all_similarities = compute_similarities(embeddings, products_embeddings)
179
-
180
- # Format results
181
- progress_tracker(0.9, desc="Formatting results...")
182
- results = {}
183
- for product, similarities in all_similarities.items():
184
- # Filter by confidence threshold and take top N
185
- filtered_similarities = [(ingredient, score) for ingredient, score in similarities
186
- if score >= confidence_threshold]
187
- top_similarities = filtered_similarities[:top_n]
188
-
189
- results[product] = top_similarities
190
-
191
- # Format as readable text
192
- output_text = ""
193
- for product, categories in results.items():
194
- output_text += f"Product: {product}\n"
195
- if categories:
196
- for i, (category, score) in enumerate(categories, 1):
197
- output_text += f" {i}. {category} (confidence: {score:.3f})\n"
198
- else:
199
- output_text += " No matching categories found.\n"
200
- output_text += "\n"
201
-
202
- progress_tracker(1.0, desc="Done!")
203
- return output_text
204
-
205
- def categorize_products_from_file(file, top_n=5, confidence_threshold=0.5, progress=None):
206
- """Categorize products from a JSON file"""
207
- # Create a safe progress tracker
208
- progress_tracker = SafeProgress(progress)
209
- progress_tracker(0.1, desc="Reading file...")
210
-
211
- try:
212
- with open(file.name, 'r') as f:
213
- try:
214
- products_data = json.load(f)
215
- if isinstance(products_data, list):
216
- # Extract product names if it's a list of objects with 'name' field
217
- if all(isinstance(item, dict) for item in products_data):
218
- product_names = [item.get('name', '') for item in products_data if isinstance(item, dict)]
219
- else:
220
- # If it's just a list of strings
221
- product_names = [str(item) for item in products_data if item]
222
- else:
223
- # If it's just a list of product names
224
- product_names = []
225
- except json.JSONDecodeError:
226
- # If not JSON, try reading as text file with one product per line
227
- f.seek(0)
228
- product_names = [line.strip() for line in f.readlines() if line.strip()]
229
- except Exception as e:
230
- return f"Error reading file: {str(e)}"
231
-
232
- if not product_names:
233
- return "No product names found in the file."
234
-
235
- # Create product embeddings
236
- progress_tracker(0.2, desc="Generating product embeddings...")
237
- products_embeddings = create_product_embeddings_voyageai(product_names)
238
-
239
- # Compute similarities
240
- progress_tracker(0.7, desc="Computing similarities...")
241
- all_similarities = compute_similarities(embeddings, products_embeddings)
242
-
243
- # Format results
244
- progress_tracker(0.9, desc="Formatting results...")
245
- output_text = f"Found {len(product_names)} products in file.\n\n"
246
-
247
- for product, similarities in all_similarities.items():
248
- # Filter by confidence threshold and take top N
249
- filtered_similarities = [(ingredient, score) for ingredient, score in similarities
250
- if score >= confidence_threshold]
251
- top_similarities = filtered_similarities[:top_n]
252
-
253
- output_text += f"Product: {product}\n"
254
- if top_similarities:
255
- for i, (category, score) in enumerate(top_similarities, 1):
256
- output_text += f" {i}. {category} (confidence: {score:.3f})\n"
257
- else:
258
- output_text += " No matching categories found.\n"
259
- output_text += "\n"
260
-
261
- progress_tracker(1.0, desc="Done!")
262
- return output_text
263
-
264
- # Load embeddings at the module level for easier access
265
- try:
266
- embeddings_path = "ingredient_embeddings_voyageai.pkl"
267
- embeddings = load_embeddings(embeddings_path)
268
- except Exception as e:
269
- print(f"Warning: Could not load embeddings at startup: {e}")
270
- print("Will attempt to load them when the app runs")
271
- embeddings = {}
272
-
273
- # ===== Gradio Interface Setup =====
274
- def create_interface(embeddings_path="ingredient_embeddings_voyageai.pkl"):
275
- # Ensure embeddings are loaded
276
- global embeddings
277
- if not embeddings:
278
- try:
279
- embeddings = load_embeddings(embeddings_path)
280
- except Exception as e:
281
- print(f"Error loading embeddings: {e}")
282
- gr.Error(f"Failed to load embeddings file: {e}")
283
-
284
- # Text input interface
285
- with gr.Blocks() as demo:
286
- gr.Markdown("# Product Categorization Tool")
287
- gr.Markdown("This tool uses AI to categorize products based on their similarity to known ingredients.")
288
-
289
- with gr.Tabs():
290
- with gr.TabItem("Text Input"):
291
- with gr.Row():
292
- with gr.Column():
293
- text_input = gr.Textbox(
294
- lines=10,
295
- placeholder="Enter product names, one per line",
296
- label="Product Names"
297
- )
298
- top_n = gr.Slider(
299
- minimum=1,
300
- maximum=10,
301
- value=5,
302
- step=1,
303
- label="Number of Top Categories"
304
- )
305
- confidence = gr.Slider(
306
- minimum=0.1,
307
- maximum=0.9,
308
- value=0.5,
309
- step=0.05,
310
- label="Confidence Threshold"
311
- )
312
- submit_button = gr.Button("Categorize Products")
313
-
314
- with gr.Column():
315
- text_output = gr.Textbox(label="Categorization Results", lines=20)
316
-
317
- submit_button.click(
318
- fn=categorize_products_from_text,
319
- inputs=[text_input, top_n, confidence],
320
- outputs=text_output
321
- )
322
-
323
- with gr.TabItem("File Upload"):
324
- with gr.Row():
325
- with gr.Column():
326
- file_input = gr.File(label="Upload JSON file with products")
327
- file_top_n = gr.Slider(
328
- minimum=1,
329
- maximum=10,
330
- value=5,
331
- step=1,
332
- label="Number of Top Categories"
333
- )
334
- file_confidence = gr.Slider(
335
- minimum=0.1,
336
- maximum=0.9,
337
- value=0.5,
338
- step=0.05,
339
- label="Confidence Threshold"
340
- )
341
- file_button = gr.Button("Process File")
342
-
343
- with gr.Column():
344
- file_output = gr.Textbox(label="Categorization Results", lines=20)
345
-
346
- file_button.click(
347
- fn=categorize_products_from_file,
348
- inputs=[file_input, file_top_n, file_confidence],
349
- outputs=file_output
350
- )
351
-
352
- gr.Markdown("### Example Input")
353
- gr.Markdown("Try entering product names like:\n- Tomato Sauce\n- Apple Pie\n- Greek Yogurt\n- Chocolate Chip Cookies")
354
-
355
- return demo
356
-
357
- if __name__ == "__main__":
358
- import argparse
359
-
360
- parser = argparse.ArgumentParser(description='Run the Product Categorization web app')
361
- parser.add_argument('--embeddings', default='ingredient_embeddings_voyageai.pkl',
362
- help='Path to the ingredient embeddings pickle file')
363
- parser.add_argument('--share', action='store_true', help='Create a public link for sharing')
364
-
365
- args = parser.parse_args()
366
-
367
- # Create and launch the interface
368
- demo = create_interface(args.embeddings)
369
- demo.launch(share=args.share)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
embeddings.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import voyageai
2
+ import time
3
+ import numpy as np
4
+
5
+ # Set Voyage AI API key directly
6
+ voyageai.api_key = "pa-DvIuCX_5TrCyxS6y74sUYpyWWGd4gN0Kf52y642y6k0"
7
+
8
+ def get_embeddings_batch(texts, model="voyage-3-large", batch_size=100):
9
+ """Get embeddings for a list of texts in batches"""
10
+ all_embeddings = []
11
+ total_texts = len(texts)
12
+
13
+ # Pre-process all texts to replace newlines
14
+ texts = [text.replace("\n", " ") for text in texts]
15
+
16
+ for i in range(0, len(texts), batch_size):
17
+ batch = texts[i:i+batch_size]
18
+
19
+ try:
20
+ response = voyageai.Embedding.create(input=batch, model=model)
21
+ batch_embeddings = [item['embedding'] for item in response['data']]
22
+ all_embeddings.extend(batch_embeddings)
23
+
24
+ # Sleep briefly to avoid rate limits
25
+ if i + batch_size < len(texts):
26
+ time.sleep(0.5)
27
+
28
+ except Exception as e:
29
+ print(f"Error in batch {i//batch_size + 1}: {e}")
30
+ # Add empty embeddings for failed batch
31
+ all_embeddings.extend([None] * len(batch))
32
+
33
+ return all_embeddings
34
+
35
+ def create_product_embeddings(products, batch_size=100):
36
+ """Create embeddings for products using batch processing with deduplication"""
37
+ # De-duplication step
38
+ unique_products = []
39
+ product_to_index = {}
40
+ index_map = {} # Maps original index to index in unique_products
41
+
42
+ for i, product in enumerate(products):
43
+ if product in product_to_index:
44
+ # Product already seen, just store the mapping
45
+ index_map[i] = product_to_index[product]
46
+ else:
47
+ # New unique product
48
+ product_to_index[product] = len(unique_products)
49
+ index_map[i] = len(unique_products)
50
+ unique_products.append(product)
51
+
52
+ print(f"Found {len(unique_products)} unique products out of {len(products)} total")
53
+
54
+ if len(unique_products) == 0:
55
+ return {}
56
+
57
+ # Process only unique products
58
+ print(f"Processing {len(unique_products)} unique products")
59
+
60
+ # Get embeddings for unique products
61
+ unique_embeddings = get_embeddings_batch(unique_products, batch_size=batch_size)
62
+
63
+ # Map embeddings back to all products
64
+ all_products_dict = {}
65
+ for i, product in enumerate(products):
66
+ unique_idx = index_map[i]
67
+ if unique_idx < len(unique_embeddings) and unique_embeddings[unique_idx] is not None:
68
+ all_products_dict[product] = unique_embeddings[unique_idx]
69
+
70
+ print(f"Created embeddings for {len(all_products_dict)} products")
71
+
72
+ return all_products_dict
ingredient_embeddings_voyageai.pkl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:de6791a4909432600b90a5523e8a105f047887d4ac59d63460d8a2f9d788d0c9
3
- size 27301581
 
 
 
 
main.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import sys
4
+ import gradio as gr
5
+ from utils import load_embeddings
6
+ from ui import create_demo, embeddings
7
+
8
+ def main():
9
+ """Main entry point for the application"""
10
+ parser = argparse.ArgumentParser(description='Run the Product Categorization web app')
11
+ parser.add_argument('--embeddings', default='ingredient_embeddings_voyageai.pkl',
12
+ help='Path to the ingredient embeddings pickle file')
13
+ parser.add_argument('--share', action='store_true', help='Create a public link for sharing')
14
+
15
+ args = parser.parse_args()
16
+
17
+ # Check if embeddings file exists
18
+ if not os.path.exists(args.embeddings):
19
+ print(f"Error: Embeddings file {args.embeddings} not found!")
20
+ print(f"Please ensure the file exists at {os.path.abspath(args.embeddings)}")
21
+ sys.exit(1)
22
+
23
+ # Load embeddings
24
+ try:
25
+ global embeddings
26
+ embeddings_data = load_embeddings(args.embeddings)
27
+ # Update the embeddings in the ui module
28
+ import ui
29
+ ui.embeddings = embeddings_data
30
+ except Exception as e:
31
+ print(f"Error loading embeddings: {e}")
32
+ sys.exit(1)
33
+
34
+ # Create and launch the interface
35
+ demo = create_demo()
36
+ demo.launch(share=args.share)
37
+
38
+ if __name__ == "__main__":
39
+ main()
run_app.sh CHANGED
@@ -6,7 +6,7 @@ pip install -r requirements.txt
6
  # Check if embeddings file exists
7
  if [ -f "ingredient_embeddings_voyageai.pkl" ]; then
8
  # Run with local embeddings file
9
- python app.py --share
10
  else
11
  echo "ERROR: ingredient_embeddings_voyageai.pkl file not found!"
12
  echo "Please place the embeddings file in the same directory as this script."
 
6
  # Check if embeddings file exists
7
  if [ -f "ingredient_embeddings_voyageai.pkl" ]; then
8
  # Run with local embeddings file
9
+ python main.py --share
10
  else
11
  echo "ERROR: ingredient_embeddings_voyageai.pkl file not found!"
12
  echo "Please place the embeddings file in the same directory as this script."
similarity.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+ def compute_similarities(ingredients_dict, products_dict):
4
+ """Compute similarities between all products and ingredients using NumPy"""
5
+ # Filter valid ingredients (with non-None embeddings)
6
+ ingredient_names = []
7
+ ingredient_embeddings_list = []
8
+ for ing, emb in ingredients_dict.items():
9
+ if emb is not None:
10
+ ingredient_names.append(ing)
11
+ ingredient_embeddings_list.append(emb)
12
+
13
+ # Convert ingredient embeddings to numpy array
14
+ ingredient_embeddings = np.array(ingredient_embeddings_list, dtype=np.float32)
15
+
16
+ # Normalize ingredient embeddings for cosine similarity
17
+ ingredient_norms = np.linalg.norm(ingredient_embeddings, axis=1, keepdims=True)
18
+ normalized_ingredients = ingredient_embeddings / ingredient_norms
19
+
20
+ # Process all products
21
+ all_similarities = {}
22
+ valid_products = []
23
+ valid_embeddings = []
24
+
25
+ for product, embedding in products_dict.items():
26
+ if embedding is not None:
27
+ valid_products.append(product)
28
+ valid_embeddings.append(embedding)
29
+
30
+ if not valid_products:
31
+ return {}
32
+
33
+ # Convert product embeddings to numpy array
34
+ product_embeddings = np.array(valid_embeddings, dtype=np.float32)
35
+
36
+ # Normalize product embeddings
37
+ product_norms = np.linalg.norm(product_embeddings, axis=1, keepdims=True)
38
+ normalized_products = product_embeddings / product_norms
39
+
40
+ # Compute all similarities at once using matrix multiplication
41
+ # (dot product of normalized vectors = cosine similarity)
42
+ similarity_matrix = np.dot(normalized_products, normalized_ingredients.T)
43
+
44
+ # Process and store results
45
+ for p_idx, product in enumerate(valid_products):
46
+ product_similarities = [(ingredient_names[i_idx], float(similarity_matrix[p_idx, i_idx]))
47
+ for i_idx in range(len(ingredient_names))]
48
+
49
+ # Sort by similarity score (descending)
50
+ product_similarities.sort(key=lambda x: x[1], reverse=True)
51
+ all_similarities[product] = product_similarities
52
+
53
+ return all_similarities
spaces.py DELETED
@@ -1,198 +0,0 @@
1
- import gradio as gr
2
- import pickle
3
- import os
4
- import json
5
- import numpy as np
6
- import voyageai
7
- import time
8
- import sys
9
-
10
- # Set Voyage AI API key directly
11
- voyageai.api_key = "pa-DvIuCX_5TrCyxS6y74sUYpyWWGd4gN0Kf52y642y6k0"
12
-
13
- # Import all necessary functions from the main app
14
- from app import create_product_embeddings_voyageai, get_embeddings_batch, compute_similarities
15
-
16
- # Path to the embeddings file for Hugging Face Spaces
17
- EMBEDDINGS_PATH = "ingredient_embeddings_voyageai.pkl"
18
-
19
- # Load the embeddings
20
- print(f"Loading ingredient embeddings from {EMBEDDINGS_PATH}")
21
- try:
22
- with open(EMBEDDINGS_PATH, "rb") as f:
23
- embeddings = pickle.load(f)
24
- print(f"Successfully loaded {len(embeddings)} ingredient embeddings")
25
- except Exception as e:
26
- print(f"ERROR: Failed to load embeddings: {e}")
27
- # Create an empty dict as fallback
28
- embeddings = {}
29
-
30
- # Define the categorization function for text input
31
- def categorize_products_from_text(product_text, progress=gr.Progress(), top_n=5, confidence_threshold=0.5):
32
- """Categorize products from text input (one product per line)"""
33
- # Parse input text to get product names
34
- product_names = [line.strip() for line in product_text.split("\n") if line.strip()]
35
-
36
- if not product_names:
37
- return "No product names provided."
38
-
39
- progress(0.1, desc="Generating product embeddings...")
40
-
41
- # Create product embeddings
42
- products_embeddings = create_product_embeddings_voyageai(product_names)
43
-
44
- # Compute similarities
45
- progress(0.6, desc="Computing similarities...")
46
- all_similarities = compute_similarities(embeddings, products_embeddings)
47
-
48
- # Format results
49
- progress(0.9, desc="Formatting results...")
50
- output_text = ""
51
- for product, similarities in all_similarities.items():
52
- # Filter by confidence threshold and take top N
53
- filtered_similarities = [(ingredient, score) for ingredient, score in similarities
54
- if score >= confidence_threshold]
55
- top_similarities = filtered_similarities[:top_n]
56
-
57
- output_text += f"Product: {product}\n"
58
- if top_similarities:
59
- for i, (category, score) in enumerate(top_similarities, 1):
60
- output_text += f" {i}. {category} (confidence: {score:.3f})\n"
61
- else:
62
- output_text += " No matching categories found.\n"
63
- output_text += "\n"
64
-
65
- progress(1.0, desc="Done!")
66
- return output_text
67
-
68
- # Define the categorization function for file input
69
- def categorize_products_from_file(file, progress=gr.Progress(), top_n=5, confidence_threshold=0.5):
70
- """Categorize products from a JSON file"""
71
- progress(0.1, desc="Reading file...")
72
-
73
- try:
74
- with open(file.name, 'r') as f:
75
- try:
76
- products_data = json.load(f)
77
- if isinstance(products_data, list):
78
- # Extract product names if it's a list of objects with 'name' field
79
- if all(isinstance(item, dict) for item in products_data):
80
- product_names = [item.get('name', '') for item in products_data if isinstance(item, dict)]
81
- else:
82
- # If it's just a list of strings
83
- product_names = [str(item) for item in products_data if item]
84
- else:
85
- # If it's just a list of product names
86
- product_names = []
87
- except json.JSONDecodeError:
88
- # If not JSON, try reading as text file with one product per line
89
- f.seek(0)
90
- product_names = [line.strip() for line in f.readlines() if line.strip()]
91
- except Exception as e:
92
- return f"Error reading file: {str(e)}"
93
-
94
- if not product_names:
95
- return "No product names found in the file."
96
-
97
- # Create product embeddings
98
- progress(0.2, desc="Generating product embeddings...")
99
- products_embeddings = create_product_embeddings_voyageai(product_names)
100
-
101
- # Compute similarities
102
- progress(0.7, desc="Computing similarities...")
103
- all_similarities = compute_similarities(embeddings, products_embeddings)
104
-
105
- # Format results
106
- progress(0.9, desc="Formatting results...")
107
- output_text = f"Found {len(product_names)} products in file.\n\n"
108
-
109
- for product, similarities in all_similarities.items():
110
- # Filter by confidence threshold and take top N
111
- filtered_similarities = [(ingredient, score) for ingredient, score in similarities
112
- if score >= confidence_threshold]
113
- top_similarities = filtered_similarities[:top_n]
114
-
115
- output_text += f"Product: {product}\n"
116
- if top_similarities:
117
- for i, (category, score) in enumerate(top_similarities, 1):
118
- output_text += f" {i}. {category} (confidence: {score:.3f})\n"
119
- else:
120
- output_text += " No matching categories found.\n"
121
- output_text += "\n"
122
-
123
- progress(1.0, desc="Done!")
124
- return output_text
125
-
126
- # Create the Gradio interface
127
- with gr.Blocks() as demo:
128
- gr.Markdown("# Product Categorization Tool")
129
- gr.Markdown("This tool uses AI to categorize products based on their similarity to known ingredients.")
130
-
131
- with gr.Tabs():
132
- with gr.TabItem("Text Input"):
133
- with gr.Row():
134
- with gr.Column():
135
- text_input = gr.Textbox(
136
- lines=10,
137
- placeholder="Enter product names, one per line",
138
- label="Product Names"
139
- )
140
- top_n = gr.Slider(
141
- minimum=1,
142
- maximum=10,
143
- value=5,
144
- step=1,
145
- label="Number of Top Categories"
146
- )
147
- confidence = gr.Slider(
148
- minimum=0.1,
149
- maximum=0.9,
150
- value=0.5,
151
- step=0.05,
152
- label="Confidence Threshold"
153
- )
154
- submit_button = gr.Button("Categorize Products")
155
-
156
- with gr.Column():
157
- text_output = gr.Textbox(label="Categorization Results", lines=20)
158
-
159
- submit_button.click(
160
- fn=categorize_products_from_text,
161
- inputs=[text_input, top_n, confidence],
162
- outputs=text_output
163
- )
164
-
165
- with gr.TabItem("File Upload"):
166
- with gr.Row():
167
- with gr.Column():
168
- file_input = gr.File(label="Upload JSON file with products")
169
- file_top_n = gr.Slider(
170
- minimum=1,
171
- maximum=10,
172
- value=5,
173
- step=1,
174
- label="Number of Top Categories"
175
- )
176
- file_confidence = gr.Slider(
177
- minimum=0.1,
178
- maximum=0.9,
179
- value=0.5,
180
- step=0.05,
181
- label="Confidence Threshold"
182
- )
183
- file_button = gr.Button("Process File")
184
-
185
- with gr.Column():
186
- file_output = gr.Textbox(label="Categorization Results", lines=20)
187
-
188
- file_button.click(
189
- fn=categorize_products_from_file,
190
- inputs=[file_input, file_top_n, file_confidence],
191
- outputs=file_output
192
- )
193
-
194
- gr.Markdown("### Example Input")
195
- gr.Markdown("Try entering product names like:\n- Tomato Sauce\n- Apple Pie\n- Greek Yogurt\n- Chocolate Chip Cookies")
196
-
197
- # Launch the demo (for Hugging Face Spaces)
198
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ui.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from utils import SafeProgress, format_categories_html
3
+ from embeddings import create_product_embeddings
4
+ from similarity import compute_similarities
5
+
6
+ # Global variable for embeddings
7
+ embeddings = {}
8
+
9
def categorize_products_from_text(product_text, top_n=5, confidence_threshold=0.5, progress=None):
    """Match each product typed into the text box against the ingredient embeddings.

    Args:
        product_text: Raw text with one product name per line; blank lines
            are ignored.
        top_n: Maximum number of matches reported per product.
        confidence_threshold: Matches scoring below this value are dropped.
        progress: Optional progress callback; it is wrapped in SafeProgress,
            so ``None`` is acceptable.

    Returns:
        An HTML string with one section per product, the plain message
        "No product names provided." when the input holds no names, or an
        error ``<div>`` when the similarity step produced nothing.
    """
    report = SafeProgress(progress)
    report(0, desc="Starting...")

    # One product per non-blank line, surrounding whitespace removed.
    product_names = []
    for raw_line in product_text.split("\n"):
        cleaned = raw_line.strip()
        if cleaned:
            product_names.append(cleaned)

    if not product_names:
        return "No product names provided."

    report(0.1, desc="Generating product embeddings...")
    products_embeddings = create_product_embeddings(product_names)

    # `embeddings` is the module-level ingredient table
    # (presumably filled in by the application entry point -- verify against caller).
    report(0.6, desc="Computing similarities...")
    all_similarities = compute_similarities(embeddings, products_embeddings)

    report(0.9, desc="Formatting results...")
    divider = "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"
    pieces = ["<div style='font-family: Arial, sans-serif;'>"]

    for product, similarities in all_similarities.items():
        # Keep only confident matches, then cap the list at top_n entries
        # (assumes the scores arrive ordered best-first -- TODO confirm).
        confident = []
        for ingredient, score in similarities:
            if score >= confidence_threshold:
                confident.append((ingredient, score))
        pieces.append(format_categories_html(product, confident[:top_n]))
        pieces.append(divider)

    pieces.append("</div>")
    output_html = "".join(pieces)

    # Nothing came back from the similarity step: show an error panel instead.
    if not all_similarities:
        output_html = "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"

    report(1.0, desc="Done!")
    return output_html
49
+
50
def categorize_products_from_file(file, top_n=5, confidence_threshold=0.5, progress=None):
    """Categorize the products listed in an uploaded JSON or text file.

    Args:
        file: The upload received from the file input. Either an object that
            exposes its path as ``.name`` or a plain path string is accepted
            (which of the two arrives presumably depends on the Gradio
            version -- confirm). ``None`` means nothing was uploaded.
        top_n: Maximum number of matches reported per product.
        confidence_threshold: Matches scoring below this value are dropped.
        progress: Optional progress callback; it is wrapped in SafeProgress,
            so ``None`` is acceptable.

    Returns:
        An HTML string: a summary banner followed by one section per product,
        or a single error ``<div>`` when no file was given, the file could not
        be parsed, or it contained no product names.
    """
    # Guard first: pressing the button without an upload passes None, which
    # previously surfaced as "'NoneType' object has no attribute 'name'".
    if file is None:
        return "<div style='color: #d32f2f;'>No file uploaded. Please upload a JSON or text file.</div>"

    # Kept as a function-scope import, as in the original.
    from utils import parse_product_file

    progress_tracker = SafeProgress(progress)
    progress_tracker(0.1, desc="Reading file...")

    # Fall back to the object itself when it has no .name attribute,
    # so a plain path string works as well.
    file_path = getattr(file, "name", file)

    try:
        product_names = parse_product_file(file_path)
    except Exception as e:
        return f"<div style='color: #d32f2f; font-weight: bold;'>Error: {str(e)}</div>"

    if not product_names:
        return "<div style='color: #d32f2f;'>No product names found in the file.</div>"

    progress_tracker(0.2, desc="Generating product embeddings...")
    products_embeddings = create_product_embeddings(product_names)

    # `embeddings` is the module-level ingredient table.
    progress_tracker(0.7, desc="Computing similarities...")
    all_similarities = compute_similarities(embeddings, products_embeddings)

    progress_tracker(0.9, desc="Formatting results...")
    # Collect fragments and join once instead of repeated string concatenation.
    sections = [
        "<div style='font-family: Arial, sans-serif;'>",
        "<div style='margin-bottom: 20px; padding: 10px; background-color: #e8f5e9; border-radius: 5px;'>",
        f"Found <b>{len(product_names)}</b> products in file. Showing results with confidence ≥ {confidence_threshold}.",
        "</div>",
    ]

    for product, similarities in all_similarities.items():
        # Filter by confidence threshold, then take the first top_n entries.
        filtered_similarities = [
            (ingredient, score)
            for ingredient, score in similarities
            if score >= confidence_threshold
        ]
        sections.append(format_categories_html(product, filtered_similarities[:top_n]))
        sections.append("<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>")

    sections.append("</div>")

    progress_tracker(1.0, desc="Done!")
    return "".join(sections)
94
+
95
def create_demo():
    """Assemble the Gradio Blocks interface and return it without launching.

    The page consists of a header, a "Text Input" tab wired to
    ``categorize_products_from_text``, a "File Upload" tab wired to
    ``categorize_products_from_file``, and a footer.

    NOTE(review): the nesting of the layout containers was reconstructed from
    a listing that had lost its indentation -- confirm against the rendered UI.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    # Page-level stylesheet: container width, header banner, description box.
    css = """
    .container {
        max-width: 1200px;
        margin: auto;
        padding: 0;
    }
    footer {display: none !important;}
    .header {
        background-color: #0d47a1;
        padding: 15px 20px;
        border-radius: 10px;
        color: white;
        margin-bottom: 20px;
        display: flex;
        align-items: center;
    }
    .header svg {
        margin-right: 10px;
        height: 30px;
        width: 30px;
    }
    .header h1 {
        margin: 0;
        font-size: 24px;
    }
    .description {
        margin-bottom: 20px;
        padding: 15px;
        background-color: #f5f5f5;
        border-radius: 5px;
    }
    """

    # Soft theme in blue/indigo with a few button and label overrides.
    base_theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
        font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"]
    )
    theme = base_theme.set(
        button_primary_background_fill="*primary_500",
        button_primary_background_fill_hover="*primary_600",
        button_secondary_background_fill="*neutral_200",
        block_title_text_size="lg",
        block_label_text_size="md"
    )

    def _build_sliders():
        """Create the side-by-side "top N" and "confidence" sliders used by both tabs."""
        with gr.Row():
            with gr.Column(scale=1):
                count_slider = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=5,
                    step=1,
                    label="Number of Top Categories"
                )
            with gr.Column(scale=1):
                threshold_slider = gr.Slider(
                    minimum=0.1,
                    maximum=0.9,
                    value=0.5,
                    step=0.05,
                    label="Confidence Threshold"
                )
        return count_slider, threshold_slider

    with gr.Blocks(css=css, theme=theme) as demo:
        # Header banner with icon, followed by a short description.
        gr.HTML("""
        <div class="header">
            <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="white">
                <path d="M12 2L2 7l10 5 10-5-10-5zM2 17l10 5 10-5M2 12l10 5 10-5"></path>
            </svg>
            <h1>Product Categorization Tool</h1>
        </div>
        <div class="description">
            This tool analyzes products and finds the most similar ingredients using AI embeddings.
            Just enter product names or upload a file to get started.
        </div>
        """)

        with gr.Tabs():
            # ---- Tab 1: product names typed or pasted as text ----
            with gr.TabItem("Text Input"):
                with gr.Row():
                    with gr.Column(scale=2):
                        example_products = [
                            "Tomato Sauce\nApple Pie\nGreek Yogurt\nChocolate Chip Cookies",
                            "Banana Bread\nOrange Juice\nGrilled Chicken\nCaesar Salad",
                            "Vanilla Ice Cream\nPizza Dough\nStrawberry Jam\nGrilled Salmon"
                        ]

                        names_box = gr.Textbox(
                            lines=10,
                            placeholder="Enter product names, one per line",
                            label="Product Names"
                        )

                        gr.Examples(
                            examples=example_products,
                            inputs=names_box,
                            label="Example Product Sets"
                        )

                        text_top_n, text_confidence = _build_sliders()

                        run_text_button = gr.Button("Categorize Products", variant="primary")

                    with gr.Column(scale=3):
                        text_results = gr.HTML(
                            label="Categorization Results",
                            value="<div style='height: 450px; display: flex; justify-content: center; align-items: center; color: #666;'>Results will appear here</div>"
                        )

                run_text_button.click(
                    fn=categorize_products_from_text,
                    inputs=[names_box, text_top_n, text_confidence],
                    outputs=text_results
                )

            # ---- Tab 2: product names read from an uploaded file ----
            with gr.TabItem("File Upload"):
                with gr.Row():
                    with gr.Column(scale=2):
                        upload_box = gr.File(
                            label="Upload JSON or text file with products",
                            file_types=[".json", ".txt"]
                        )

                        with gr.Accordion("Help", open=False):
                            gr.Markdown("""
                            - JSON files should contain either:
                              - A list of objects with a 'name' field for each product
                              - A simple array of product name strings
                            - Text files should have one product name per line
                            """)

                        file_top_n, file_confidence = _build_sliders()

                        run_file_button = gr.Button("Process File", variant="primary")

                    with gr.Column(scale=3):
                        file_results = gr.HTML(
                            label="Categorization Results",
                            value="<div style='height: 450px; display: flex; justify-content: center; align-items: center; color: #666;'>Upload a file to see results</div>"
                        )

                run_file_button.click(
                    fn=categorize_products_from_file,
                    inputs=[upload_box, file_top_n, file_confidence],
                    outputs=file_results
                )

        # Footer credit line.
        gr.HTML("""
        <div style="margin-top: 20px; text-align: center; color: #666;">
            Powered by Voyage AI embeddings • Built with Gradio
        </div>
        """)

    return demo
utils.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import json
3
+ import os
4
+
5
class SafeProgress:
    """Wrapper for progress tracking that handles None gracefully.

    Calling an instance forwards ``(value, desc=...)`` to the wrapped
    callback. When no callback was supplied, or the callback raises an
    ordinary exception, the update is printed to stdout instead, so progress
    reporting never breaks the caller.
    """

    def __init__(self, progress_obj=None):
        """Store the optional progress callback (``None`` means print only)."""
        self.progress = progress_obj

    def __call__(self, value, desc=""):
        """Report progress ``value`` with description ``desc``.

        Args:
            value: Progress value passed through to the callback unchanged.
            desc: Short description of the current step.
        """
        if self.progress is None:
            print(f"Progress {value}: {desc}")
            return

        try:
            self.progress(value, desc=desc)
        except Exception:
            # Deliberately best-effort: a failing progress callback must not
            # abort the work. Catching Exception rather than using a bare
            # except lets KeyboardInterrupt and SystemExit propagate.
            print(f"Progress {value}: {desc}")
18
+
19
def load_embeddings(embeddings_path):
    """Read the pickled ingredient embeddings stored at ``embeddings_path``.

    Prints a message before loading and another with the number of entries
    afterwards.

    Args:
        embeddings_path: Path of the pickle file to read.

    Returns:
        The unpickled object, exactly as stored in the file.
    """
    print(f"Loading ingredient embeddings from {embeddings_path}")

    with open(embeddings_path, "rb") as pickle_stream:
        loaded = pickle.load(pickle_stream)

    entry_count = len(loaded)
    print(f"Loaded {entry_count} ingredient embeddings")
    return loaded
26
+
27
def _product_name_from_item(item):
    """Return the cleaned product name held by one JSON list entry, or "" if it has none."""
    raw_name = item.get("name", "") if isinstance(item, dict) else item
    if not raw_name:
        return ""
    return str(raw_name).strip()


def parse_product_file(file_path):
    """Parse a file containing product data and extract product names.

    The file is first parsed as JSON. A JSON list may hold objects with a
    ``'name'`` field, plain values, or a mix of both; entries without a usable
    name are skipped. Any other JSON value yields an empty list. If the
    content is not valid JSON, the file is read as text with one product name
    per line and blank lines are ignored.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of non-empty product name strings.

    Raises:
        Exception: With the message ``"Error reading file: ..."`` when the
            file cannot be opened or decoded; the original error is chained
            as the cause.
    """
    try:
        # utf-8-sig reads UTF-8 with or without a byte-order mark, so the
        # result no longer depends on the platform's default encoding.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            try:
                products_data = json.load(f)
            except json.JSONDecodeError:
                # Not JSON: rewind and treat it as one product per line.
                f.seek(0)
                return [line.strip() for line in f if line.strip()]
    except Exception as e:
        raise Exception(f"Error reading file: {str(e)}") from e

    # Only a JSON list is understood; objects or scalars carry no names.
    if not isinstance(products_data, list):
        return []

    candidate_names = (_product_name_from_item(item) for item in products_data)
    return [name for name in candidate_names if name]
51
+
52
def format_categories_html(product, categories):
    """Format categories as HTML with color-coded confidence scores.

    The product name and category names are HTML-escaped before being
    inserted, so text containing markup is shown literally.

    Args:
        product: Product name shown in bold as the section heading.
        categories: Sequence of ``(category, score)`` pairs, rendered as a
            numbered list starting at 1.

    Returns:
        An HTML fragment: the heading followed by either the numbered list or
        a "No matching categories found." note when ``categories`` is empty.
    """
    # Function-scope import keeps the block self-contained.
    from html import escape

    fragments = [f"<div style='margin-bottom: 10px;'><b>{escape(str(product))}</b></div>"]

    if not categories:
        fragments.append("<div style='color: #666; font-style: italic;'>No matching categories found.</div>")
        return "".join(fragments)

    fragments.append("<div style='margin-left: 15px;'>")
    for i, (category, score) in enumerate(categories, 1):
        # Color code based on confidence
        if score >= 0.8:
            color = "#1a8a38"  # Strong green
        elif score >= 0.65:
            color = "#4caf50"  # Medium green
        elif score >= 0.5:
            color = "#8bc34a"  # Light green
        else:
            color = "#9e9e9e"  # Gray

        fragments.append(
            f"<div style='margin-bottom: 5px;'>{i}. <span style='font-weight: 500;'>{escape(str(category))}</span> <span style='color: {color}; font-weight: bold;'>({score:.3f})</span></div>"
        )

    fragments.append("</div>")
    return "".join(fragments)