Spaces:

eliago
/

product_ingredient_demo

Sleeping

App Files Files Community

esilver committed on Mar 18

Commit

a318724

1 Parent(s): a198898

refactored

Browse files

Files changed (10) hide show

.DS_Store +0 -0
app.py +0 -369
embeddings.py +72 -0
ingredient_embeddings_voyageai.pkl +0 -3
main.py +39 -0
run_app.sh +1 -1
similarity.py +53 -0
spaces.py +0 -198
ui.py +266 -0
utils.py +75 -0

.DS_Store ADDED Viewed

Binary file (8.2 kB). View file

app.py DELETED Viewed

@@ -1,369 +0,0 @@
-import gradio as gr
-import pickle
-import os
-import json
-import numpy as np
-import voyageai
-import time
-import sys
-from concurrent.futures import ThreadPoolExecutor
-# Set Voyage AI API key directly (using the free version key from your code)
-voyageai.api_key = "pa-DvIuCX_5TrCyxS6y74sUYpyWWGd4gN0Kf52y642y6k0"
-# Force unbuffered output
-os.environ['PYTHONUNBUFFERED'] = '1'
-# ===== Embedding Generation Functions =====
-def get_embeddings_batch(texts, model="voyage-3-large", batch_size=100):
-    """Get embeddings for a list of texts in batches"""
-    all_embeddings = []
-    total_texts = len(texts)
-    # Pre-process all texts to replace newlines
-    texts = [text.replace("\n", " ") for text in texts]
-    for i in range(0, len(texts), batch_size):
-        batch = texts[i:i+batch_size]
-        try:
-            response = voyageai.Embedding.create(input=batch, model=model)
-            batch_embeddings = [item['embedding'] for item in response['data']]
-            all_embeddings.extend(batch_embeddings)
-            # Sleep briefly to avoid rate limits
-            if i + batch_size < len(texts):
-                time.sleep(0.5)
-        except Exception as e:
-            print(f"Error in batch {i//batch_size + 1}: {e}")
-            # Add empty embeddings for failed batch
-            all_embeddings.extend([None] * len(batch))
-    return all_embeddings
-def create_product_embeddings_voyageai(products, batch_size=100):
-    """Create embeddings for products using batch processing with deduplication"""
-    # De-duplication step
-    unique_products = []
-    product_to_index = {}
-    index_map = {}  # Maps original index to index in unique_products
-    for i, product in enumerate(products):
-        if product in product_to_index:
-            # Product already seen, just store the mapping
-            index_map[i] = product_to_index[product]
-        else:
-            # New unique product
-            product_to_index[product] = len(unique_products)
-            index_map[i] = len(unique_products)
-            unique_products.append(product)
-    print(f"Found {len(unique_products)} unique products out of {len(products)} total")
-    if len(unique_products) == 0:
-        return {}
-    # Process only unique products
-    print(f"Processing {len(unique_products)} unique products")
-    # Get embeddings for unique products
-    unique_embeddings = get_embeddings_batch(unique_products, batch_size=batch_size)
-    # Map embeddings back to all products
-    all_products_dict = {}
-    for i, product in enumerate(products):
-        unique_idx = index_map[i]
-        if unique_idx < len(unique_embeddings) and unique_embeddings[unique_idx] is not None:
-            all_products_dict[product] = unique_embeddings[unique_idx]
-    print(f"Created embeddings for {len(all_products_dict)} products")
-    return all_products_dict
-# ===== Similarity Computation Functions =====
-def compute_similarities(ingredients_dict, products_dict):
-    """Compute similarities between all products and ingredients using NumPy"""
-    # Filter valid ingredients (with non-None embeddings)
-    ingredient_names = []
-    ingredient_embeddings_list = []
-    for ing, emb in ingredients_dict.items():
-        if emb is not None:
-            ingredient_names.append(ing)
-            ingredient_embeddings_list.append(emb)
-    # Convert ingredient embeddings to numpy array
-    ingredient_embeddings = np.array(ingredient_embeddings_list, dtype=np.float32)
-    # Normalize ingredient embeddings for cosine similarity
-    ingredient_norms = np.linalg.norm(ingredient_embeddings, axis=1, keepdims=True)
-    normalized_ingredients = ingredient_embeddings / ingredient_norms
-    # Process all products
-    all_similarities = {}
-    valid_products = []
-    valid_embeddings = []
-    for product, embedding in products_dict.items():
-        if embedding is not None:
-            valid_products.append(product)
-            valid_embeddings.append(embedding)
-    if not valid_products:
-        return {}
-    # Convert product embeddings to numpy array
-    product_embeddings = np.array(valid_embeddings, dtype=np.float32)
-    # Normalize product embeddings
-    product_norms = np.linalg.norm(product_embeddings, axis=1, keepdims=True)
-    normalized_products = product_embeddings / product_norms
-    # Compute all similarities at once using matrix multiplication
-    # (dot product of normalized vectors = cosine similarity)
-    similarity_matrix = np.dot(normalized_products, normalized_ingredients.T)
-    # Process and store results
-    for p_idx, product in enumerate(valid_products):
-        product_similarities = [(ingredient_names[i_idx], float(similarity_matrix[p_idx, i_idx]))
-                              for i_idx in range(len(ingredient_names))]
-        # Sort by similarity score (descending)
-        product_similarities.sort(key=lambda x: x[1], reverse=True)
-        all_similarities[product] = product_similarities
-    return all_similarities
-# ===== Main Application Functions =====
-def load_embeddings(embeddings_path):
-    """Load ingredient embeddings from pickle file"""
-    print(f"Loading ingredient embeddings from {embeddings_path}")
-    with open(embeddings_path, "rb") as f:
-        ingredients_embeddings = pickle.load(f)
-    print(f"Loaded {len(ingredients_embeddings)} ingredient embeddings")
-    return ingredients_embeddings
-# Define a safe progress tracker that handles None
-class SafeProgress:
-    def __init__(self, progress_obj=None):
-        self.progress = progress_obj
-    def __call__(self, value, desc=""):
-        if self.progress is not None:
-            try:
-                self.progress(value, desc=desc)
-            except:
-                print(f"Progress {value}: {desc}")
-        else:
-            print(f"Progress {value}: {desc}")
-def categorize_products_from_text(product_text, top_n=5, confidence_threshold=0.5, progress=None):
-    """Categorize products from text input (one product per line)"""
-    # Create a safe progress tracker
-    progress_tracker = SafeProgress(progress)
-    progress_tracker(0, desc="Starting...")
-    # Parse input text to get product names
-    product_names = [line.strip() for line in product_text.split("\n") if line.strip()]
-    if not product_names:
-        return "No product names provided."
-    # Create product embeddings
-    progress_tracker(0.1, desc="Generating product embeddings...")
-    products_embeddings = create_product_embeddings_voyageai(product_names)
-    # Compute similarities
-    progress_tracker(0.6, desc="Computing similarities...")
-    all_similarities = compute_similarities(embeddings, products_embeddings)
-    # Format results
-    progress_tracker(0.9, desc="Formatting results...")
-    results = {}
-    for product, similarities in all_similarities.items():
-        # Filter by confidence threshold and take top N
-        filtered_similarities = [(ingredient, score) for ingredient, score in similarities
-                                if score >= confidence_threshold]
-        top_similarities = filtered_similarities[:top_n]
-        results[product] = top_similarities
-    # Format as readable text
-    output_text = ""
-    for product, categories in results.items():
-        output_text += f"Product: {product}\n"
-        if categories:
-            for i, (category, score) in enumerate(categories, 1):
-                output_text += f"  {i}. {category} (confidence: {score:.3f})\n"
-        else:
-            output_text += "  No matching categories found.\n"
-        output_text += "\n"
-    progress_tracker(1.0, desc="Done!")
-    return output_text
-def categorize_products_from_file(file, top_n=5, confidence_threshold=0.5, progress=None):
-    """Categorize products from a JSON file"""
-    # Create a safe progress tracker
-    progress_tracker = SafeProgress(progress)
-    progress_tracker(0.1, desc="Reading file...")
-    try:
-        with open(file.name, 'r') as f:
-            try:
-                products_data = json.load(f)
-                if isinstance(products_data, list):
-                    # Extract product names if it's a list of objects with 'name' field
-                    if all(isinstance(item, dict) for item in products_data):
-                        product_names = [item.get('name', '') for item in products_data if isinstance(item, dict)]
-                    else:
-                        # If it's just a list of strings
-                        product_names = [str(item) for item in products_data if item]
-                else:
-                    # If it's just a list of product names
-                    product_names = []
-            except json.JSONDecodeError:
-                # If not JSON, try reading as text file with one product per line
-                f.seek(0)
-                product_names = [line.strip() for line in f.readlines() if line.strip()]
-    except Exception as e:
-        return f"Error reading file: {str(e)}"
-    if not product_names:
-        return "No product names found in the file."
-    # Create product embeddings
-    progress_tracker(0.2, desc="Generating product embeddings...")
-    products_embeddings = create_product_embeddings_voyageai(product_names)
-    # Compute similarities
-    progress_tracker(0.7, desc="Computing similarities...")
-    all_similarities = compute_similarities(embeddings, products_embeddings)
-    # Format results
-    progress_tracker(0.9, desc="Formatting results...")
-    output_text = f"Found {len(product_names)} products in file.\n\n"
-    for product, similarities in all_similarities.items():
-        # Filter by confidence threshold and take top N
-        filtered_similarities = [(ingredient, score) for ingredient, score in similarities
-                                if score >= confidence_threshold]
-        top_similarities = filtered_similarities[:top_n]
-        output_text += f"Product: {product}\n"
-        if top_similarities:
-            for i, (category, score) in enumerate(top_similarities, 1):
-                output_text += f"  {i}. {category} (confidence: {score:.3f})\n"
-        else:
-            output_text += "  No matching categories found.\n"
-        output_text += "\n"
-    progress_tracker(1.0, desc="Done!")
-    return output_text
-# Load embeddings at the module level for easier access
-try:
-    embeddings_path = "ingredient_embeddings_voyageai.pkl"
-    embeddings = load_embeddings(embeddings_path)
-except Exception as e:
-    print(f"Warning: Could not load embeddings at startup: {e}")
-    print("Will attempt to load them when the app runs")
-    embeddings = {}
-# ===== Gradio Interface Setup =====
-def create_interface(embeddings_path="ingredient_embeddings_voyageai.pkl"):
-    # Ensure embeddings are loaded
-    global embeddings
-    if not embeddings:
-        try:
-            embeddings = load_embeddings(embeddings_path)
-        except Exception as e:
-            print(f"Error loading embeddings: {e}")
-            gr.Error(f"Failed to load embeddings file: {e}")
-    # Text input interface
-    with gr.Blocks() as demo:
-        gr.Markdown("# Product Categorization Tool")
-        gr.Markdown("This tool uses AI to categorize products based on their similarity to known ingredients.")
-        with gr.Tabs():
-            with gr.TabItem("Text Input"):
-                with gr.Row():
-                    with gr.Column():
-                        text_input = gr.Textbox(
-                            lines=10,
-                            placeholder="Enter product names, one per line",
-                            label="Product Names"
-                        )
-                        top_n = gr.Slider(
-                            minimum=1,
-                            maximum=10,
-                            value=5,
-                            step=1,
-                            label="Number of Top Categories"
-                        )
-                        confidence = gr.Slider(
-                            minimum=0.1,
-                            maximum=0.9,
-                            value=0.5,
-                            step=0.05,
-                            label="Confidence Threshold"
-                        )
-                        submit_button = gr.Button("Categorize Products")
-                    with gr.Column():
-                        text_output = gr.Textbox(label="Categorization Results", lines=20)
-                submit_button.click(
-                    fn=categorize_products_from_text,
-                    inputs=[text_input, top_n, confidence],
-                    outputs=text_output
-                )
-            with gr.TabItem("File Upload"):
-                with gr.Row():
-                    with gr.Column():
-                        file_input = gr.File(label="Upload JSON file with products")
-                        file_top_n = gr.Slider(
-                            minimum=1,
-                            maximum=10,
-                            value=5,
-                            step=1,
-                            label="Number of Top Categories"
-                        )
-                        file_confidence = gr.Slider(
-                            minimum=0.1,
-                            maximum=0.9,
-                            value=0.5,
-                            step=0.05,
-                            label="Confidence Threshold"
-                        )
-                        file_button = gr.Button("Process File")
-                    with gr.Column():
-                        file_output = gr.Textbox(label="Categorization Results", lines=20)
-                file_button.click(
-                    fn=categorize_products_from_file,
-                    inputs=[file_input, file_top_n, file_confidence],
-                    outputs=file_output
-                )
-        gr.Markdown("### Example Input")
-        gr.Markdown("Try entering product names like:\n- Tomato Sauce\n- Apple Pie\n- Greek Yogurt\n- Chocolate Chip Cookies")
-    return demo
-if __name__ == "__main__":
-    import argparse
-    parser = argparse.ArgumentParser(description='Run the Product Categorization web app')
-    parser.add_argument('--embeddings', default='ingredient_embeddings_voyageai.pkl',
-                        help='Path to the ingredient embeddings pickle file')
-    parser.add_argument('--share', action='store_true', help='Create a public link for sharing')
-    args = parser.parse_args()
-    # Create and launch the interface
-    demo = create_interface(args.embeddings)
-    demo.launch(share=args.share)

embeddings.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import voyageai
+import time
+import numpy as np
+# Set Voyage AI API key directly
+voyageai.api_key = "pa-DvIuCX_5TrCyxS6y74sUYpyWWGd4gN0Kf52y642y6k0"
+def get_embeddings_batch(texts, model="voyage-3-large", batch_size=100):
+    """Get embeddings for a list of texts in batches"""
+    all_embeddings = []
+    total_texts = len(texts)
+    # Pre-process all texts to replace newlines
+    texts = [text.replace("\n", " ") for text in texts]
+    for i in range(0, len(texts), batch_size):
+        batch = texts[i:i+batch_size]
+        try:
+            response = voyageai.Embedding.create(input=batch, model=model)
+            batch_embeddings = [item['embedding'] for item in response['data']]
+            all_embeddings.extend(batch_embeddings)
+            # Sleep briefly to avoid rate limits
+            if i + batch_size < len(texts):
+                time.sleep(0.5)
+        except Exception as e:
+            print(f"Error in batch {i//batch_size + 1}: {e}")
+            # Add empty embeddings for failed batch
+            all_embeddings.extend([None] * len(batch))
+    return all_embeddings
+def create_product_embeddings(products, batch_size=100):
+    """Create embeddings for products using batch processing with deduplication"""
+    # De-duplication step
+    unique_products = []
+    product_to_index = {}
+    index_map = {}  # Maps original index to index in unique_products
+    for i, product in enumerate(products):
+        if product in product_to_index:
+            # Product already seen, just store the mapping
+            index_map[i] = product_to_index[product]
+        else:
+            # New unique product
+            product_to_index[product] = len(unique_products)
+            index_map[i] = len(unique_products)
+            unique_products.append(product)
+    print(f"Found {len(unique_products)} unique products out of {len(products)} total")
+    if len(unique_products) == 0:
+        return {}
+    # Process only unique products
+    print(f"Processing {len(unique_products)} unique products")
+    # Get embeddings for unique products
+    unique_embeddings = get_embeddings_batch(unique_products, batch_size=batch_size)
+    # Map embeddings back to all products
+    all_products_dict = {}
+    for i, product in enumerate(products):
+        unique_idx = index_map[i]
+        if unique_idx < len(unique_embeddings) and unique_embeddings[unique_idx] is not None:
+            all_products_dict[product] = unique_embeddings[unique_idx]
+    print(f"Created embeddings for {len(all_products_dict)} products")
+    return all_products_dict

ingredient_embeddings_voyageai.pkl DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:de6791a4909432600b90a5523e8a105f047887d4ac59d63460d8a2f9d788d0c9
-size 27301581

main.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import argparse
+import os
+import sys
+import gradio as gr
+from utils import load_embeddings
+from ui import create_demo, embeddings
+def main():
+    """Main entry point for the application"""
+    parser = argparse.ArgumentParser(description='Run the Product Categorization web app')
+    parser.add_argument('--embeddings', default='ingredient_embeddings_voyageai.pkl',
+                        help='Path to the ingredient embeddings pickle file')
+    parser.add_argument('--share', action='store_true', help='Create a public link for sharing')
+    args = parser.parse_args()
+    # Check if embeddings file exists
+    if not os.path.exists(args.embeddings):
+        print(f"Error: Embeddings file {args.embeddings} not found!")
+        print(f"Please ensure the file exists at {os.path.abspath(args.embeddings)}")
+        sys.exit(1)
+    # Load embeddings
+    try:
+        global embeddings
+        embeddings_data = load_embeddings(args.embeddings)
+        # Update the embeddings in the ui module
+        import ui
+        ui.embeddings = embeddings_data
+    except Exception as e:
+        print(f"Error loading embeddings: {e}")
+        sys.exit(1)
+    # Create and launch the interface
+    demo = create_demo()
+    demo.launch(share=args.share)
+if __name__ == "__main__":
+    main()

run_app.sh CHANGED Viewed

@@ -6,7 +6,7 @@ pip install -r requirements.txt
 # Check if embeddings file exists
 if [ -f "ingredient_embeddings_voyageai.pkl" ]; then
     # Run with local embeddings file
-    python app.py --share
 else
     echo "ERROR: ingredient_embeddings_voyageai.pkl file not found!"
     echo "Please place the embeddings file in the same directory as this script."

 # Check if embeddings file exists
 if [ -f "ingredient_embeddings_voyageai.pkl" ]; then
     # Run with local embeddings file
+    python main.py --share
 else
     echo "ERROR: ingredient_embeddings_voyageai.pkl file not found!"
     echo "Please place the embeddings file in the same directory as this script."

similarity.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import numpy as np
+def compute_similarities(ingredients_dict, products_dict):
+    """Compute similarities between all products and ingredients using NumPy"""
+    # Filter valid ingredients (with non-None embeddings)
+    ingredient_names = []
+    ingredient_embeddings_list = []
+    for ing, emb in ingredients_dict.items():
+        if emb is not None:
+            ingredient_names.append(ing)
+            ingredient_embeddings_list.append(emb)
+    # Convert ingredient embeddings to numpy array
+    ingredient_embeddings = np.array(ingredient_embeddings_list, dtype=np.float32)
+    # Normalize ingredient embeddings for cosine similarity
+    ingredient_norms = np.linalg.norm(ingredient_embeddings, axis=1, keepdims=True)
+    normalized_ingredients = ingredient_embeddings / ingredient_norms
+    # Process all products
+    all_similarities = {}
+    valid_products = []
+    valid_embeddings = []
+    for product, embedding in products_dict.items():
+        if embedding is not None:
+            valid_products.append(product)
+            valid_embeddings.append(embedding)
+    if not valid_products:
+        return {}
+    # Convert product embeddings to numpy array
+    product_embeddings = np.array(valid_embeddings, dtype=np.float32)
+    # Normalize product embeddings
+    product_norms = np.linalg.norm(product_embeddings, axis=1, keepdims=True)
+    normalized_products = product_embeddings / product_norms
+    # Compute all similarities at once using matrix multiplication
+    # (dot product of normalized vectors = cosine similarity)
+    similarity_matrix = np.dot(normalized_products, normalized_ingredients.T)
+    # Process and store results
+    for p_idx, product in enumerate(valid_products):
+        product_similarities = [(ingredient_names[i_idx], float(similarity_matrix[p_idx, i_idx]))
+                              for i_idx in range(len(ingredient_names))]
+        # Sort by similarity score (descending)
+        product_similarities.sort(key=lambda x: x[1], reverse=True)
+        all_similarities[product] = product_similarities
+    return all_similarities

spaces.py DELETED Viewed

@@ -1,198 +0,0 @@
-import gradio as gr
-import pickle
-import os
-import json
-import numpy as np
-import voyageai
-import time
-import sys
-# Set Voyage AI API key directly
-voyageai.api_key = "pa-DvIuCX_5TrCyxS6y74sUYpyWWGd4gN0Kf52y642y6k0"
-# Import all necessary functions from the main app
-from app import create_product_embeddings_voyageai, get_embeddings_batch, compute_similarities
-# Path to the embeddings file for Hugging Face Spaces
-EMBEDDINGS_PATH = "ingredient_embeddings_voyageai.pkl"
-# Load the embeddings
-print(f"Loading ingredient embeddings from {EMBEDDINGS_PATH}")
-try:
-    with open(EMBEDDINGS_PATH, "rb") as f:
-        embeddings = pickle.load(f)
-    print(f"Successfully loaded {len(embeddings)} ingredient embeddings")
-except Exception as e:
-    print(f"ERROR: Failed to load embeddings: {e}")
-    # Create an empty dict as fallback
-    embeddings = {}
-# Define the categorization function for text input
-def categorize_products_from_text(product_text, progress=gr.Progress(), top_n=5, confidence_threshold=0.5):
-    """Categorize products from text input (one product per line)"""
-    # Parse input text to get product names
-    product_names = [line.strip() for line in product_text.split("\n") if line.strip()]
-    if not product_names:
-        return "No product names provided."
-    progress(0.1, desc="Generating product embeddings...")
-    # Create product embeddings
-    products_embeddings = create_product_embeddings_voyageai(product_names)
-    # Compute similarities
-    progress(0.6, desc="Computing similarities...")
-    all_similarities = compute_similarities(embeddings, products_embeddings)
-    # Format results
-    progress(0.9, desc="Formatting results...")
-    output_text = ""
-    for product, similarities in all_similarities.items():
-        # Filter by confidence threshold and take top N
-        filtered_similarities = [(ingredient, score) for ingredient, score in similarities
-                                if score >= confidence_threshold]
-        top_similarities = filtered_similarities[:top_n]
-        output_text += f"Product: {product}\n"
-        if top_similarities:
-            for i, (category, score) in enumerate(top_similarities, 1):
-                output_text += f"  {i}. {category} (confidence: {score:.3f})\n"
-        else:
-            output_text += "  No matching categories found.\n"
-        output_text += "\n"
-    progress(1.0, desc="Done!")
-    return output_text
-# Define the categorization function for file input
-def categorize_products_from_file(file, progress=gr.Progress(), top_n=5, confidence_threshold=0.5):
-    """Categorize products from a JSON file"""
-    progress(0.1, desc="Reading file...")
-    try:
-        with open(file.name, 'r') as f:
-            try:
-                products_data = json.load(f)
-                if isinstance(products_data, list):
-                    # Extract product names if it's a list of objects with 'name' field
-                    if all(isinstance(item, dict) for item in products_data):
-                        product_names = [item.get('name', '') for item in products_data if isinstance(item, dict)]
-                    else:
-                        # If it's just a list of strings
-                        product_names = [str(item) for item in products_data if item]
-                else:
-                    # If it's just a list of product names
-                    product_names = []
-            except json.JSONDecodeError:
-                # If not JSON, try reading as text file with one product per line
-                f.seek(0)
-                product_names = [line.strip() for line in f.readlines() if line.strip()]
-    except Exception as e:
-        return f"Error reading file: {str(e)}"
-    if not product_names:
-        return "No product names found in the file."
-    # Create product embeddings
-    progress(0.2, desc="Generating product embeddings...")
-    products_embeddings = create_product_embeddings_voyageai(product_names)
-    # Compute similarities
-    progress(0.7, desc="Computing similarities...")
-    all_similarities = compute_similarities(embeddings, products_embeddings)
-    # Format results
-    progress(0.9, desc="Formatting results...")
-    output_text = f"Found {len(product_names)} products in file.\n\n"
-    for product, similarities in all_similarities.items():
-        # Filter by confidence threshold and take top N
-        filtered_similarities = [(ingredient, score) for ingredient, score in similarities
-                                if score >= confidence_threshold]
-        top_similarities = filtered_similarities[:top_n]
-        output_text += f"Product: {product}\n"
-        if top_similarities:
-            for i, (category, score) in enumerate(top_similarities, 1):
-                output_text += f"  {i}. {category} (confidence: {score:.3f})\n"
-        else:
-            output_text += "  No matching categories found.\n"
-        output_text += "\n"
-    progress(1.0, desc="Done!")
-    return output_text
-# Create the Gradio interface
-with gr.Blocks() as demo:
-    gr.Markdown("# Product Categorization Tool")
-    gr.Markdown("This tool uses AI to categorize products based on their similarity to known ingredients.")
-    with gr.Tabs():
-        with gr.TabItem("Text Input"):
-            with gr.Row():
-                with gr.Column():
-                    text_input = gr.Textbox(
-                        lines=10,
-                        placeholder="Enter product names, one per line",
-                        label="Product Names"
-                    )
-                    top_n = gr.Slider(
-                        minimum=1,
-                        maximum=10,
-                        value=5,
-                        step=1,
-                        label="Number of Top Categories"
-                    )
-                    confidence = gr.Slider(
-                        minimum=0.1,
-                        maximum=0.9,
-                        value=0.5,
-                        step=0.05,
-                        label="Confidence Threshold"
-                    )
-                    submit_button = gr.Button("Categorize Products")
-                with gr.Column():
-                    text_output = gr.Textbox(label="Categorization Results", lines=20)
-            submit_button.click(
-                fn=categorize_products_from_text,
-                inputs=[text_input, top_n, confidence],
-                outputs=text_output
-            )
-        with gr.TabItem("File Upload"):
-            with gr.Row():
-                with gr.Column():
-                    file_input = gr.File(label="Upload JSON file with products")
-                    file_top_n = gr.Slider(
-                        minimum=1,
-                        maximum=10,
-                        value=5,
-                        step=1,
-                        label="Number of Top Categories"
-                    )
-                    file_confidence = gr.Slider(
-                        minimum=0.1,
-                        maximum=0.9,
-                        value=0.5,
-                        step=0.05,
-                        label="Confidence Threshold"
-                    )
-                    file_button = gr.Button("Process File")
-                with gr.Column():
-                    file_output = gr.Textbox(label="Categorization Results", lines=20)
-            file_button.click(
-                fn=categorize_products_from_file,
-                inputs=[file_input, file_top_n, file_confidence],
-                outputs=file_output
-            )
-    gr.Markdown("### Example Input")
-    gr.Markdown("Try entering product names like:\n- Tomato Sauce\n- Apple Pie\n- Greek Yogurt\n- Chocolate Chip Cookies")
-# Launch the demo (for Hugging Face Spaces)
-demo.launch()

ui.py ADDED Viewed

	@@ -0,0 +1,266 @@

+import gradio as gr
+from utils import SafeProgress, format_categories_html
+from embeddings import create_product_embeddings
+from similarity import compute_similarities
+# Global variable for embeddings; starts empty and is replaced by main.py with the loaded ingredient embeddings before the UI is launched
+embeddings = {}
+def categorize_products_from_text(product_text, top_n=5, confidence_threshold=0.5, progress=None):
+    """Categorize products given as free text, one product name per line.
+
+    Blank lines are ignored. For every product, the matches returned by
+    compute_similarities() are filtered by ``confidence_threshold`` and cut
+    to the first ``top_n`` entries before being rendered.
+
+    Returns an HTML string with one section per product, a plain message
+    when the input contains no names, or an error block when no similarity
+    results come back.
+    """
+    report = SafeProgress(progress)
+    report(0, desc="Starting...")
+
+    # Collect the non-empty, whitespace-trimmed lines as product names
+    names = []
+    for raw_line in product_text.split("\n"):
+        cleaned = raw_line.strip()
+        if cleaned:
+            names.append(cleaned)
+    if not names:
+        return "No product names provided."
+
+    report(0.1, desc="Generating product embeddings...")
+    products_embeddings = create_product_embeddings(names)
+
+    report(0.6, desc="Computing similarities...")
+    all_similarities = compute_similarities(embeddings, products_embeddings)
+
+    report(0.9, desc="Formatting results...")
+    divider = "<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>"
+    sections = ["<div style='font-family: Arial, sans-serif;'>"]
+    for product, similarities in all_similarities.items():
+        # Keep only matches at or above the threshold, then the first top_n
+        confident = [(ingredient, score) for ingredient, score in similarities
+                     if score >= confidence_threshold]
+        sections.append(format_categories_html(product, confident[:top_n]))
+        sections.append(divider)
+    sections.append("</div>")
+
+    if all_similarities:
+        output_html = "".join(sections)
+    else:
+        # Nothing came back at all: show an error block instead of an empty wrapper
+        output_html = "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"
+
+    report(1.0, desc="Done!")
+    return output_html
+def categorize_products_from_file(file, top_n=5, confidence_threshold=0.5, progress=None):
+    """Categorize products listed in an uploaded JSON or text file.
+
+    Args:
+        file: The uploaded file. Either an object exposing the on-disk path
+            as ``.name`` or a plain path string; ``None`` when nothing was
+            uploaded.
+        top_n: Maximum number of matches shown per product.
+        confidence_threshold: Minimum similarity score for a match to be shown.
+        progress: Optional progress callback, wrapped in SafeProgress.
+
+    Returns:
+        An HTML string: a summary banner followed by one section per product,
+        or a red error block when no file was given, the file could not be
+        parsed, or it contained no product names.
+    """
+    from utils import parse_product_file
+    # Create a safe progress tracker
+    progress_tracker = SafeProgress(progress)
+    # Clicking "Process File" without an upload hands us None; report that
+    # instead of failing on the attribute access below.
+    if file is None:
+        return "<div style='color: #d32f2f; font-weight: bold;'>Error: No file uploaded.</div>"
+    progress_tracker(0.1, desc="Reading file...")
+    # Accept both file-like wrappers (path in .name) and plain path strings
+    file_path = getattr(file, "name", file)
+    try:
+        product_names = parse_product_file(file_path)
+    except Exception as e:
+        return f"<div style='color: #d32f2f; font-weight: bold;'>Error: {str(e)}</div>"
+    if not product_names:
+        return "<div style='color: #d32f2f;'>No product names found in the file.</div>"
+    # Create product embeddings
+    progress_tracker(0.2, desc="Generating product embeddings...")
+    products_embeddings = create_product_embeddings(product_names)
+    # Compute similarities
+    progress_tracker(0.7, desc="Computing similarities...")
+    all_similarities = compute_similarities(embeddings, products_embeddings)
+    # Format results
+    progress_tracker(0.9, desc="Formatting results...")
+    parts = [
+        "<div style='font-family: Arial, sans-serif;'>",
+        "<div style='margin-bottom: 20px; padding: 10px; background-color: #e8f5e9; border-radius: 5px;'>",
+        f"Found <b>{len(product_names)}</b> products in file. Showing results with confidence ≥ {confidence_threshold}.",
+        "</div>",
+    ]
+    for product, similarities in all_similarities.items():
+        # Filter by confidence threshold and take top N
+        filtered_similarities = [(ingredient, score) for ingredient, score in similarities
+                                if score >= confidence_threshold]
+        top_similarities = filtered_similarities[:top_n]
+        parts.append(format_categories_html(product, top_similarities))
+        parts.append("<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>")
+    parts.append("</div>")
+    progress_tracker(1.0, desc="Done!")
+    return "".join(parts)
+def create_demo():
+    """Create the Gradio interface.
+
+    Builds a gr.Blocks app with a header, a "Text Input" tab wired to
+    categorize_products_from_text, a "File Upload" tab wired to
+    categorize_products_from_file, and a footer line.
+
+    Returns:
+        The constructed gr.Blocks object. It is not launched here.
+    """
+    # Basic CSS theme: page container, header banner, description box, and a
+    # rule that hides <footer> elements.
+    css = """
+    .container {
+        max-width: 1200px;
+        margin: auto;
+        padding: 0;
+    }
+    footer {display: none !important;}
+    .header {
+        background-color: #0d47a1;
+        padding: 15px 20px;
+        border-radius: 10px;
+        color: white;
+        margin-bottom: 20px;
+        display: flex;
+        align-items: center;
+    }
+    .header svg {
+        margin-right: 10px;
+        height: 30px;
+        width: 30px;
+    }
+    .header h1 {
+        margin: 0;
+        font-size: 24px;
+    }
+    .description {
+        margin-bottom: 20px;
+        padding: 15px;
+        background-color: #f5f5f5;
+        border-radius: 5px;
+    }
+    """
+    # Custom theme: Soft base theme with blue/indigo hues, then per-token overrides
+    theme = gr.themes.Soft(
+        primary_hue="blue",
+        secondary_hue="indigo",
+        font=[gr.themes.GoogleFont("Inter"), "ui-sans-serif", "system-ui"]
+    ).set(
+        button_primary_background_fill="*primary_500",
+        button_primary_background_fill_hover="*primary_600",
+        button_secondary_background_fill="*neutral_200",
+        block_title_text_size="lg",
+        block_label_text_size="md"
+    )
+    # Components are created inside nested context managers; creation order
+    # inside each `with` determines where they appear in the layout.
+    with gr.Blocks(css=css, theme=theme) as demo:
+        # Header with icon
+        gr.HTML("""
+        <div class="header">
+            <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="white">
+                <path d="M12 2L2 7l10 5 10-5-10-5zM2 17l10 5 10-5M2 12l10 5 10-5"></path>
+            </svg>
+            <h1>Product Categorization Tool</h1>
+        </div>
+        <div class="description">
+            This tool analyzes products and finds the most similar ingredients using AI embeddings.
+            Just enter product names or upload a file to get started.
+        </div>
+        """)
+        with gr.Tabs():
+            # Tab 1: free-text entry, one product name per line
+            with gr.TabItem("Text Input"):
+                with gr.Row():
+                    # Left column: input box, examples, sliders and submit button
+                    with gr.Column(scale=2):
+                        # Each example is a single multi-line string that fills the textbox
+                        example_products = [
+                            "Tomato Sauce\nApple Pie\nGreek Yogurt\nChocolate Chip Cookies",
+                            "Banana Bread\nOrange Juice\nGrilled Chicken\nCaesar Salad",
+                            "Vanilla Ice Cream\nPizza Dough\nStrawberry Jam\nGrilled Salmon"
+                        ]
+                        text_input = gr.Textbox(
+                            lines=10,
+                            placeholder="Enter product names, one per line",
+                            label="Product Names"
+                        )
+                        gr.Examples(
+                            examples=example_products,
+                            inputs=text_input,
+                            label="Example Product Sets"
+                        )
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                # Passed as top_n to the handler
+                                top_n = gr.Slider(
+                                    minimum=1,
+                                    maximum=10,
+                                    value=5,
+                                    step=1,
+                                    label="Number of Top Categories"
+                                )
+                            with gr.Column(scale=1):
+                                # Passed as confidence_threshold to the handler
+                                confidence = gr.Slider(
+                                    minimum=0.1,
+                                    maximum=0.9,
+                                    value=0.5,
+                                    step=0.05,
+                                    label="Confidence Threshold"
+                                )
+                        submit_button = gr.Button("Categorize Products", variant="primary")
+                    # Right column: results pane, pre-filled with a placeholder message
+                    with gr.Column(scale=3):
+                        text_output = gr.HTML(label="Categorization Results",
+                                             value="<div style='height: 450px; display: flex; justify-content: center; align-items: center; color: #666;'>Results will appear here</div>")
+                # Inputs map positionally to (product_text, top_n, confidence_threshold)
+                submit_button.click(
+                    fn=categorize_products_from_text,
+                    inputs=[text_input, top_n, confidence],
+                    outputs=text_output
+                )
+            # Tab 2: upload a .json or .txt file of product names
+            with gr.TabItem("File Upload"):
+                with gr.Row():
+                    # Left column: file picker, format help, sliders and button
+                    with gr.Column(scale=2):
+                        file_input = gr.File(
+                            label="Upload JSON or text file with products",
+                            file_types=[".json", ".txt"]
+                        )
+                        with gr.Accordion("Help", open=False):
+                            gr.Markdown("""
+                            - JSON files should contain either:
+                              - A list of objects with a 'name' field for each product
+                              - A simple array of product name strings
+                            - Text files should have one product name per line
+                            """)
+                        with gr.Row():
+                            with gr.Column(scale=1):
+                                # Separate slider instances from the text tab, same ranges
+                                file_top_n = gr.Slider(
+                                    minimum=1,
+                                    maximum=10,
+                                    value=5,
+                                    step=1,
+                                    label="Number of Top Categories"
+                                )
+                            with gr.Column(scale=1):
+                                file_confidence = gr.Slider(
+                                    minimum=0.1,
+                                    maximum=0.9,
+                                    value=0.5,
+                                    step=0.05,
+                                    label="Confidence Threshold"
+                                )
+                        file_button = gr.Button("Process File", variant="primary")
+                    # Right column: results pane, pre-filled with a placeholder message
+                    with gr.Column(scale=3):
+                        file_output = gr.HTML(
+                            label="Categorization Results",
+                            value="<div style='height: 450px; display: flex; justify-content: center; align-items: center; color: #666;'>Upload a file to see results</div>"
+                        )
+                # Inputs map positionally to (file, top_n, confidence_threshold)
+                file_button.click(
+                    fn=categorize_products_from_file,
+                    inputs=[file_input, file_top_n, file_confidence],
+                    outputs=file_output
+                )
+        # Footer
+        gr.HTML("""
+        <div style="margin-top: 20px; text-align: center; color: #666;">
+            Powered by Voyage AI embeddings • Built with Gradio
+        </div>
+        """)
+    return demo

utils.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import pickle
+import json
+import os
+class SafeProgress:
+    """Wrapper for progress tracking that handles None gracefully.
+
+    Calling the instance forwards ``(value, desc=desc)`` to the wrapped
+    callable. When no callable was given, or the callable raises an
+    ordinary exception, the update is printed to stdout instead so that
+    progress reporting never aborts the surrounding work.
+    """
+    def __init__(self, progress_obj=None):
+        # The wrapped progress callable, or None to always print
+        self.progress = progress_obj
+    def __call__(self, value, desc=""):
+        """Report progress ``value`` with an optional description."""
+        if self.progress is not None:
+            try:
+                self.progress(value, desc=desc)
+            except Exception:
+                # Only ordinary errors fall back to printing; KeyboardInterrupt
+                # and SystemExit must still propagate so the app can be stopped.
+                print(f"Progress {value}: {desc}")
+        else:
+            print(f"Progress {value}: {desc}")
+def load_embeddings(embeddings_path):
+    """Read the pickled ingredient embeddings stored at ``embeddings_path``.
+
+    Prints a message before loading and the number of loaded entries
+    afterwards, then returns the unpickled object unchanged.
+    """
+    print(f"Loading ingredient embeddings from {embeddings_path}")
+    # NOTE: pickle.load can execute arbitrary code from the file; only use
+    # this on embedding files from a trusted source.
+    with open(embeddings_path, "rb") as handle:
+        loaded = pickle.load(handle)
+    entry_count = len(loaded)
+    print(f"Loaded {entry_count} ingredient embeddings")
+    return loaded
+def parse_product_file(file_path):
+    """Parse a file containing product data and extract product names.
+
+    The content is first tried as JSON. In a JSON list, objects contribute
+    their 'name' field and any other item is converted with str(); JSON that
+    is not a list yields an empty result. Content that is not valid JSON is
+    read as plain text with one product name per line.
+
+    Names are stripped of surrounding whitespace, and missing or empty names
+    are dropped so they never reach the embedding step.
+
+    Args:
+        file_path: Path of the file to read.
+
+    Returns:
+        A list of product name strings (possibly empty).
+
+    Raises:
+        Exception: With message "Error reading file: ..." when the file
+            cannot be opened, decoded or parsed; the original error is
+            chained as the cause.
+    """
+    try:
+        # utf-8-sig reads plain UTF-8 too and drops a leading BOM, which
+        # would otherwise make the JSON parse fail and leak into the text path.
+        with open(file_path, 'r', encoding='utf-8-sig') as f:
+            content = f.read()
+        try:
+            products_data = json.loads(content)
+        except json.JSONDecodeError:
+            # If not JSON, treat as text file with one product per line
+            return [line.strip() for line in content.split("\n") if line.strip()]
+    except Exception as e:
+        raise Exception(f"Error reading file: {str(e)}") from e
+    if not isinstance(products_data, list):
+        # Only JSON arrays are supported; other JSON values carry no names
+        return []
+    product_names = []
+    for item in products_data:
+        # Objects provide their 'name' field; anything else is the name itself
+        raw_name = item.get('name') if isinstance(item, dict) else item
+        if not raw_name:
+            continue
+        name = str(raw_name).strip()
+        if name:
+            product_names.append(name)
+    return product_names
+def format_categories_html(product, categories):
+    """Format categories as HTML with color-coded confidence scores.
+
+    Args:
+        product: Product name shown as the bold heading.
+        categories: Iterable of ``(category, score)`` pairs, rendered in the
+            given order and numbered from 1.
+
+    Returns:
+        An HTML fragment. Product and category text is HTML-escaped because
+        it originates from user input (typed text or uploaded files).
+    """
+    from html import escape
+    parts = [f"<div style='margin-bottom: 10px;'><b>{escape(str(product))}</b></div>"]
+    if not categories:
+        parts.append("<div style='color: #666; font-style: italic;'>No matching categories found.</div>")
+        return "".join(parts)
+    parts.append("<div style='margin-left: 15px;'>")
+    for i, (category, score) in enumerate(categories, 1):
+        # Color code based on confidence
+        if score >= 0.8:
+            color = "#1a8a38"  # Strong green
+        elif score >= 0.65:
+            color = "#4caf50"  # Medium green
+        elif score >= 0.5:
+            color = "#8bc34a"  # Light green
+        else:
+            color = "#9e9e9e"  # Gray
+        parts.append(f"<div style='margin-bottom: 5px;'>{i}. <span style='font-weight: 500;'>{escape(str(category))}</span> <span style='color: {color}; font-weight: bold;'>({score:.3f})</span></div>")
+    parts.append("</div>")
+    return "".join(parts)