Spaces:

eliago
/

product_ingredient_demo

Sleeping

App Files Files Community

eliago committed on Mar 18

Commit

e7db3d5

verified ·

1 Parent(s): 91ba2a4

Upload 5 files

Browse files

Files changed (5) hide show

app.py +340 -0
readme.md +43 -0
requirements.txt +3 -0
run_app.sh +14 -0
spaces.py +198 -0

app.py ADDED Viewed

	@@ -0,0 +1,340 @@

+import gradio as gr
+import pickle
+import os
+import json
+import numpy as np
+import voyageai
+import time
+import sys
+from concurrent.futures import ThreadPoolExecutor
# Read the Voyage AI API key from the environment instead of committing it to
# source control: a key embedded in a public repository has to be treated as
# leaked and should be revoked.
voyageai.api_key = os.environ.get("VOYAGE_API_KEY")
if not voyageai.api_key:
    print("WARNING: VOYAGE_API_KEY is not set; embedding requests will fail.")
# PYTHONUNBUFFERED is only consulted when an interpreter starts, so this
# assignment affects child processes only; line-buffer this process's stdout
# explicitly so log lines show up immediately.
os.environ['PYTHONUNBUFFERED'] = '1'
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(line_buffering=True)
+# ===== Embedding Generation Functions =====
def get_embeddings_batch(texts, model="voyage-3-large", batch_size=100):
    """Embed ``texts`` with Voyage AI, sending them in batches.

    Args:
        texts: Sequence of strings to embed.
        model: Voyage AI model name passed to the embedding request.
        batch_size: Maximum number of texts per request; must be >= 1.

    Returns:
        A list with one entry per input text, in input order. Each entry is
        the embedding returned by the API, or ``None`` for every text of a
        batch whose request raised.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step makes range() empty, which would silently return []
        # and break the one-entry-per-text contract.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    # Newlines are flattened to spaces before the texts are sent.
    texts = [text.replace("\n", " ") for text in texts]
    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        try:
            response = voyageai.Embedding.create(input=batch, model=model)
            batch_embeddings = [item['embedding'] for item in response['data']]
        except Exception as e:
            # Best effort: report the failure and keep positions aligned by
            # recording None for every text of the failed batch.
            print(f"Error in batch {start // batch_size + 1}: {e}")
            all_embeddings.extend([None] * len(batch))
            continue
        all_embeddings.extend(batch_embeddings)
        # Sleep briefly between batches to avoid rate limits
        if start + batch_size < len(texts):
            time.sleep(0.5)
    return all_embeddings
def create_product_embeddings_voyageai(products, batch_size=100):
    """Return a ``{product: embedding}`` dict, embedding each distinct product once.

    Products whose embedding came back as ``None`` are left out of the result.
    """
    # dict.fromkeys drops repeats while keeping first-seen order.
    distinct = list(dict.fromkeys(products))
    print(f"Found {len(distinct)} unique products out of {len(products)} total")
    if not distinct:
        return {}
    print(f"Processing {len(distinct)} unique products")
    vectors = get_embeddings_batch(distinct, batch_size=batch_size)
    # Pair each distinct product with its vector; zip stops at the shorter list.
    embedded = {
        name: vector
        for name, vector in zip(distinct, vectors)
        if vector is not None
    }
    print(f"Created embeddings for {len(embedded)} products")
    return embedded
+# ===== Similarity Computation Functions =====
def _unit_rows(vectors):
    """Return ``vectors`` as a float32 matrix with every row scaled to unit length."""
    matrix = np.array(vectors, dtype=np.float32)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # All-zero rows stay zero (similarity 0.0) instead of turning into NaN.
    norms[norms == 0] = 1.0
    return matrix / norms


def compute_similarities(ingredients_dict, products_dict):
    """Rank every ingredient against every product by cosine similarity.

    Args:
        ingredients_dict: Mapping of ingredient name to embedding vector;
            entries whose embedding is ``None`` are ignored.
        products_dict: Mapping of product name to embedding vector; entries
            whose embedding is ``None`` are ignored.

    Returns:
        A dict mapping each product that has an embedding to a list of
        ``(ingredient_name, score)`` tuples sorted by descending score.
        The list is empty when no ingredient has an embedding, and the dict
        is empty when no product has one.
    """
    ingredient_names = [name for name, emb in ingredients_dict.items() if emb is not None]
    valid_products = [name for name, emb in products_dict.items() if emb is not None]
    if not valid_products:
        return {}
    if not ingredient_names:
        # Nothing to compare against; an empty array has no axis 1 to
        # normalise over, so answer directly instead of failing.
        return {product: [] for product in valid_products}
    normalized_ingredients = _unit_rows([ingredients_dict[name] for name in ingredient_names])
    normalized_products = _unit_rows([products_dict[name] for name in valid_products])
    # Dot product of unit vectors = cosine similarity, computed for all pairs at once.
    similarity_matrix = np.dot(normalized_products, normalized_ingredients.T)
    all_similarities = {}
    for product, row in zip(valid_products, similarity_matrix):
        scored = [(name, float(score)) for name, score in zip(ingredient_names, row)]
        # Highest similarity first
        scored.sort(key=lambda pair: pair[1], reverse=True)
        all_similarities[product] = scored
    return all_similarities
+# ===== Main Application Functions =====
def load_embeddings(embeddings_path):
    """Read the pickled ingredient embeddings stored at ``embeddings_path``.

    Prints a message before loading and the number of loaded entries after.

    NOTE(review): unpickling can execute arbitrary code, so this must only be
    pointed at files from a trusted source.
    """
    print(f"Loading ingredient embeddings from {embeddings_path}")
    with open(embeddings_path, "rb") as handle:
        loaded = pickle.load(handle)
    entry_count = len(loaded)
    print(f"Loaded {entry_count} ingredient embeddings")
    return loaded
def categorize_products_from_text(product_text, embeddings, progress=gr.Progress(), top_n=5, confidence_threshold=0.5):
    """Categorize products given as text, one product name per line.

    Args:
        product_text: Raw text; blank lines and surrounding whitespace are
            dropped. ``None`` is treated like empty text.
        embeddings: Ingredient embeddings handed to ``compute_similarities``.
        progress: Callable invoked with a completion fraction and ``desc``.
        top_n: Maximum number of matches listed per product.
        confidence_threshold: Minimum similarity score for a match to be listed.

    Returns:
        A human-readable report string, or a short message when the input
        contains no product names.
    """
    product_names = [line.strip() for line in (product_text or "").split("\n") if line.strip()]
    if not product_names:
        return "No product names provided."
    # Gradio sliders may deliver the value as a float, which cannot be used
    # as a slice bound.
    top_n = int(top_n)
    progress(0, desc="Starting...")
    progress(0.1, desc="Generating product embeddings...")
    products_embeddings = create_product_embeddings_voyageai(product_names)
    progress(0.6, desc="Computing similarities...")
    all_similarities = compute_similarities(embeddings, products_embeddings)
    progress(0.9, desc="Formatting results...")
    # Collect the report pieces and join once instead of repeated string +=.
    report = []
    for product, similarities in all_similarities.items():
        # Similarities arrive sorted, so filter by threshold and keep the first top_n.
        matches = [(ingredient, score) for ingredient, score in similarities
                   if score >= confidence_threshold][:top_n]
        report.append(f"Product: {product}\n")
        if matches:
            for rank, (category, score) in enumerate(matches, 1):
                report.append(f"  {rank}. {category} (confidence: {score:.3f})\n")
        else:
            report.append("  No matching categories found.\n")
        report.append("\n")
    progress(1.0, desc="Done!")
    return "".join(report)
def _read_product_names(path):
    """Return product names from a JSON list file or a one-name-per-line text file."""
    with open(path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            # Not JSON: treat the file as plain text, one product per line.
            f.seek(0)
            return [line.strip() for line in f if line.strip()]
    if not isinstance(data, list):
        # Only a top-level JSON list is understood.
        return []
    if all(isinstance(item, dict) for item in data):
        # List of objects: take each object's 'name' field.
        candidates = [item.get('name', '') for item in data]
    else:
        candidates = data
    # Drop missing/empty names and coerce the rest to stripped strings so the
    # embedder only ever receives non-empty text.
    return [str(name).strip() for name in candidates if name and str(name).strip()]


def categorize_products_from_file(file, embeddings, progress=gr.Progress(), top_n=5, confidence_threshold=0.5):
    """Categorize products listed in an uploaded JSON or plain-text file.

    Args:
        file: Uploaded file, either an object with a ``name`` attribute
            holding its path or the path itself.
        embeddings: Ingredient embeddings handed to ``compute_similarities``.
        progress: Callable invoked with a completion fraction and ``desc``.
        top_n: Maximum number of matches listed per product.
        confidence_threshold: Minimum similarity score for a match to be listed.

    Returns:
        A human-readable report string, or an error/empty-input message.
    """
    progress(0.1, desc="Reading file...")
    if file is None:
        return "Error reading file: no file was uploaded."
    try:
        product_names = _read_product_names(getattr(file, "name", file))
    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing.
        return f"Error reading file: {str(e)}"
    if not product_names:
        return "No product names found in the file."
    # Gradio sliders may deliver the value as a float, which cannot be used
    # as a slice bound.
    top_n = int(top_n)
    progress(0.2, desc="Generating product embeddings...")
    products_embeddings = create_product_embeddings_voyageai(product_names)
    progress(0.7, desc="Computing similarities...")
    all_similarities = compute_similarities(embeddings, products_embeddings)
    progress(0.9, desc="Formatting results...")
    report = [f"Found {len(product_names)} products in file.\n\n"]
    for product, similarities in all_similarities.items():
        # Similarities arrive sorted, so filter by threshold and keep the first top_n.
        matches = [(ingredient, score) for ingredient, score in similarities
                   if score >= confidence_threshold][:top_n]
        report.append(f"Product: {product}\n")
        if matches:
            for rank, (category, score) in enumerate(matches, 1):
                report.append(f"  {rank}. {category} (confidence: {score:.3f})\n")
        else:
            report.append("  No matching categories found.\n")
        report.append("\n")
    progress(1.0, desc="Done!")
    return "".join(report)
+# ===== Gradio Interface Setup =====
def create_interface(embeddings_path="ingredient_embeddings_voyageai.pkl"):
    """Build the Gradio Blocks UI for the product categorization tool.

    Args:
        embeddings_path: Path of the pickled ingredient embeddings, loaded
            once here and shared by both tabs.

    Returns:
        The assembled ``gr.Blocks`` object (not launched).
    """
    # Load embeddings once at startup
    embeddings = load_embeddings(embeddings_path)

    # Gradio calls a handler with the declared inputs only and supplies the
    # progress tracker through a parameter whose default is gr.Progress().
    # A required fourth positional parameter would never be filled, so the
    # handlers declare progress as a default-valued parameter.
    def run_text(text, top_n_value, conf_value, progress=gr.Progress()):
        return categorize_products_from_text(
            text, embeddings, progress, top_n_value, conf_value
        )

    def run_file(file, top_n_value, conf_value, progress=gr.Progress()):
        return categorize_products_from_file(
            file, embeddings, progress, top_n_value, conf_value
        )

    with gr.Blocks() as demo:
        gr.Markdown("# Product Categorization Tool")
        gr.Markdown("This tool uses AI to categorize products based on their similarity to known ingredients.")
        with gr.Tabs():
            with gr.TabItem("Text Input"):
                with gr.Row():
                    with gr.Column():
                        text_input = gr.Textbox(
                            lines=10,
                            placeholder="Enter product names, one per line",
                            label="Product Names"
                        )
                        top_n = gr.Slider(
                            minimum=1,
                            maximum=10,
                            value=5,
                            step=1,
                            label="Number of Top Categories"
                        )
                        confidence = gr.Slider(
                            minimum=0.1,
                            maximum=0.9,
                            value=0.5,
                            step=0.05,
                            label="Confidence Threshold"
                        )
                        submit_button = gr.Button("Categorize Products")
                    with gr.Column():
                        text_output = gr.Textbox(label="Categorization Results", lines=20)
                submit_button.click(
                    fn=run_text,
                    inputs=[text_input, top_n, confidence],
                    outputs=text_output
                )
            with gr.TabItem("File Upload"):
                with gr.Row():
                    with gr.Column():
                        file_input = gr.File(label="Upload JSON file with products")
                        file_top_n = gr.Slider(
                            minimum=1,
                            maximum=10,
                            value=5,
                            step=1,
                            label="Number of Top Categories"
                        )
                        file_confidence = gr.Slider(
                            minimum=0.1,
                            maximum=0.9,
                            value=0.5,
                            step=0.05,
                            label="Confidence Threshold"
                        )
                        file_button = gr.Button("Process File")
                    with gr.Column():
                        file_output = gr.Textbox(label="Categorization Results", lines=20)
                file_button.click(
                    fn=run_file,
                    inputs=[file_input, file_top_n, file_confidence],
                    outputs=file_output
                )
        gr.Markdown("### Example Input")
        gr.Markdown("Try entering product names like:\n- Tomato Sauce\n- Apple Pie\n- Greek Yogurt\n- Chocolate Chip Cookies")
    return demo
if __name__ == "__main__":
    import argparse

    # Command-line options: where the embeddings live and whether to share.
    cli = argparse.ArgumentParser(description='Run the Product Categorization web app')
    cli.add_argument(
        '--embeddings',
        default='ingredient_embeddings_voyageai.pkl',
        help='Path to the ingredient embeddings pickle file',
    )
    cli.add_argument(
        '--share',
        action='store_true',
        help='Create a public link for sharing',
    )
    options = cli.parse_args()
    # Build the UI around the chosen embeddings file, then serve it.
    demo = create_interface(options.embeddings)
    demo.launch(share=options.share)

readme.md ADDED Viewed

	@@ -0,0 +1,43 @@

+# Product Categorization App - One-Click Solution
+This is a turnkey solution for categorizing products based on their similarity to ingredients using Voyage AI.
+## Quick Start
+1. Place your `ingredient_embeddings_voyageai.pkl` file in the same folder as this README
+2. Run the application:
+   ```bash
+   bash run_app.sh
+   ```
+3. That's it! The app starts and prints a local URL to open in your browser, along with a public URL for sharing
+## What You Can Do
+- **Text Input:** Enter product names one per line
+- **File Upload:** Upload a JSON file with product data
+- Adjust the number of categories and confidence threshold
+- View the categorization results with confidence scores
+## Hosting on Hugging Face Spaces
+For permanent, free hosting on Hugging Face Spaces:
+1. Create a free account on [Hugging Face](https://huggingface.co/)
+2. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
+3. Click "Create a Space"
+4. Select "Gradio" as the SDK
+5. Upload all files (including your embeddings file) to the space
+6. Your app will be automatically deployed!
+## Files Included
+- `app.py`: The main application code
+- `requirements.txt`: Required Python packages
+- `run_app.sh`: One-click deployment script
+## Requirements
+- Python 3.7+
+- Internet connection (for Voyage AI API)

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+voyageai==0.2.3
+numpy==1.24.3
+gradio==4.12.0

run_app.sh ADDED Viewed

	@@ -0,0 +1,14 @@

#!/bin/bash
# Launch the Product Categorization app with a public share link.
# Stop at the first failing command (e.g. a failed pip install), on unset
# variables, and on failures inside pipelines.
set -euo pipefail
# Work from the script's own directory so the relative paths below resolve
# no matter where the script is invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")"
EMBEDDINGS_FILE="ingredient_embeddings_voyageai.pkl"
# Check if embeddings file exists before spending time on installation
if [ ! -f "$EMBEDDINGS_FILE" ]; then
    echo "ERROR: $EMBEDDINGS_FILE file not found!" >&2
    echo "Please place the embeddings file in the same directory as this script." >&2
    exit 1
fi
# Install required packages
pip install -r requirements.txt
# Run with local embeddings file
python app.py --share

spaces.py ADDED Viewed

	@@ -0,0 +1,198 @@

+import gradio as gr
+import pickle
+import os
+import json
+import numpy as np
+import voyageai
+import time
+import sys
# Import all necessary functions from the main app
from app import create_product_embeddings_voyageai, get_embeddings_batch, compute_similarities
# Read the Voyage AI API key from the environment instead of committing it to
# source control: a key embedded in a public repository has to be treated as
# leaked and should be revoked. This runs after the import above because
# importing app assigns voyageai.api_key as well.
voyageai.api_key = os.environ.get("VOYAGE_API_KEY")
if not voyageai.api_key:
    print("WARNING: VOYAGE_API_KEY is not set; embedding requests will fail.")
# Path to the embeddings file for Hugging Face Spaces
EMBEDDINGS_PATH = "ingredient_embeddings_voyageai.pkl"
# Load the embeddings
print(f"Loading ingredient embeddings from {EMBEDDINGS_PATH}")
try:
    # NOTE(review): unpickling can execute arbitrary code; only ship an
    # embeddings file that comes from a trusted source.
    with open(EMBEDDINGS_PATH, "rb") as f:
        embeddings = pickle.load(f)
    print(f"Successfully loaded {len(embeddings)} ingredient embeddings")
except Exception as e:
    # Best effort: keep the UI importable even when the file is missing or
    # unreadable, falling back to an empty ingredient set.
    print(f"ERROR: Failed to load embeddings: {e}")
    embeddings = {}
+# Define the categorization function for text input
def categorize_products_from_text(product_text, progress=gr.Progress(), top_n=5, confidence_threshold=0.5):
    """Categorize products given as text, one product name per line.

    Uses the module-level ``embeddings`` as the ingredient set.

    Args:
        product_text: Raw text; blank lines and surrounding whitespace are
            dropped. ``None`` is treated like empty text.
        progress: Callable invoked with a completion fraction and ``desc``.
        top_n: Maximum number of matches listed per product.
        confidence_threshold: Minimum similarity score for a match to be listed.

    Returns:
        A human-readable report string, or a short message when the input
        contains no product names.
    """
    product_names = [line.strip() for line in (product_text or "").split("\n") if line.strip()]
    if not product_names:
        return "No product names provided."
    # Gradio sliders may deliver the value as a float, which cannot be used
    # as a slice bound.
    top_n = int(top_n)
    progress(0.1, desc="Generating product embeddings...")
    products_embeddings = create_product_embeddings_voyageai(product_names)
    progress(0.6, desc="Computing similarities...")
    all_similarities = compute_similarities(embeddings, products_embeddings)
    progress(0.9, desc="Formatting results...")
    # Collect the report pieces and join once instead of repeated string +=.
    report = []
    for product, similarities in all_similarities.items():
        # Similarities arrive sorted, so filter by threshold and keep the first top_n.
        matches = [(ingredient, score) for ingredient, score in similarities
                   if score >= confidence_threshold][:top_n]
        report.append(f"Product: {product}\n")
        if matches:
            for rank, (category, score) in enumerate(matches, 1):
                report.append(f"  {rank}. {category} (confidence: {score:.3f})\n")
        else:
            report.append("  No matching categories found.\n")
        report.append("\n")
    progress(1.0, desc="Done!")
    return "".join(report)
+# Define the categorization function for file input
def _read_product_names(path):
    """Return product names from a JSON list file or a one-name-per-line text file."""
    with open(path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            # Not JSON: treat the file as plain text, one product per line.
            f.seek(0)
            return [line.strip() for line in f if line.strip()]
    if not isinstance(data, list):
        # Only a top-level JSON list is understood.
        return []
    if all(isinstance(item, dict) for item in data):
        # List of objects: take each object's 'name' field.
        candidates = [item.get('name', '') for item in data]
    else:
        candidates = data
    # Drop missing/empty names and coerce the rest to stripped strings so the
    # embedder only ever receives non-empty text.
    return [str(name).strip() for name in candidates if name and str(name).strip()]


def categorize_products_from_file(file, progress=gr.Progress(), top_n=5, confidence_threshold=0.5):
    """Categorize products listed in an uploaded JSON or plain-text file.

    Uses the module-level ``embeddings`` as the ingredient set.

    Args:
        file: Uploaded file, either an object with a ``name`` attribute
            holding its path or the path itself.
        progress: Callable invoked with a completion fraction and ``desc``.
        top_n: Maximum number of matches listed per product.
        confidence_threshold: Minimum similarity score for a match to be listed.

    Returns:
        A human-readable report string, or an error/empty-input message.
    """
    progress(0.1, desc="Reading file...")
    if file is None:
        return "Error reading file: no file was uploaded."
    try:
        product_names = _read_product_names(getattr(file, "name", file))
    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing.
        return f"Error reading file: {str(e)}"
    if not product_names:
        return "No product names found in the file."
    # Gradio sliders may deliver the value as a float, which cannot be used
    # as a slice bound.
    top_n = int(top_n)
    progress(0.2, desc="Generating product embeddings...")
    products_embeddings = create_product_embeddings_voyageai(product_names)
    progress(0.7, desc="Computing similarities...")
    all_similarities = compute_similarities(embeddings, products_embeddings)
    progress(0.9, desc="Formatting results...")
    report = [f"Found {len(product_names)} products in file.\n\n"]
    for product, similarities in all_similarities.items():
        # Similarities arrive sorted, so filter by threshold and keep the first top_n.
        matches = [(ingredient, score) for ingredient, score in similarities
                   if score >= confidence_threshold][:top_n]
        report.append(f"Product: {product}\n")
        if matches:
            for rank, (category, score) in enumerate(matches, 1):
                report.append(f"  {rank}. {category} (confidence: {score:.3f})\n")
        else:
            report.append("  No matching categories found.\n")
        report.append("\n")
    progress(1.0, desc="Done!")
    return "".join(report)
+# Create the Gradio interface
# Settings shared by the two tabs' sliders; each tab still gets its own
# component instances.
_top_n_slider_kwargs = dict(
    minimum=1,
    maximum=10,
    value=5,
    step=1,
    label="Number of Top Categories",
)
_confidence_slider_kwargs = dict(
    minimum=0.1,
    maximum=0.9,
    value=0.5,
    step=0.05,
    label="Confidence Threshold",
)
# Two-tab layout: free-text entry and file upload, each with its own controls
# on the left and a results box on the right.
with gr.Blocks() as demo:
    gr.Markdown("# Product Categorization Tool")
    gr.Markdown("This tool uses AI to categorize products based on their similarity to known ingredients.")
    with gr.Tabs():
        with gr.TabItem("Text Input"):
            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        lines=10,
                        placeholder="Enter product names, one per line",
                        label="Product Names",
                    )
                    top_n = gr.Slider(**_top_n_slider_kwargs)
                    confidence = gr.Slider(**_confidence_slider_kwargs)
                    submit_button = gr.Button("Categorize Products")
                with gr.Column():
                    text_output = gr.Textbox(label="Categorization Results", lines=20)
            submit_button.click(
                fn=categorize_products_from_text,
                inputs=[text_input, top_n, confidence],
                outputs=text_output,
            )
        with gr.TabItem("File Upload"):
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(label="Upload JSON file with products")
                    file_top_n = gr.Slider(**_top_n_slider_kwargs)
                    file_confidence = gr.Slider(**_confidence_slider_kwargs)
                    file_button = gr.Button("Process File")
                with gr.Column():
                    file_output = gr.Textbox(label="Categorization Results", lines=20)
            file_button.click(
                fn=categorize_products_from_file,
                inputs=[file_input, file_top_n, file_confidence],
                outputs=file_output,
            )
    gr.Markdown("### Example Input")
    gr.Markdown("Try entering product names like:\n- Tomato Sauce\n- Apple Pie\n- Greek Yogurt\n- Chocolate Chip Cookies")
# Launch the demo (for Hugging Face Spaces)
demo.launch()