Spaces:
Sleeping
Sleeping
Commit
·
31ebc8b
0
Parent(s):
Initial commit
Browse files- .gitattributes +35 -0
- .gitignore +6 -0
- README.md +51 -0
- api_utils.py +384 -0
- app.py +29 -0
- categories.json +140 -0
- category_embeddings.pickle +3 -0
- category_matching.py +258 -0
- chicory_api.py +91 -0
- comparison.py +252 -0
- config.py +2 -0
- data/category_embeddings.pickle +3 -0
- data/ingredient_embeddings_voyageai.pkl +3 -0
- debug_embeddings.py +130 -0
- embeddings.py +128 -0
- generate_category_embeddings.py +29 -0
- main.py +48 -0
- openai_expansion.py +91 -0
- requirements.txt +6 -0
- similarity.py +262 -0
- ui.py +262 -0
- ui_category_matching.py +46 -0
- ui_core.py +140 -0
- ui_expanded_matching.py +224 -0
- ui_formatters.py +419 -0
- ui_hybrid_matching.py +86 -0
- ui_ingredient_matching.py +59 -0
- utils.py +156 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__pycache__
|
2 |
+
*.pyc
|
3 |
+
*.pem
|
4 |
+
|
5 |
+
.DS_Store
|
6 |
+
run_app.sh
|
README.md
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: mit
|
3 |
+
title: Demo
|
4 |
+
sdk: gradio
|
5 |
+
emoji: 🚀
|
6 |
+
colorFrom: purple
|
7 |
+
colorTo: yellow
|
8 |
+
---
|
9 |
+
# Product Categorization App - One-Click Solution
|
10 |
+
|
11 |
+
This is a turnkey solution for categorizing products based on their similarity to ingredients using Voyage AI.
|
12 |
+
|
13 |
+
## Quick Start
|
14 |
+
|
15 |
+
1. Place your `ingredient_embeddings_voyageai.pkl` file in the same folder as this README
|
16 |
+
2. Run the application:
|
17 |
+
|
18 |
+
```bash
|
19 |
+
bash run_app.sh
|
20 |
+
```
|
21 |
+
|
22 |
+
3. That's it! A browser window will open with the app, and a public URL will be created for sharing
|
23 |
+
|
24 |
+
## What You Can Do
|
25 |
+
|
26 |
+
- **Text Input:** Enter product names one per line
|
27 |
+
- **File Upload:** Upload a JSON file with product data
|
28 |
+
- Adjust the number of categories and Similarity Threshold
|
29 |
+
- View the categorization results with confidence scores
|
30 |
+
|
31 |
+
## Hosting on Hugging Face Spaces
|
32 |
+
|
33 |
+
For permanent, free hosting on Gradio:
|
34 |
+
|
35 |
+
1. Create a free account on [Hugging Face](https://huggingface.co/)
|
36 |
+
2. Go to [Hugging Face Spaces](https://huggingface.co/spaces)
|
37 |
+
3. Click "Create a Space"
|
38 |
+
4. Select "Gradio" as the SDK
|
39 |
+
5. Upload all files (including your embeddings file) to the space
|
40 |
+
6. Your app will be automatically deployed!
|
41 |
+
|
42 |
+
## Files Included
|
43 |
+
|
44 |
+
- `app.py`: The main application code
|
45 |
+
- `requirements.txt`: Required Python packages
|
46 |
+
- `run_app.sh`: One-click deployment script
|
47 |
+
|
48 |
+
## Requirements
|
49 |
+
|
50 |
+
- Python 3.7+
|
51 |
+
- Internet connection (for Voyage AI API)
|
api_utils.py
ADDED
@@ -0,0 +1,384 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import concurrent.futures
|
3 |
+
from typing import List, Dict, Callable, Any, Tuple
|
4 |
+
from openai import OpenAI
|
5 |
+
import voyageai
|
6 |
+
from utils import SafeProgress
|
7 |
+
import json
|
8 |
+
|
9 |
+
# Centralized API clients
|
10 |
+
def get_openai_client():
    """Build an OpenAI client keyed by the OPENAI_API_KEY environment variable."""
    # os.environ.get returns None when the variable is unset; the OpenAI SDK
    # then falls back to its own environment lookup / raises on first use.
    return OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
|
14 |
+
|
15 |
+
def get_voyage_client():
    """Get a configured Voyage AI client"""
    # No explicit key is passed: voyageai.Client() performs its own default
    # configuration lookup (presumably the VOYAGE_API_KEY env var — TODO confirm).
    return voyageai.Client()
|
18 |
+
|
19 |
+
# General batch processing utilities
|
20 |
+
def process_batch(items_batch: List[Any], processor_func: Callable) -> Dict:
    """
    Apply ``processor_func`` to every item of a batch and collect the results.

    Args:
        items_batch: Items to process.
        processor_func: Callable taking one item and returning a (key, value) pair.

    Returns:
        Dict mapping each returned key to its value. An item whose processing
        raises is logged and mapped (by the item itself) to an empty list.
    """
    collected: Dict = {}
    for entry in items_batch:
        try:
            entry_key, entry_value = processor_func(entry)
            collected[entry_key] = entry_value
        except Exception as exc:
            # Keep the rest of the batch going; record an empty fallback result.
            print(f"Error processing batch item '{entry}': {exc}")
            collected[entry] = []
    return collected
|
40 |
+
|
41 |
+
def process_in_parallel(
    items: List[Any],
    processor_func: Callable,
    max_workers: int = 10,
    progress_tracker: Any = None,
    progress_start: float = 0.0,
    progress_end: float = 1.0,
    progress_desc: str = "Processing in parallel"
) -> Dict:
    """
    Process items in parallel using a thread pool.

    Args:
        items: List of items to process
        processor_func: Function that processes a single item and returns (key, value)
        max_workers: Maximum number of threads
        progress_tracker: Optional progress tracking object (called as
            tracker(fraction, desc=...))
        progress_start: Starting progress percentage (0.0-1.0)
        progress_end: Ending progress percentage (0.0-1.0)
        progress_desc: Description for the progress tracker

    Returns:
        Combined results dictionary (empty when ``items`` is empty)
    """
    # BUGFIX: an empty input previously produced max_workers == 0, and
    # ThreadPoolExecutor(max_workers=0) raises ValueError. Short-circuit instead.
    if not items:
        return {}

    # Ensure a reasonable number of workers: at least 1, at most one per item.
    max_workers = max(1, min(max_workers, len(items)))

    # Split items into roughly even batches, one per worker.
    batch_size = max(1, len(items) // max_workers)
    batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

    # Process batches in parallel, merging per-batch dicts as they complete.
    results: Dict = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_batch = {executor.submit(process_batch, batch, processor_func): i
                           for i, batch in enumerate(batches)}

        for i, future in enumerate(concurrent.futures.as_completed(future_to_batch)):
            batch_index = future_to_batch[future]

            # Update progress if a tracker was provided; completion order, not
            # submission order, drives the fraction reported.
            if progress_tracker:
                progress_percent = progress_start + ((progress_end - progress_start) * (i + 1) / len(batches))
                progress_tracker(progress_percent, desc=f"{progress_desc}: batch {batch_index+1}/{len(batches)}")

            try:
                batch_results = future.result()
                results.update(batch_results)
            except Exception as e:
                print(f"Error processing batch {batch_index}: {e}")

    return results
|
93 |
+
|
94 |
+
def openai_structured_query(
    prompt: str,
    system_message: str = "You are a helpful assistant.",
    schema: dict = None,
    model: str = "o3-mini",
    client=None,
    schema_name: str = "structured_output"
) -> dict:
    """
    Run one OpenAI call constrained to a strict JSON-schema output.

    Args:
        prompt: The user prompt.
        system_message: System message guiding the model.
        schema: JSON schema the response must conform to.
        model: OpenAI model to use.
        client: Optional pre-configured client; created on demand otherwise.
        schema_name: Name attached to the schema in the request.

    Returns:
        The model's output parsed from JSON into a dictionary.

    Raises:
        Re-raises any API or JSON-parsing error after logging it.
    """
    if client is None:
        client = get_openai_client()

    # Assemble the request payload up front; only the API call / parse can raise.
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]
    text_format = {
        "format": {
            "type": "json_schema",
            "name": schema_name,
            "schema": schema,
            "strict": True,
        }
    }

    try:
        response = client.responses.create(model=model, input=conversation, text=text_format)
        return json.loads(response.output_text)
    except Exception as e:
        print(f"Error in OpenAI structured query: {e}")
        raise
|
141 |
+
|
142 |
+
def rank_ingredients_openai(
    product: str,
    candidates: List[str],
    expanded_description: str = None,
    client=None,
    model: str = "o3-mini",
    max_results: int = 3,
    confidence_threshold: float = 0.5,
    debug: bool = False
) -> List[Tuple[str, float]]:
    """
    Rank ingredients for a product using OpenAI

    Args:
        product: Product name
        candidates: List of candidate ingredients
        expanded_description: Optional expanded product description
        client: Optional pre-configured client
        model: OpenAI model to use
        max_results: Maximum number of results to return
        confidence_threshold: Minimum confidence threshold
        debug: Whether to print debug info

    Returns:
        List of (ingredient, confidence) tuples, highest confidence first,
        filtered to scores >= confidence_threshold and capped at max_results.
        Returns [] on any API/parse error.
    """
    if not candidates:
        return []

    if client is None:
        client = get_openai_client()

    if debug:
        print(f"Ranking for product: {product} with {len(candidates)} candidates")

    # Format prompt with expanded description if available
    prompt = f"Product: {product}"
    if expanded_description:
        prompt += f"\n\nExpanded description: {expanded_description}"
    prompt += f"\n\nPotential ingredients: {', '.join(candidates)}"

    # Define the ranking schema
    ranking_schema = {
        "type": "object",
        "properties": {
            "rankings": {
                "type": "array",
                "description": f"Only the top {max_results} most relevant ingredients with scores >= {confidence_threshold}",
                "items": {
                    "type": "object",
                    "properties": {
                        "ingredient": {
                            "type": "string",
                            "description": "The name of the ingredient"
                        },
                        "relevance_score": {
                            "type": "number",
                            "description": "Score between 0 and 1 indicating relevance"
                        },
                        "explanation": {
                            "type": "string",
                            "description": "Brief explanation for the matching"
                        }
                    },
                    "required": ["ingredient", "relevance_score", "explanation"],
                    "additionalProperties": False
                }
            }
        },
        "required": ["rankings"],
        "additionalProperties": False
    }

    try:
        # Make the API call directly for more control
        response = client.responses.create(
            model=model,
            reasoning={"effort": "low"},  # Include effort parameter from ui_expanded_matching
            input=[
                {"role": "system", "content": f"You are a food ingredient matching expert. Rank the top {max_results} ingredient based on how well they match the given product. Only include ingredients with relevance score >= {confidence_threshold}."},
                {"role": "user", "content": prompt}
            ],
            text={
                "format": {
                    "type": "json_schema",
                    "name": "ingredient_ranking",
                    "schema": ranking_schema,
                    "strict": True
                }
            }
        )

        # Parse the response
        result = json.loads(response.output_text)

        # BUGFIX: max_results / confidence_threshold were only *described* in
        # the schema prompt, so the model could ignore them and the raw list
        # was returned unfiltered. Enforce the documented contract here.
        ingredients = [
            (item["ingredient"], float(item["relevance_score"]))
            for item in result["rankings"]
        ]
        ingredients = [pair for pair in ingredients if pair[1] >= confidence_threshold]
        ingredients.sort(key=lambda pair: pair[1], reverse=True)
        ingredients = ingredients[:max_results]

        if debug:
            print(f"Ranking results for {product}: {len(ingredients)} ingredients")
            if ingredients:
                print(f"Top match: {ingredients[0]}")

        return ingredients
    except Exception as e:
        print(f"Error ranking ingredients for '{product}': {e}")
        return []
|
253 |
+
|
254 |
+
def rank_categories_openai(
    product: str,
    categories: dict,
    expanded_description: str = None,
    client=None,
    model: str = "o3-mini",
    max_results: int = 5,
    confidence_threshold: float = 0.5,
    debug: bool = False
) -> List[Tuple[str, float]]:
    """
    Rank food categories for a product using OpenAI.

    Args:
        product: Product name
        categories: Dictionary of category data (values may be description
            strings, dicts with a 'description' key, or anything else)
        expanded_description: Optional expanded product description
            (currently unused in the prompt)
        client: Optional pre-configured client
        model: OpenAI model to use
        max_results: Maximum number of results to return
        confidence_threshold: Minimum confidence threshold (advisory — passed
            to the model in the prompt; not enforced client-side)
        debug: Whether to print debug info

    Returns:
        List of (category, confidence) tuples in model order; [] on error.
    """
    if not categories:
        return []

    if client is None:
        client = get_openai_client()

    if debug:
        print(f"Category ranking for product: {product}")

    # Render each category as a bullet line; values may be plain description
    # strings, dicts carrying a 'description' field, or opaque — in which
    # case only the id is shown.
    lines = []
    for category_id, category_data in categories.items():
        if isinstance(category_data, str):
            lines.append(f"- {category_id}: {category_data}\n")
        elif isinstance(category_data, dict) and 'description' in category_data:
            lines.append(f"- {category_id}: {category_data['description']}\n")
        else:
            lines.append(f"- {category_id}\n")
    categories_text = "".join(lines)

    prompt = f"Product: {product}" + f"\n\nAvailable food categories:\n{categories_text}"

    # Structured-output schema: each ranking item carries reasoning first,
    # then the category name and a 0-1 relevance score.
    item_schema = {
        "type": "object",
        "properties": {
            "reasoning": {
                "type": "string",
                "description": "Reasoning, , step by step, first weigh options, then consider the best match"
            },
            "category": {
                "type": "string",
                "description": "The name of the food category"
            },
            "relevance_score": {
                "type": "number",
                "description": "Score between 0 and 1 indicating relevance"
            },
        },
        "required": ["category", "relevance_score", "reasoning"],
        "additionalProperties": False
    }
    ranking_schema = {
        "type": "object",
        "properties": {
            "rankings": {
                "type": "array",
                "description": f"Only the top most relevant category with scores >= {confidence_threshold}",
                "items": item_schema
            }
        },
        "required": ["rankings"],
        "additionalProperties": False
    }

    try:
        # Make the API call
        response = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": f"You are a food categorization expert. Think this through step by step: Rank the top category based on how well it match the given product. Only include categories with relevance score >= {confidence_threshold}."},
                {"role": "user", "content": prompt}
            ],
            text={
                "format": {
                    "type": "json_schema",
                    "name": "category_ranking",
                    "schema": ranking_schema,
                    "strict": True
                }
            }
        )

        parsed = json.loads(response.output_text)

        # Convert the model's rankings into (category, score) pairs, keeping
        # the model's own ordering. (Renamed from the original's reuse of the
        # `categories` name, which shadowed the parameter.)
        ranked = [
            (entry["category"], float(entry["relevance_score"]))
            for entry in parsed["rankings"]
        ]

        if debug:
            print(f"Category results for {product}: {len(ranked)} categories")
            if ranked:
                print(f"Top match: {ranked[0]}")

        return ranked

    except Exception as e:
        print(f"Error categorizing {product}: {e}")
        if debug:
            import traceback
            traceback.print_exc()
        return []
|
app.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
import sys

import gradio as gr

from utils import load_embeddings
from ui import categorize_products, create_demo  # Updated imports

# Location of the pre-computed ingredient embeddings pickle.
EMBEDDINGS_PATH = "data/ingredient_embeddings_voyageai.pkl"

# Fail fast when the embeddings file is missing.
if not os.path.exists(EMBEDDINGS_PATH):
    print(f"Error: Embeddings file {EMBEDDINGS_PATH} not found!")
    print(f"Please ensure the file exists at {os.path.abspath(EMBEDDINGS_PATH)}")
    sys.exit(1)

# Load the embeddings once at startup and hand them to the UI module, which
# reads them via its module-level `embeddings` attribute.
try:
    embeddings_data = load_embeddings(EMBEDDINGS_PATH)
    import ui
    ui.embeddings = embeddings_data
except Exception as e:
    print(f"Error loading embeddings: {e}")
    sys.exit(1)

# Launch the Gradio interface
if __name__ == "__main__":
    demo = create_demo()
    demo.launch()
|
categories.json
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{"id": "alcoholic_beverages", "text": "Products containing alcohol for adult consumption. Includes beer (lagers, ales, IPAs), wine (red, white, rosé), spirits (vodka, whiskey, rum), hard seltzers, and pre-mixed alcoholic drinks."},
|
3 |
+
{"id": "beverages", "text": "Non-alcoholic drinks for hydration and refreshment. This parent category includes all drink types such as juices, sodas, water, coffee, tea, and milk alternatives."},
|
4 |
+
{"id": "cocktails_and_mixers", "text": "Products used specifically for creating mixed alcoholic drinks. Includes margarita mix, bloody mary mix, tonic water, sour mix, grenadine, bitters, and non-alcoholic cocktail components."},
|
5 |
+
{"id": "coffee", "text": "Coffee products in various forms and preparations. Includes whole/ground coffee beans, single-serve pods, instant coffee, cold brew, espresso, flavored coffee varieties, and coffee concentrates."},
|
6 |
+
{"id": "fruit_juice", "text": "Beverages made primarily from fruit. Includes 100% juice, juice blends, fresh-squeezed, from concentrate, smoothies, apple juice, orange juice, cranberry juice, and fruit nectars."},
|
7 |
+
{"id": "soft_drinks", "text": "Carbonated non-alcoholic beverages. Includes cola, root beer, ginger ale, lemon-lime soda, diet soda, sparkling water, flavored seltzer, and club soda."},
|
8 |
+
{"id": "specialty_drinks", "text": "Unique or premium non-alcoholic beverages with distinctive ingredients or processes. Includes kombucha, kefir drinks, drinking vinegars, botanical tonics, bubble tea, horchata, and craft sodas."},
|
9 |
+
{"id": "sports_and_energy_drinks", "text": "Beverages formulated for performance enhancement or energy boosting. Includes electrolyte drinks, protein shakes, caffeinated energy drinks, pre-workout beverages, and recovery drinks."},
|
10 |
+
{"id": "tea_and_hot_chocolate", "text": "Tea products and cocoa-based hot beverages. Includes black/green/herbal tea bags, loose leaf tea, chai, matcha, instant tea, hot cocoa mix, drinking chocolate, and cider mixes."},
|
11 |
+
{"id": "water", "text": "Bottled water products of various types. Includes purified water, spring water, mineral water, sparkling water, flavored water, alkaline water, and water with electrolytes."},
|
12 |
+
{"id": "bread_and_bakery", "text": "Bread products and baked goods. This parent category includes all types of bread, rolls, buns, bagels, pastries, and bakery desserts."},
|
13 |
+
{"id": "bagels_english_muffins_and_breakfast", "text": "Breakfast bread products typically served toasted. Includes plain/flavored bagels, English muffins, crumpets, breakfast breads, croissants, and morning buns."},
|
14 |
+
{"id": "desserts", "text": "Sweet baked goods meant as treats or meal finishers. Includes cakes, pies, cookies, brownies, tarts, cheesecakes, parfaits, and bakery dessert items."},
|
15 |
+
{"id": "donuts_and_pastries", "text": "Sweet baked or fried dough products. Includes donuts (glazed, filled, cake), pastries (Danish, croissants, turnovers), bear claws, fritters, and churros."},
|
16 |
+
{"id": "rolls_and_buns", "text": "Individual bread portions shaped for specific uses. Includes dinner rolls, hamburger buns, hot dog buns, submarine rolls, kaiser rolls, brioche buns, and slider buns."},
|
17 |
+
{"id": "sliced_bread", "text": "Pre-sliced loaves of bread for sandwiches and toast. Includes white, whole wheat, multigrain, sourdough, rye, pumpernickel, potato, and specialty grain varieties."},
|
18 |
+
{"id": "snack_cakes", "text": "Pre-packaged individual or small sweet baked goods. Includes cupcakes, mini pies, cream-filled cakes, coffee cakes, muffins, and packaged pastries with extended shelf life."},
|
19 |
+
{"id": "tortillas_and_flatbreads", "text": "Thin, unleavened or minimally leavened bread products. Includes corn/flour tortillas, pita bread, naan, lavash, flatbreads, wraps, and taco shells."},
|
20 |
+
{"id": "deli", "text": "Section featuring prepared ready-to-eat foods and freshly sliced ingredients. This parent category includes prepared meals, sliced meats, fresh dips, and specialty items requiring refrigeration."},
|
21 |
+
{"id": "cured_meats", "text": "Preserved meat products typically eaten uncooked. Includes salami, prosciutto, coppa, pancetta, pepperoni, chorizo, and specialty dried/cured meats."},
|
22 |
+
{"id": "deli_meals_and_sides", "text": "Ready-to-eat prepared foods sold by weight or package. Includes rotisserie chicken, prepared salads (potato, macaroni, coleslaw), heat-and-eat entrees, and prepared side dishes."},
|
23 |
+
{"id": "deli_meats", "text": "Cooked or processed meats sliced to order or pre-sliced. Includes turkey breast, ham, roast beef, pastrami, bologna, chicken breast, and specialty lunch meats."},
|
24 |
+
{"id": "fresh_pastas", "text": "Refrigerated pasta products requiring cooking. Includes fresh ravioli, tortellini, gnocchi, linguine, fettuccine, and stuffed pasta varieties with shorter shelf life than dried pasta."},
|
25 |
+
{"id": "hummus_fresh_dips_and_fresh_salsas", "text": "Refrigerated spreadable dips with limited shelf life. Includes hummus varieties, guacamole, fresh salsa, tzatziki, spinach dip, and refrigerated spreads requiring cold storage."},
|
26 |
+
{"id": "eggs_and_dairy", "text": "Products derived from animal milk and eggs. This parent category includes all dairy products and egg-based items requiring refrigeration."},
|
27 |
+
{"id": "butter_and_margarine", "text": "Spreadable fats for cooking and baking. Includes dairy butter (salted, unsalted, cultured), margarine, plant-based butter alternatives, ghee, and blended spreads."},
|
28 |
+
{"id": "cheese", "text": "Dairy products made from curdled milk. Includes cheddar, mozzarella, Swiss, provolone, American, brie, blue cheese, and various cheese formats (blocks, slices, shredded)."},
|
29 |
+
{"id": "ao_cheese", "text": "Specialty cheese products with artisanal production or organic certification. Includes imported cheeses, raw milk cheeses, aged specialty varieties, organic cheeses, and regional cheese specialties."},
|
30 |
+
{"id": "cream_and_creamers", "text": "Dairy and non-dairy products for coffee and cooking. Includes heavy cream, half & half, whipping cream, coffee creamers (dairy and non-dairy), and cooking creams."},
|
31 |
+
{"id": "dips", "text": "Dairy-based spreads for snacks and appetizers. Includes French onion dip, ranch dip, cream cheese-based dips, sour cream dips, and flavored spreadable dairy products."},
|
32 |
+
{"id": "eggs_and_egg_substitutes", "text": "Chicken eggs and egg alternatives. Includes whole eggs (white, brown, free-range, organic), liquid egg products, egg whites, and plant-based egg substitutes."},
|
33 |
+
{"id": "milk", "text": "Traditional dairy milk products. Includes whole milk, 2% reduced fat, 1% low fat, skim/fat-free milk, lactose-free milk, buttermilk, and flavored dairy milk varieties."},
|
34 |
+
{"id": "plant_based_milks", "text": "Non-dairy milk alternatives made from plants. Includes almond milk, soy milk, oat milk, coconut milk, rice milk, cashew milk, hemp milk, and blended plant milks."},
|
35 |
+
{"id": "pudding_and_gelatins", "text": "Ready-to-eat chilled desserts with soft texture. Includes dairy puddings, gelatin desserts, rice pudding, tapioca pudding, and parfait cups."},
|
36 |
+
{"id": "refrigerated_doughs_and_crusts", "text": "Ready-to-bake fresh dough products requiring refrigeration. Includes cookie dough, biscuit dough, pie crusts, pizza dough, cinnamon rolls, and crescent rolls."},
|
37 |
+
{"id": "sour_cream", "text": "Cultured dairy product with tangy flavor. Includes regular sour cream, light/reduced-fat sour cream, crème fraîche, and sour cream alternatives."},
|
38 |
+
{"id": "yogurt", "text": "Fermented dairy products with live cultures. Includes Greek yogurt, regular yogurt, Icelandic skyr, kefir, yogurt drinks, and varieties with different fat contents and flavors."},
|
39 |
+
{"id": "frozen", "text": "Foods stored and sold in frozen state requiring freezer storage. This parent category includes all items kept frozen until preparation or consumption."},
|
40 |
+
{"id": "frozen_beverages_and_ice", "text": "Frozen drink products and ice. Includes frozen juice concentrate, smoothie bases, frozen coffee drinks, popsicles, ice cubes, crushed ice, and frozen cocktail mixers."},
|
41 |
+
{"id": "frozen_bread_and_potatoes", "text": "Frozen starches requiring heating before serving. Includes frozen garlic bread, dinner rolls, French fries, hash browns, tater tots, potato wedges, and specialty potato products."},
|
42 |
+
{"id": "frozen_desserts", "text": "Sweet frozen treats besides traditional ice cream. Includes frozen yogurt, sherbet, sorbet, gelato, frozen novelties, ice cream cakes, and frozen pies."},
|
43 |
+
{"id": "frozen_family_meals", "text": "Multi-serving frozen entrees to feed multiple people. Includes frozen lasagna, casseroles, pot pies, complete dinners, meal kits, and large-format frozen meals."},
|
44 |
+
{"id": "frozen_fruits_and_vegetables", "text": "Flash-frozen produce for extended storage. Includes frozen berries, mixed fruits, vegetable medleys, stir-fry blends, broccoli, corn, peas, and individually quick frozen (IQF) produce."},
|
45 |
+
{"id": "frozen_meat_and_seafood", "text": "Frozen animal protein products. Includes frozen chicken breasts/tenders, ground beef, fish fillets, shrimp, scallops, meatballs, and specialty meat products requiring freezer storage."},
|
46 |
+
{"id": "frozen_pizza_and_pasta", "text": "Ready-to-heat frozen Italian-style convenience foods. Includes frozen pizza (thin crust, rising crust, specialty), pizza rolls, frozen pasta dishes, ravioli, and Italian entrees."},
|
47 |
+
{"id": "ice_cream", "text": "Frozen dairy desserts with high milk fat content. Includes traditional ice cream, premium ice cream, ice cream bars, ice cream sandwiches, and dairy-based frozen treats."},
|
48 |
+
{"id": "fruits_and_vegetables", "text": "Fresh produce items. This parent category includes all fresh fruits and vegetables, both whole and prepared."},
|
49 |
+
{"id": "fresh_fruit", "text": "Unprocessed whole fruits. Includes apples, bananas, citrus fruits, berries, grapes, stone fruits, tropical fruits, and seasonal fruit varieties."},
|
50 |
+
{"id": "fresh_herbs", "text": "Fresh culinary herbs for flavoring. Includes basil, cilantro, parsley, mint, rosemary, thyme, dill, chives, and other fresh herb varieties."},
|
51 |
+
{"id": "fresh_vegetables", "text": "Unprocessed whole vegetables. Includes tomatoes, peppers, onions, carrots, broccoli, cauliflower, cucumbers, and mainstream vegetable varieties."},
|
52 |
+
{"id": "ao_fresh_vegetables", "text": "Specialty, artisanal or organic vegetables. Includes heirloom tomatoes, organic produce, specialty greens, rare vegetable varieties, and premium vegetable selections."},
|
53 |
+
{"id": "leafy_greens", "text": "Edible plant leaves for salads and cooking. Includes lettuce varieties, spinach, kale, arugula, mixed salad greens, collards, chard, and cooking greens."},
|
54 |
+
{"id": "mushrooms", "text": "Edible fungi varieties. Includes button mushrooms, cremini, portobello, shiitake, oyster, enoki, chanterelle, and specialty mushroom varieties."},
|
55 |
+
{"id": "potatoes_and_starchy_vegetables", "text": "Root vegetables and high-starch produce. Includes potatoes, sweet potatoes, winter squash, yams, turnips, rutabagas, parsnips, and other starchy vegetables."},
|
56 |
+
{"id": "prepared_produce", "text": "Pre-processed fruits and vegetables for convenience. This parent category includes all ready-to-eat cut fruits and vegetables."},
|
57 |
+
{"id": "fresh_prepared_fruit", "text": "Ready-to-eat cut fruit products. Includes fruit salad, cut melon, pineapple chunks, apple slices, fruit platters, and fresh-cut fruit mixes."},
|
58 |
+
{"id": "fresh_prepared_vegetables", "text": "Ready-to-eat cut vegetable products. Includes vegetable trays, pre-cut stir fry mixes, spiralized vegetables, vegetable noodles, and prepared vegetable medleys."},
|
59 |
+
{"id": "vegetarian_protein_and_asian", "text": "Plant-based protein products and Asian ingredients. Includes tofu, tempeh, seitan, meat alternatives, edamame, Asian noodles, and vegetarian protein options."},
|
60 |
+
{"id": "meat", "text": "Animal protein products. This parent category includes all unprocessed and minimally processed animal proteins."},
|
61 |
+
{"id": "bacon_hot_dogs_and_sausage", "text": "Processed and formed meat products. Includes bacon, breakfast sausage, Italian sausage, hot dogs, bratwurst, kielbasa, and specialty sausage varieties."},
|
62 |
+
{"id": "beef", "text": "Meat products from cattle. Includes ground beef, steaks (ribeye, sirloin, filet), roasts, stew meat, brisket, and specialty beef cuts."},
|
63 |
+
{"id": "chicken", "text": "Poultry products from chickens. Includes whole chickens, breasts, thighs, wings, drumsticks, ground chicken, and boneless/bone-in varieties."},
|
64 |
+
{"id": "pork", "text": "Meat products from pigs. Includes pork chops, tenderloin, ribs, shoulder, ground pork, ham, and specialty pork cuts."},
|
65 |
+
{"id": "seafood", "text": "Edible aquatic animals. Includes fish (salmon, tuna, cod, tilapia), shellfish (shrimp, crab, lobster), mollusks (clams, mussels, oysters), and specialty seafood."},
|
66 |
+
{"id": "specialty_and_organic_meat", "text": "Premium meat products with special attributes. Includes organic meats, grass-fed beef, free-range poultry, heritage breed pork, halal/kosher meats, and specialty game meats."},
|
67 |
+
{"id": "turkey", "text": "Poultry products from turkeys. Includes whole turkeys, turkey breasts, ground turkey, turkey thighs, turkey sausage, and other turkey parts."},
|
68 |
+
{"id": "pantry", "text": "Shelf-stable foods stored at room temperature. This parent category includes all non-perishable food items with extended shelf life."},
|
69 |
+
{"id": "baking", "text": "Ingredients primarily used for baking. This parent category includes all baking ingredients, mixes, and decorating supplies."},
|
70 |
+
{"id": "ao_baking", "text": "Specialty baking ingredients with artisanal or organic attributes. Includes organic flour, specialty sugars, premium chocolate, heirloom grain products, and gourmet baking ingredients."},
|
71 |
+
{"id": "baking_mixes", "text": "Pre-measured dry ingredient combinations. Includes cake mixes, brownie mixes, pancake/waffle mixes, muffin mixes, bread mixes, and biscuit mixes."},
|
72 |
+
{"id": "baking_morsels_bars_and_cocoa", "text": "Chocolate and cocoa products for baking. Includes chocolate chips, baking chocolate bars, cocoa powder, white chocolate chips, and flavored baking morsels."},
|
73 |
+
{"id": "cake_decorations", "text": "Items used to decorate baked goods. Includes sprinkles, decorating icing, food coloring, fondant, sugar decorations, and cake toppers."},
|
74 |
+
{"id": "flour_and_meal", "text": "Ground grain products for baking and cooking. Includes all-purpose flour, bread flour, cake flour, whole wheat flour, almond flour, cornmeal, and specialty flours."},
|
75 |
+
{"id": "frosting", "text": "Ready-to-use cake and cookie toppings. Includes canned frosting, frosting tubes, glaze mixes, icing, cream cheese frosting, and specialty frosting varieties."},
|
76 |
+
{"id": "thickening_and_leavening_agents", "text": "Ingredients that change food texture or help it rise. Includes cornstarch, baking powder, baking soda, yeast, gelatin, pectin, xanthan gum, and arrowroot."},
|
77 |
+
{"id": "boxed_dinners", "text": "Shelf-stable meal kits with minimal preparation. Includes macaroni and cheese, hamburger helper, rice dishes, pasta meals, and boxed dinner kits requiring few additional ingredients."},
|
78 |
+
{"id": "broths_and_stocks", "text": "Liquid cooking bases for soups and recipes. Includes chicken broth, beef stock, vegetable broth, bone broth, bouillon, and cooking stock concentrates."},
|
79 |
+
{"id": "canned_goods", "text": "Food preserved in metal cans or glass jars. This parent category includes all canned and jarred shelf-stable foods."},
|
80 |
+
{"id": "ao_canned_goods", "text": "Premium preserved foods with artisanal or organic attributes. Includes organic canned vegetables, gourmet preserved items, imported specialty canned goods, and premium jarred items."},
|
81 |
+
{"id": "canned_beans", "text": "Legumes preserved in liquid. Includes kidney beans, black beans, chickpeas, pinto beans, baked beans, refried beans, and mixed bean varieties."},
|
82 |
+
{"id": "canned_fruit", "text": "Fruit preserved in syrup or juice. Includes peaches, pears, pineapple, mandarin oranges, fruit cocktail, applesauce, and specialty preserved fruits."},
|
83 |
+
{"id": "canned_meals", "text": "Ready-to-eat complete dishes in cans. Includes ravioli, chili, stew, pasta dishes, hash, and fully-prepared shelf-stable meals requiring minimal preparation."},
|
84 |
+
{"id": "canned_meat_poultry_and_hashes", "text": "Preserved meat products in cans. Includes canned chicken, potted meat, corned beef hash, SPAM, Vienna sausages, and shelf-stable meat products."},
|
85 |
+
{"id": "canned_seafood_and_tuna", "text": "Preserved fish and seafood in cans or pouches. Includes tuna (in water, oil), salmon, sardines, crab meat, clams, anchovies, and specialty canned seafood varieties."},
|
86 |
+
{"id": "canned_soups_and_stews", "text": "Ready-to-eat or condensed liquid meals. Includes cream soups, broth-based soups, chili, condensed soups, ready-to-eat soups, and hearty stews."},
|
87 |
+
{"id": "canned_tomatoes_and_dried_tomatoes", "text": "Preserved tomato products. Includes diced tomatoes, tomato sauce, paste, crushed tomatoes, whole peeled tomatoes, sun-dried tomatoes, and tomato puree."},
|
88 |
+
{"id": "canned_vegetables", "text": "Vegetables preserved in liquid. Includes green beans, corn, peas, carrots, mixed vegetables, mushrooms, asparagus, and specialty canned vegetable varieties."},
|
89 |
+
{"id": "cereal_and_breakfast_food", "text": "Ready-to-eat and hot morning meal foods. Includes cold cereals, hot cereals (oatmeal, grits, cream of wheat), breakfast bars, granola, and breakfast pastries."},
|
90 |
+
{"id": "condiments", "text": "Flavor-enhancing additions to prepared foods. This parent category includes all sauces, spices, and food enhancers."},
|
91 |
+
{"id": "ao_condiments", "text": "Specialty flavor enhancers with artisanal or organic attributes. Includes craft hot sauces, small-batch preserves, organic condiments, and gourmet flavor enhancers."},
|
92 |
+
{"id": "fruit_spreads", "text": "Sweet preserved fruit products. Includes jams, jellies, preserves, marmalade, fruit butters, honey, and specialty fruit spreads."},
|
93 |
+
{"id": "hot_sauces", "text": "Spicy condiments for food enhancement. Includes cayenne pepper sauce, habanero sauce, sriracha, tabasco, chipotle sauce, and specialty hot sauces of varying heat levels."},
|
94 |
+
{"id": "ketchup_mayo_and_mustards", "text": "Common sandwich and burger condiments. Includes ketchup, mayonnaise, yellow mustard, dijon mustard, specialty mustards, aioli, and basic table condiments."},
|
95 |
+
{"id": "nut_butters_and_spreads", "text": "Paste-like products made from ground nuts and seeds. Includes peanut butter, almond butter, cashew butter, sunflower seed butter, hazelnut spread, and specialty nut butters."},
|
96 |
+
{"id": "pickles_and_olives", "text": "Vegetables preserved in brine or vinegar. Includes dill pickles, sweet pickles, relish, green olives, kalamata olives, pickled vegetables, and specialty pickled items."},
|
97 |
+
{"id": "salad_dressings_and_toppings", "text": "Liquid and dry additions for salads. Includes ranch, Italian, balsamic, blue cheese dressings, croutons, salad toppings, and vinaigrettes."},
|
98 |
+
{"id": "sauces_marinades_and_gravy", "text": "Liquid flavor enhancers for cooking and finishing. Includes barbecue sauce, teriyaki sauce, pasta sauce, gravy, marinade, steak sauce, and cooking sauces."},
|
99 |
+
{"id": "sugars_sweeteners_and_honey", "text": "Sweet additions for beverages and baking. Includes granulated sugar, brown sugar, powdered sugar, honey, maple syrup, artificial sweeteners, and sugar alternatives."},
|
100 |
+
{"id": "cooking_oils_and_vinegar", "text": "Liquid fats for cooking and acidic flavor enhancers. Includes vegetable oil, olive oil, specialty oils, white vinegar, balsamic vinegar, apple cider vinegar, and specialty vinegars."},
|
101 |
+
{"id": "dried_pasta_and_pasta_sauces", "text": "Shelf-stable Italian-style noodles and accompanying sauces. This parent category includes all pasta and jarred sauces."},
|
102 |
+
{"id": "dried_pasta", "text": "Shelf-stable wheat or grain-based noodle products. Includes spaghetti, penne, fettuccine, elbow macaroni, specialty shapes, whole wheat pasta, and gluten-free pasta varieties."},
|
103 |
+
{"id": "pasta_sauces", "text": "Ready-to-use flavor bases for pasta dishes. Includes marinara, meat sauce, alfredo, pesto, vodka sauce, and specialty pasta sauce varieties in jars or pouches."},
|
104 |
+
{"id": "dried_soup_mixes_and_bouillon", "text": "Dehydrated soup bases and flavor enhancers. Includes ramen, bouillon cubes, soup mixes, broth concentrates, and instant soup packets requiring water addition."},
|
105 |
+
{"id": "herbs_spices_and_seasonings", "text": "Flavor additions for cooking. This parent category includes all dried herbs, spices, and seasoning blends."},
|
106 |
+
{"id": "dried_herbs_and_spices", "text": "Dehydrated plant parts for flavor enhancement. Includes basil, oregano, cinnamon, cumin, paprika, individual spices, and dried herb varieties."},
|
107 |
+
{"id": "salt_and_pepper", "text": "Basic seasoning agents for cooking and table use. Includes table salt, sea salt, kosher salt, specialty salts, black pepper, white pepper, and peppercorns."},
|
108 |
+
{"id": "seasoning_mixes", "text": "Pre-blended spice combinations for specific dishes. Includes taco seasoning, Italian seasoning, chili powder, poultry seasoning, meat rubs, and meal-specific spice blends."},
|
109 |
+
{"id": "international_foods", "text": "Products from global cuisines organized by region. Includes Mexican, Asian, Mediterranean, Indian, European, and other international food products and ingredients."},
|
110 |
+
{"id": "potatoes_and_stuffing", "text": "Shelf-stable potato products and bread mixes. Includes instant mashed potatoes, scalloped/au gratin potatoes, stuffing mix, and dehydrated potato products."},
|
111 |
+
{"id": "rice_grains_and_dried_beans", "text": "Shelf-stable carbohydrate staples. This parent category includes all uncooked grains and dry legumes."},
|
112 |
+
{"id": "dried_beans", "text": "Dehydrated legumes requiring cooking. Includes pinto beans, black beans, kidney beans, lentils, split peas, chickpeas, and dried bean varieties."},
|
113 |
+
{"id": "grains", "text": "Edible seeds from grass-like plants. Includes barley, quinoa, couscous, bulgur, farro, millet, and ancient grains."},
|
114 |
+
{"id": "rice", "text": "Various processed rice grain varieties. Includes white rice, brown rice, jasmine rice, basmati rice, arborio rice, wild rice, and specialty rice varieties."},
|
115 |
+
{"id": "snacks_and_candy", "text": "Ready-to-eat treats and sweets. This parent category includes all snack foods and confectionery items."},
|
116 |
+
{"id": "chips", "text": "Crispy snack foods in thin, flat format. Includes potato chips, tortilla chips, corn chips, vegetable chips, kettle chips, and flavored chip varieties."},
|
117 |
+
{"id": "chocolate_candy_and_gum", "text": "Sweet confectionery products. This parent category includes all candy items with and without chocolate."},
|
118 |
+
{"id": "candy_and_gum", "text": "Sweet non-chocolate confections. Includes hard candy, chewy candy, gummy candy, licorice, mints, chewing gum, caramels, and non-chocolate sweets."},
|
119 |
+
{"id": "chocolate", "text": "Cocoa-based confections and treats. Includes chocolate bars, chocolate candy, truffles, chocolate-covered nuts/fruits, and chocolate gift boxes."},
|
120 |
+
{"id": "cookies", "text": "Sweet baked treats in individual portions. Includes chocolate chip cookies, sandwich cookies, shortbread, specialty cookies, and packaged cookie varieties."},
|
121 |
+
{"id": "crackers", "text": "Crisp, dry, flat baked snack products. Includes saltines, cheese crackers, graham crackers, water crackers, snack crackers, and specialty cracker varieties."},
|
122 |
+
{"id": "fruit_snacks", "text": "Processed fruit-based treats. Includes fruit leather, fruit snacks, dried fruit rolls, fruit-flavored gummies, and portable fruit-based treats."},
|
123 |
+
{"id": "jerky_and_rinds", "text": "Dried meat snacks and crispy pork products. Includes beef jerky, turkey jerky, meat sticks, pork rinds, chicharrones, and dried meat snack varieties."},
|
124 |
+
{"id": "nuts_and_dried_fruit", "text": "Shelf-stable natural snacks. This parent category includes all nuts, seeds, and dried fruit products."},
|
125 |
+
{"id": "dried_fruit", "text": "Dehydrated fruit products for snacking. Includes raisins, dried cranberries, dried apricots, banana chips, apple rings, and mixed dried fruit varieties."},
|
126 |
+
{"id": "nuts", "text": "Edible seeds and kernels in shells or shelled. Includes almonds, peanuts, cashews, walnuts, pistachios, mixed nuts, and specialty nut varieties."},
|
127 |
+
{"id": "packaged_snack_cakes", "text": "Factory-produced sweet baked goods with extended shelf life. Includes snack cakes, mini muffins, donettes, cream-filled cakes, and individually wrapped sweet treats."},
|
128 |
+
{"id": "popcorn_and_pretzels", "text": "Crunchy grain-based snack foods. Includes microwave popcorn, ready-to-eat popcorn, hard pretzels, soft pretzels, pretzel bites, and flavored varieties."},
|
129 |
+
{"id": "snack_bars", "text": "Portable compressed food items in bar form. Includes granola bars, protein bars, cereal bars, energy bars, fruit bars, and meal replacement bars."},
|
130 |
+
{"id": "baby_and_child", "text": "Products specifically designed for infants and young children. Includes baby food, formula, diapers, baby wipes, child-specific snacks, and infant care items."},
|
131 |
+
{"id": "pet_products", "text": "Items for domestic animals. Includes dog food, cat food, pet treats, litter, pet supplies, toys, accessories, and pet care products."},
|
132 |
+
{"id": "personal_care", "text": "Products for human hygiene and grooming. Includes soap, shampoo, deodorant, lotion, toothpaste, feminine care, and toiletry items."},
|
133 |
+
{"id": "household_and_cleaning", "text": "Products for home maintenance. Includes cleaning supplies, laundry products, paper goods, storage items, and household essentials."},
|
134 |
+
{"id": "health_and_pharmacy", "text": "Products related to health and wellness. Includes over-the-counter medications, vitamins, supplements, first aid supplies, and pharmacy items."},
|
135 |
+
{"id": "floral_and_garden", "text": "Plant products and gardening supplies. Includes cut flowers, potted plants, bouquets, seeds, soil, garden tools, and seasonal plant items."},
|
136 |
+
{"id": "kitchenware", "text": "Tools and equipment for food preparation. Includes cookware, utensils, gadgets, small appliances, food storage, and kitchen accessories."},
|
137 |
+
{"id": "paper_products", "text": "Disposable paper-based household items. Includes paper towels, toilet paper, facial tissue, napkins, paper plates, and disposable tableware."},
|
138 |
+
{"id": "seasonal_and_holiday", "text": "Items specific to holidays or times of year. Includes decorations, seasonal foods, holiday-themed products, and limited-time specialty items."},
|
139 |
+
{"id": "electronics_and_media", "text": "Electronic devices and entertainment products. Includes batteries, chargers, headphones, small electronics, DVDs, magazines, and basic media items."}
|
140 |
+
]
|
category_embeddings.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:abc3b4442b669e95e7e8c218fe5f5f9ea989dbe98b460f9b76dc0064a204725e
|
3 |
+
size 1276161
|
category_matching.py
ADDED
@@ -0,0 +1,258 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import numpy as np
|
3 |
+
import pickle
|
4 |
+
import os.path
|
5 |
+
from typing import Dict, List, Any, Tuple
|
6 |
+
from embeddings import create_product_embeddings
|
7 |
+
from similarity import compute_similarities
|
8 |
+
from utils import SafeProgress
|
9 |
+
import voyageai
|
10 |
+
|
11 |
+
# Update default path to be consistent
|
12 |
+
DEFAULT_CATEGORY_EMBEDDINGS_PATH = "data/category_embeddings.pickle"
|
13 |
+
|
14 |
+
def load_categories(file_path="categories.json") -> Dict[str, str]:
    """
    Load categories from a JSON file.

    The file is expected to contain a list of objects, each carrying an
    "id" and a "text" field.

    Args:
        file_path: Path to the categories JSON file

    Returns:
        Dictionary mapping category IDs to their descriptions; an empty
        dict if the file is missing or malformed (best-effort by design).
    """
    try:
        # Explicit encoding: categories.json is UTF-8, and relying on the
        # platform default (e.g. cp1252 on Windows) can corrupt or fail the load.
        with open(file_path, 'r', encoding='utf-8') as f:
            categories_list = json.load(f)

        # Convert the list-of-objects form to {id: text} for O(1) lookups.
        categories = {item["id"]: item["text"] for item in categories_list}
        print(f"Loaded {len(categories)} categories")
        return categories
    except Exception as e:
        # Best-effort loader: log and return an empty mapping rather than crash.
        print(f"Error loading categories: {e}")
        return {}
|
35 |
+
|
36 |
+
def create_category_embeddings(categories: Dict[str, str], progress=None,
                             pickle_path=DEFAULT_CATEGORY_EMBEDDINGS_PATH,
                             force_regenerate=False) -> Dict[str, Any]:
    """
    Create embeddings for category descriptions, with pickle-file caching.

    Args:
        categories: Dictionary mapping category IDs to their descriptions
        progress: Optional progress tracking object
        pickle_path: Path to the pickle file for caching embeddings
        force_regenerate: If True, regenerate embeddings even if cache exists

    Returns:
        Dictionary mapping category IDs to their embeddings
    """
    progress_tracker = SafeProgress(progress, desc="Generating category embeddings")

    # Fast path: reuse cached embeddings unless regeneration is forced.
    if not force_regenerate and os.path.exists(pickle_path):
        progress_tracker(0.1, desc=f"Loading cached embeddings from {pickle_path}")
        try:
            with open(pickle_path, 'rb') as f:
                # NOTE(review): pickle.load on an untrusted file can execute
                # arbitrary code; acceptable here because the cache is written
                # by this same module.
                category_embeddings = pickle.load(f)
            progress_tracker(1.0, desc=f"Loaded embeddings for {len(category_embeddings)} categories from cache")
            return category_embeddings
        except Exception as e:
            print(f"Error loading cached embeddings: {e}")
            # Cache unreadable: fall through and regenerate.

    progress_tracker(0.1, desc=f"Processing {len(categories)} categories")

    # Embed the description text with the same embedder used for products,
    # so category and product vectors live in the same vector space.
    category_ids = list(categories.keys())
    category_texts = list(categories.values())

    texts_with_embeddings = create_product_embeddings(category_texts, progress=progress)

    # Map embeddings back to category IDs (the embedder keys by text).
    category_embeddings = {}
    for i, category_id in enumerate(category_ids):
        if i < len(category_texts) and category_texts[i] in texts_with_embeddings:
            category_embeddings[category_id] = texts_with_embeddings[category_texts[i]]

    # Ensure the target directory exists. Guard the bare-filename case:
    # os.path.dirname("file.pickle") == "" and os.makedirs("") raises
    # FileNotFoundError.
    parent_dir = os.path.dirname(pickle_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Persist to the cache; failure to save is non-fatal (embeddings are
    # still returned, they just won't be reused next run).
    progress_tracker(0.9, desc=f"Saving embeddings to {pickle_path}")
    try:
        with open(pickle_path, 'wb') as f:
            pickle.dump(category_embeddings, f)
    except Exception as e:
        print(f"Error saving embeddings to pickle file: {e}")

    progress_tracker(1.0, desc=f"Completed embeddings for {len(category_embeddings)} categories")
    return category_embeddings
93 |
+
|
94 |
+
def load_category_embeddings(pickle_path=DEFAULT_CATEGORY_EMBEDDINGS_PATH) -> Dict[str, Any]:
    """
    Load pre-computed category embeddings from a pickle file.

    Args:
        pickle_path: Path to the pickle file with cached embeddings

    Returns:
        Dictionary mapping category IDs to their embeddings; an empty
        dict when the file is absent or cannot be unpickled.
    """
    # Missing cache: report and hand back an empty mapping.
    if not os.path.exists(pickle_path):
        print(f"No embeddings found at {pickle_path}")
        return {}

    try:
        with open(pickle_path, 'rb') as handle:
            embeddings = pickle.load(handle)
    except Exception as err:
        # Corrupt/unreadable cache behaves the same as a missing one,
        # after logging the underlying error.
        print(f"Error loading cached embeddings: {err}")
        print(f"No embeddings found at {pickle_path}")
        return {}

    print(f"Loaded embeddings for {len(embeddings)} categories from {pickle_path}")
    return embeddings
|
115 |
+
|
116 |
+
def match_products_to_categories(product_names: List[str], categories: Dict[str, str], top_n=5,
                             confidence_threshold=0.5, progress=None,
                             embeddings_path=DEFAULT_CATEGORY_EMBEDDINGS_PATH) -> Dict[str, List]:
    """
    Match products to their most likely categories via embedding similarity.

    Args:
        product_names: List of product names to categorize
        categories: Dictionary mapping category IDs to their descriptions
        top_n: Number of top categories to return per product
        confidence_threshold: Minimum similarity score to include
        progress: Optional progress tracking object
        embeddings_path: Path to pre-computed category embeddings

    Returns:
        Dictionary mapping each product to a list of
        (category_id, category_text, score) tuples.
    """
    tracker = SafeProgress(progress, desc="Matching products to categories")

    # Category embeddings: prefer the on-disk cache, build them otherwise.
    tracker(0.2, desc="Loading category embeddings")
    category_vectors = load_category_embeddings(embeddings_path)
    if not category_vectors:
        tracker(0.3, desc="Creating category embeddings")
        category_vectors = create_category_embeddings(categories, progress, pickle_path=embeddings_path)

    # Product embeddings are computed fresh for every call.
    tracker(0.4, desc="Creating product embeddings")
    product_vectors = create_product_embeddings(product_names, progress=progress)

    tracker(0.6, desc="Computing similarities")
    similarity_map = compute_similarities(category_vectors, product_vectors)

    # Keep, per product, the first top_n matches at or above the threshold,
    # annotated with the category's description text.
    tracker(0.8, desc="Processing results")
    results = {}
    for product_name, scored_pairs in similarity_map.items():
        matches = []
        for category_id, score in scored_pairs:
            if score < confidence_threshold:
                continue
            if len(matches) >= top_n:
                break
            matches.append((category_id, categories.get(category_id, "Unknown"), score))
        results[product_name] = matches

    tracker(1.0, desc="Completed category matching")
    return results
|
169 |
+
|
170 |
+
def hybrid_category_matching(products: List[str], categories: Dict[str, str],
                          embedding_top_n: int = 20, final_top_n: int = 5,
                          confidence_threshold: float = 0.5,
                          progress=None) -> Dict[str, List[Tuple]]:
    """
    Two-stage matching: embeddings retrieve candidate categories, then a
    Voyage AI re-ranker orders them.

    Args:
        products: List of product names to categorize
        categories: Dictionary mapping category IDs to their descriptions
        embedding_top_n: Number of top categories to retrieve using embeddings
        final_top_n: Number of final categories to return after re-ranking
        confidence_threshold: Minimum score threshold for final results
        progress: Optional progress tracking object

    Returns:
        Dictionary mapping products to their matched categories with scores
    """
    progress_tracker = SafeProgress(progress, desc="Hybrid category matching")
    progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")

    # Stage 1: embedding retrieval with a generous candidate pool so the
    # re-ranker has room to reorder.
    embedding_results = match_products_to_categories(
        products,
        categories,
        top_n=embedding_top_n,
        progress=progress_tracker
    )

    progress_tracker(0.4, desc="Stage 2: Re-ranking candidates")

    # Voyage client reads its API key from the environment.
    client = voyageai.Client()

    # Stage 2: re-rank each product's candidates.
    final_results = {}

    for i, product in enumerate(progress_tracker.tqdm(products, desc="Re-ranking product candidates")):
        progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")

        # No embedding candidates means nothing to re-rank.
        candidates = embedding_results.get(product) or []
        if not candidates:
            final_results[product] = []
            continue

        # candidates are (category_id, category_text, score) tuples.
        candidate_ids = [c[0] for c in candidates]
        candidate_texts = [f"Category: {c[1]}" for c in candidates]

        try:
            query = f"Which category best describes the product: {product}"
            reranking = client.rerank(
                query=query,
                documents=candidate_texts,
                model="rerank-2",
                top_k=final_top_n
            )

            product_categories = []
            for result in reranking.results:
                # Prefer the API-provided document index: matching by text via
                # list.index() silently maps to the FIRST candidate whenever two
                # categories share the same description.
                candidate_index = getattr(result, "index", None)
                if candidate_index is None:
                    candidate_index = candidate_texts.index(result.document)
                category_id = candidate_ids[candidate_index]
                score = result.relevance_score

                # Only keep results above the confidence threshold.
                if score >= confidence_threshold:
                    product_categories.append((category_id, result.document, score))

            print(f"Product: {product}")
            print(f"Top 3 candidates before re-ranking: {candidates[:3]}")
            print(f"Top 3 candidates after re-ranking: {product_categories[:3]}")

            final_results[product] = product_categories

        except Exception as e:
            print(f"Error during re-ranking for '{product}': {e}")
            # Fall back to embedding ranking if the rerank call fails.
            final_results[product] = candidates[:final_top_n]

    progress_tracker(1.0, desc="Hybrid matching complete")
    return final_results
|
chicory_api.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import requests
|
2 |
+
import json
|
3 |
+
import time
|
4 |
+
from typing import List, Dict, Any, Optional
|
5 |
+
from utils import SafeProgress
|
6 |
+
|
7 |
+
def call_chicory_parser(product_names: List[str], batch_size: int = 25, delay_seconds: float = 0.1, progress=None) -> Dict[str, Any]:
    """
    Call the Chicory Parser V3 API to get ingredient predictions.

    Args:
        product_names: List of product names to parse
        batch_size: Maximum number of products to process in one batch
        delay_seconds: Delay between batches in seconds
        progress: Optional progress tracking object (Gradio progress bar)

    Returns:
        Dictionary mapping product names to their Chicory Parser results
    """
    tracker = SafeProgress(progress, desc="Parsing products")
    total = len(product_names)

    # Small inputs fit in a single API request -- no batching required.
    if total <= batch_size:
        tracker(0.1, desc=f"Parsing {total} products...")
        parsed = _make_chicory_api_call(product_names)
        tracker(1.0, desc="Parsing complete")
        return parsed

    # Otherwise split the work into fixed-size index ranges.
    merged = {}
    num_batches = (total + batch_size - 1) // batch_size
    ranges = [(start, min(start + batch_size, total))
              for start in range(0, total, batch_size)]

    # Drive the ranges through the tracker's tqdm wrapper for batch-level progress.
    for batch_idx, (start, end) in enumerate(tracker.tqdm(ranges, desc="Processing batches")):
        chunk = product_names[start:end]

        # Report mid-batch progress with a specific description.
        tracker((batch_idx + 0.5) / num_batches,
                desc=f"Batch {batch_idx + 1}/{num_batches}: {len(chunk)} products")

        merged.update(_make_chicory_api_call(chunk))

        # Throttle between batches, but skip the pause after the final one.
        if end < total:
            time.sleep(delay_seconds)

    tracker(1.0, desc=f"Completed parsing {total} products")
    return merged
|
55 |
+
|
56 |
+
def _make_chicory_api_call(product_names: List[str]) -> Dict[str, Any]:
    """
    Make a single request to the Chicory Parser V3 prediction endpoint.

    Args:
        product_names: Product names to parse in this request.

    Returns:
        Dictionary mapping each product name (echoed back by the API as
        "input_text") to its full prediction result. Returns an empty dict
        on any request or parse failure instead of raising.
    """
    url = "https://prod-parserv3.chicoryapp.com/api/v3/prediction"

    # The API expects a JSON body of {"items": [{"id": ..., "text": ...}, ...]}.
    items = [{"id": i, "text": name} for i, name in enumerate(product_names)]
    payload = json.dumps({"items": items})

    # Set headers
    headers = {
        'Content-Type': 'application/json'
    }

    try:
        # A timeout prevents a stalled connection from hanging the caller
        # forever (requests has no default timeout).
        response = requests.post(url, headers=headers, data=payload, timeout=30)
        response.raise_for_status()  # Raise exception for HTTP errors

        # Parse the response
        results = response.json()

        # Key results by the echoed input text so callers can look up by name.
        # NOTE(review): duplicate product names collapse to the last result.
        product_results = {}
        for result in results:
            product_name = result["input_text"]
            product_results[product_name] = result

        return product_results

    except requests.exceptions.RequestException as e:
        print(f"Error calling Chicory Parser API: {e}")
        return {}
    except json.JSONDecodeError:
        # response is always bound here: decoding only happens after the POST succeeded.
        print(f"Error parsing Chicory API response: {response.text}")
        return {}
|
comparison.py
ADDED
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import numpy as np
|
3 |
+
from typing import Dict, List, Tuple, Any
|
4 |
+
import concurrent.futures
|
5 |
+
import time
|
6 |
+
import os
|
7 |
+
from api_utils import get_openai_client, get_voyage_client, process_in_parallel, rank_ingredients_openai
|
8 |
+
from ui_formatters import format_comparison_html, create_results_container
|
9 |
+
|
10 |
+
def compare_ingredient_methods(products: List[str], ingredients_dict: Dict[str, Any],
                              embedding_top_n: int = 20, final_top_n: int = 3,
                              confidence_threshold: float = 0.5,
                              progress=None) -> Dict[str, Dict[str, List[Tuple]]]:
    """
    Compare four different methods for ingredient matching:
    1. Base embeddings (without re-ranking)
    2. Voyage AI reranker (via hybrid approach)
    3. Chicory parser
    4. GPT-4o structured output

    Args:
        products: List of product names to categorize
        ingredients_dict: Dictionary of ingredient names to embeddings
        embedding_top_n: Number of top ingredients to retrieve using embeddings
        final_top_n: Number of final results to show for each method
        confidence_threshold: Minimum score threshold for final results
        progress: Optional progress tracking object

    Returns:
        Dictionary mapping products to a per-method dict with keys
        "base", "voyage", "chicory", "openai"; each value is a list of
        (ingredient, score) tuples.
    """
    # NOTE(review): imported inside the function, presumably to avoid circular
    # imports between modules -- confirm before hoisting to module level.
    # preprocess_product_for_matching is imported but not used in this function.
    from utils import SafeProgress, preprocess_product_for_matching
    from embeddings import create_product_embeddings
    from chicory_api import call_chicory_parser
    from similarity import compute_similarities

    progress_tracker = SafeProgress(progress, desc="Comparing ingredient matching methods")

    # Step 1: Generate embeddings for all products (used by multiple methods)
    progress_tracker(0.1, desc="Generating product embeddings")
    product_embeddings = create_product_embeddings(products, progress=progress_tracker)

    # Step 2: Get embedding-based candidates for all products
    progress_tracker(0.2, desc="Finding embedding candidates")
    similarities = compute_similarities(ingredients_dict, product_embeddings)

    # Filter to top N candidates per product (candidates feed every method below)
    embedding_results = {}
    for product, product_similarities in similarities.items():
        embedding_results[product] = product_similarities[:embedding_top_n]

    # Step 3: Call Chicory Parser API (this is done for all products at once)
    progress_tracker(0.3, desc="Calling Chicory Parser API")
    chicory_results = call_chicory_parser(products, progress=progress_tracker)

    # Create final results dictionary with base embeddings (which don't need any further processing)
    comparison_results = {}
    for product in products:
        if product in embedding_results:
            # Initialize with base embeddings already calculated
            candidates = embedding_results[product]
            base_results = [(c[0], c[1]) for c in candidates[:final_top_n] if c[1] >= confidence_threshold]
            comparison_results[product] = {
                "base": base_results,
                "voyage": [],
                "chicory": [],
                "openai": []
            }

            # Also process Chicory results immediately as they're already fetched
            chicory_matches = []
            if product in chicory_results:
                chicory_data = chicory_results[product]
                # NOTE(review): assumes the Chicory payload is a flat dict with
                # "ingredient" and "confidence" keys -- verify against the API.
                if isinstance(chicory_data, dict):
                    ingredient = chicory_data.get("ingredient", "")
                    confidence = chicory_data.get("confidence", 0)
                    if ingredient and confidence >= confidence_threshold:
                        chicory_matches.append((ingredient, confidence))
            comparison_results[product]["chicory"] = chicory_matches
        else:
            # Product had no embedding candidates: keep an empty entry so the
            # UI can still render a row for it.
            comparison_results[product] = {
                "base": [],
                "voyage": [],
                "chicory": [],
                "openai": []
            }

    # Initialize clients for reranking - REPLACED WITH UTILITY FUNCTIONS
    voyage_client = get_voyage_client()
    openai_client = get_openai_client()

    # Define the methods that will be executed in parallel (now focused only on the API-heavy tasks)
    def process_voyage_reranking(product):
        # Rerank one product's embedding candidates with Voyage; returns
        # (product, [(ingredient, score), ...]).
        if product not in embedding_results or not embedding_results[product]:
            return product, []

        candidates = embedding_results[product]
        candidate_ingredients = [c[0] for c in candidates]
        candidate_texts = [f"Ingredient: {c[0]}" for c in candidates]

        try:
            # Apply Voyage reranking to the candidates
            query = product  # Use product directly as query
            reranking = voyage_client.rerank(
                query=query,
                documents=candidate_texts,
                model="rerank-2",
                top_k=final_top_n
            )

            # Process reranking results
            voyage_ingredients = []
            for result in reranking.results:
                # Find the ingredient for this result.
                # NOTE(review): list.index picks the first match, so duplicate
                # candidate texts would map to the same ingredient.
                candidate_index = candidate_texts.index(result.document)
                ingredient = candidate_ingredients[candidate_index]
                score = float(result.relevance_score)

                # Only include results above the confidence threshold
                if score >= confidence_threshold:
                    voyage_ingredients.append((ingredient, score))

            return product, voyage_ingredients
        except Exception as e:
            print(f"Error during Voyage reranking for '{product}': {e}")
            # Fall back to embedding results
            return product, [(c[0], c[1]) for c in candidates[:final_top_n] if c[1] >= confidence_threshold]

    def process_openai(product):
        # Rank one product's embedding candidates with the OpenAI helper;
        # returns (product, [(ingredient, score), ...]).
        if product not in embedding_results or not embedding_results[product]:
            return product, []

        candidates = embedding_results[product]
        candidate_ingredients = [c[0] for c in candidates]

        try:
            # Use the shared utility function
            openai_ingredients = rank_ingredients_openai(
                product=product,
                candidates=candidate_ingredients,
                client=openai_client,
                model="o3-mini",
                max_results=final_top_n,
                confidence_threshold=confidence_threshold
            )

            return product, openai_ingredients
        except Exception as e:
            print(f"Error during OpenAI processing for '{product}': {e}")
            # Fall back to embedding results
            return product, [(c[0], c[1]) for c in candidates[:final_top_n] if c[1] >= confidence_threshold]

    # Process Voyage AI reranking in parallel - REPLACED WITH SHARED UTILITY
    progress_tracker(0.4, desc="Running Voyage AI reranking in parallel")
    voyage_results = process_in_parallel(
        items=products,
        processor_func=process_voyage_reranking,
        max_workers=min(20, len(products)),
        progress_tracker=progress_tracker,
        progress_start=0.4,
        progress_end=0.65,
        progress_desc="Voyage AI"
    )

    # Update comparison results with Voyage results
    for product, results in voyage_results.items():
        if product in comparison_results:
            comparison_results[product]["voyage"] = results

    # Process OpenAI queries in parallel - REPLACED WITH SHARED UTILITY
    progress_tracker(0.7, desc="Running OpenAI processing in parallel")
    openai_results = process_in_parallel(
        items=products,
        processor_func=process_openai,
        max_workers=min(20, len(products)),
        progress_tracker=progress_tracker,
        progress_start=0.7,
        progress_end=0.95,
        progress_desc="OpenAI"
    )

    # Update comparison results with OpenAI results
    for product, results in openai_results.items():
        if product in comparison_results:
            comparison_results[product]["openai"] = results

    progress_tracker(1.0, desc="Comparison complete")
    return comparison_results
|
189 |
+
|
190 |
+
def compare_ingredient_methods_ui(product_input, is_file=False, embedding_top_n=20,
                                 final_top_n=3, confidence_threshold=0.5, progress=None):
    """
    Compare multiple ingredient matching methods on the same products

    Args:
        product_input: Text input with product names or file path
        is_file: Whether the input is a file.
            NOTE(review): currently unused -- product_input is always treated
            as newline-separated text; confirm whether file support is still
            intended.
        embedding_top_n: Number of top ingredients to retrieve using embeddings
        final_top_n: Number of final results to show for each method
        confidence_threshold: Minimum score threshold for final results
        progress: Optional progress tracking object

    Returns:
        HTML formatted comparison results (or a plain error string when the
        input is empty)
    """
    # Imported here rather than at module level -- presumably to avoid a
    # circular import; confirm before hoisting.
    from utils import SafeProgress, load_embeddings

    progress_tracker = SafeProgress(progress, desc="Comparing ingredient matching methods")
    progress_tracker(0.1, desc="Processing input")


    # Split text input by lines and remove empty lines
    if not product_input:
        return "Please enter at least one product."
    product_names = [p.strip() for p in product_input.split('\n') if p.strip()]
    if not product_names:
        return "Please enter at least one product."

    # Load ingredient embeddings
    try:
        progress_tracker(0.2, desc="Loading ingredient embeddings")
        ingredients_dict = load_embeddings("data/ingredient_embeddings_voyageai.pkl")

        progress_tracker(0.3, desc="Comparing methods")
        comparison_results = compare_ingredient_methods(
            products=product_names,
            ingredients_dict=ingredients_dict,
            embedding_top_n=embedding_top_n,
            final_top_n=final_top_n,
            confidence_threshold=confidence_threshold,
            progress=progress_tracker
        )
    except Exception as e:
        # Surface the full traceback in the UI so failures are debuggable.
        import traceback
        error_details = traceback.format_exc()
        return f"<div style='color: red;'>Error comparing methods: {str(e)}<br><pre>{error_details}</pre></div>"

    # Format results as HTML using centralized formatters
    progress_tracker(0.9, desc="Formatting results")

    result_elements = []
    for product in product_names:
        if product in comparison_results:
            result_elements.append(format_comparison_html(product, comparison_results[product]))

    output_html = create_results_container(
        result_elements,
        header_text=f"Comparing {len(product_names)} products using multiple ingredient matching methods."
    )

    progress_tracker(1.0, desc="Complete")
    return output_html
|
config.py
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
# UI configuration. UI_THEME is read by main.py at startup and passed to
# ui_formatters.set_theme().
UI_THEME = "dark"  # "light" or "dark"
|
data/category_embeddings.pickle
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8c51642451d7f5853975e974b46d7466c1a4c238f9caaa302c7ad454111c4fed
|
3 |
+
size 1275461
|
data/ingredient_embeddings_voyageai.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:394e5ca827ca948d6e44d830b12e071c24ac5898a52b9ce00ff54480a0f3e3c0
|
3 |
+
size 27292336
|
debug_embeddings.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Debug tool for checking ingredient embeddings
|
4 |
+
Run with: python debug_embeddings.py [optional_embeddings_path]
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
import sys
|
9 |
+
import json
|
10 |
+
import pickle
|
11 |
+
import logging
|
12 |
+
from pathlib import Path
|
13 |
+
|
14 |
+
# Configure logging
|
15 |
+
logging.basicConfig(
|
16 |
+
level=logging.INFO,
|
17 |
+
format='%(asctime)s - %(levelname)s - %(message)s'
|
18 |
+
)
|
19 |
+
logger = logging.getLogger('debug_embeddings')
|
20 |
+
|
21 |
+
def check_embeddings_file(filepath):
    """Check that an embeddings file exists, loads, and looks structurally valid."""
    logger.info(f"Checking embeddings file: {filepath}")

    # The file must exist before anything else.
    if not os.path.exists(filepath):
        logger.error(f"ERROR: Embeddings file not found at {filepath}")
        return False

    # Report the size in megabytes for a quick sanity check.
    size_mb = os.path.getsize(filepath) / (1024 * 1024)
    logger.info(f"File size: {size_mb:.2f} MB")

    # Extension decides the loader: pickle for .pkl/.pickle, JSON otherwise.
    pickle_format = filepath.endswith(('.pkl', '.pickle'))

    try:
        if pickle_format:
            with open(filepath, 'rb') as fh:
                data = pickle.load(fh)
        else:
            with open(filepath, 'r') as fh:
                data = json.load(fh)

        if not isinstance(data, dict):
            logger.error("ERROR: Embeddings file is not a valid dictionary")
            return False

        num_ingredients = len(data)
        logger.info(f"Number of ingredients/categories: {num_ingredients}")

        if num_ingredients == 0:
            logger.error("ERROR: Embeddings dictionary is empty")
            return False

        # Spot-check up to three random entries and report their shape/type.
        import random
        sample_keys = random.sample(list(data.keys()), min(3, len(data)))
        logger.info(f"Sample keys: {sample_keys}")

        for key in sample_keys:
            embedding = data[key]
            if isinstance(embedding, list):
                embedding_dim = len(embedding)
                logger.info(f"Embedding for '{key}' is a list with dimension: {embedding_dim}")
            elif hasattr(embedding, 'shape'):  # numpy array
                logger.info(f"Embedding for '{key}' is a numpy array with shape: {embedding.shape}")
            else:
                logger.info(f"Embedding for '{key}' is of type: {type(embedding)}")

        return True

    except json.JSONDecodeError:
        logger.error("ERROR: File is not valid JSON")
        return False
    except pickle.UnpicklingError:
        logger.error("ERROR: File is not a valid pickle file")
        return False
    except Exception as e:
        logger.error(f"ERROR: Unexpected error checking embeddings: {str(e)}")
        return False
|
83 |
+
|
84 |
+
def main():
    """Validate the embeddings file given by argv/env/default and suggest fixes."""
    # Get embeddings path from argument or environment or default
    if len(sys.argv) > 1:
        filepath = sys.argv[1]
    else:
        filepath = os.environ.get('EMBEDDINGS_PATH', 'data/ingredient_embeddings_voyageai.pkl')

    # Check if path exists and is valid
    if check_embeddings_file(filepath):
        logger.info("✅ Embeddings file looks valid!")

        # Suggest setting environment variable if not already set
        if 'EMBEDDINGS_PATH' not in os.environ:
            logger.info(f"TIP: Set the EMBEDDINGS_PATH environment variable to: {filepath}")
            logger.info(f"   export EMBEDDINGS_PATH=\"{filepath}\"")
    else:
        logger.error("❌ Embeddings file has issues that need to be fixed")

        # Known-good relative paths for the project's embedding files.
        specific_files = [
            'data/ingredient_embeddings_voyageai.pkl',
            'data/category_embeddings.pickle'
        ]

        # Look for embedding files in data directory
        data_dir = Path('data')
        if data_dir.exists():
            logger.info("Checking 'data' directory for embedding files:")
            for file in data_dir.glob('*embed*.p*'):
                logger.info(f"  - {file}")
                # BUGFIX: compare the full relative path, not file.name --
                # file.name ('foo.pkl') never matched the 'data/foo.pkl'
                # entries in specific_files, so the hint was never printed.
                if str(file) in specific_files:
                    logger.info(f"  ✓ Found target file: {file}")
                    logger.info(f"    Try running with: python debug_embeddings.py {file}")

        # Look for similar files that might be the correct embeddings
        dir_path = os.path.dirname(filepath) or '.'
        try:
            similar_files = list(Path(dir_path).glob("*embed*.p*"))
            if similar_files:
                logger.info("Found similar files that might contain embeddings:")
                for file in similar_files:
                    logger.info(f"  - {file}")
        except Exception:
            # Best-effort discovery only; never let the hint logic crash the tool.
            pass

if __name__ == "__main__":
    main()
|
embeddings.py
ADDED
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Dict, Any, Optional
|
2 |
+
from utils import SafeProgress
|
3 |
+
import os
|
4 |
+
import voyageai
|
5 |
+
import time
|
6 |
+
import numpy as np
|
7 |
+
from concurrent.futures import ThreadPoolExecutor
|
8 |
+
|
9 |
+
# Set Voyage AI API key directly
|
10 |
+
voyageai.api_key = os.getenv("VOYAGE_API_KEY")
|
11 |
+
|
12 |
+
def get_embeddings_batch(texts, model="voyage-3-large", batch_size=100):
    """
    Get embeddings for a list of texts in batches.

    Args:
        texts: Strings to embed. Newlines are replaced with spaces first.
        model: Voyage AI embedding model name.
        batch_size: Maximum number of texts per API request.

    Returns:
        List of embeddings aligned index-for-index with ``texts``. Every
        entry of a failed batch is ``None`` so callers can detect and skip it.
    """
    all_embeddings = []

    # Pre-process all texts to replace newlines
    texts = [text.replace("\n", " ") for text in texts]

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]

        try:
            response = voyageai.Embedding.create(input=batch, model=model)
            batch_embeddings = [item['embedding'] for item in response['data']]
            all_embeddings.extend(batch_embeddings)

            # Sleep briefly to avoid rate limits (skip after the final batch)
            if i + batch_size < len(texts):
                time.sleep(0.5)

        except Exception as e:
            print(f"Error in batch {i//batch_size + 1}: {e}")
            # Add empty embeddings for failed batch to keep alignment with input
            all_embeddings.extend([None] * len(batch))

    return all_embeddings
|
39 |
+
|
40 |
+
def create_product_embeddings(products: List[str], batch_size: int = 100, progress=None) -> Dict[str, Any]:
    """
    Create embeddings for product names with optimization for duplicates

    Args:
        products: List of product names to create embeddings for
        batch_size: Maximum number of products to process in one batch
        progress: Optional progress tracking object (Gradio progress bar)

    Returns:
        Dictionary mapping product names to ``{"embedding": <vector>}``.
        Products whose embedding call failed are silently omitted.
    """
    progress_tracker = SafeProgress(progress, desc="Generating embeddings")
    total_products = len(products)

    # Initialize results dictionary
    product_embeddings = {}

    # Use the same model as for ingredients (voyage-3-large)
    model = "voyage-3-large"

    # Process in batches with de-duplication
    progress_tracker(0.1, desc=f"Starting embeddings for {total_products} products")

    # De-duplication step: embed each distinct product name only once and
    # fan the result back out to every occurrence.
    unique_products = []
    product_to_index = {}  # product name -> position in unique_products
    index_map = {}  # Maps original index to index in unique_products

    for i, product in enumerate(products):
        if product in product_to_index:
            # Product already seen, just store the mapping
            index_map[i] = product_to_index[product]
        else:
            # New unique product
            product_to_index[product] = len(unique_products)
            index_map[i] = len(unique_products)
            unique_products.append(product)

    progress_tracker(0.2, desc=f"Found {len(unique_products)} unique products out of {total_products} total")

    if len(unique_products) == 0:
        progress_tracker(1.0, desc="No valid products to process")
        return {}

    # Get embeddings in batches for unique products only
    try:
        # Pre-process all texts to replace newlines
        clean_products = [product.replace("\n", " ") for product in unique_products]

        progress_tracker(0.3, desc=f"Calling VoyageAI API for {len(clean_products)} unique products")

        # Process in smaller batches for better reliability
        unique_embeddings = get_embeddings_batch(clean_products, model=model, batch_size=batch_size)

        # Map embeddings back to all products
        progress_tracker(0.8, desc=f"Mapping embeddings back to all products")
        for i, product in enumerate(products):
            unique_idx = index_map[i]
            # Skip None entries (failed batches) so callers only see valid vectors.
            if unique_idx < len(unique_embeddings) and unique_embeddings[unique_idx] is not None:
                # Store as dictionary with 'embedding' key for consistent format
                product_embeddings[product] = {
                    "embedding": unique_embeddings[unique_idx]
                }

        progress_tracker(0.9, desc="Processing embeddings completed")

    except Exception as e:
        # Best-effort: report the failure but still return whatever was mapped.
        progress_tracker(0.9, desc=f"Error generating embeddings: {str(e)}")
        print(f"Error generating product embeddings: {e}")

    progress_tracker(1.0, desc=f"Completed embeddings for {len(product_embeddings)} products")
    return product_embeddings
|
113 |
+
|
114 |
+
def _generate_embeddings_for_batch(batch: List[str]) -> Dict[str, Any]:
|
115 |
+
"""
|
116 |
+
Generate embeddings for a batch of products
|
117 |
+
"""
|
118 |
+
# This is a placeholder for your actual embedding generation logic
|
119 |
+
# Replace with your actual implementation
|
120 |
+
import time
|
121 |
+
|
122 |
+
# Your existing embedding code should go here instead of this placeholder
|
123 |
+
embeddings = {}
|
124 |
+
for product in batch:
|
125 |
+
# Replace with actual embedding creation
|
126 |
+
embeddings[product] = {"embedding": [0.1, 0.2, 0.3]}
|
127 |
+
|
128 |
+
return embeddings
|
generate_category_embeddings.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import pickle
|
3 |
+
from category_matching import load_categories, create_category_embeddings
|
4 |
+
|
5 |
+
def main(categories_file, output_file):
    """Load categories, embed them with Voyage AI, and pickle the embeddings."""
    # Load categories from the JSON file; bail out early if nothing loaded.
    categories = load_categories(categories_file)
    if not categories:
        print("No categories loaded. Exiting.")
        return
    print(f"Loaded {len(categories)} categories.")

    # Generate category embeddings using Voyage AI.
    print("Generating category embeddings...")
    embeddings = create_category_embeddings(categories)

    # Persist the embeddings as a pickle for later lookup.
    with open(output_file, 'wb') as handle:
        pickle.dump(embeddings, handle)
    print(f"Category embeddings saved to {output_file}")

if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(description="Generate and pickle category embeddings using Voyage AI.")
    arg_parser.add_argument("--categories", type=str, default="categories.json",
                            help="Path to the categories JSON file.")
    arg_parser.add_argument("--output", type=str, default="data/category_embeddings.pickle",
                            help="Path to output pickle file for embeddings")
    cli_args = arg_parser.parse_args()
    main(cli_args.categories, cli_args.output)
|
main.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
import gradio as gr
|
5 |
+
from utils import load_embeddings
|
6 |
+
from ui import create_demo
|
7 |
+
from config import UI_THEME
|
8 |
+
from ui_formatters import set_theme
|
9 |
+
|
10 |
+
def main():
    """Main entry point for the application.

    Parses CLI args, loads ingredient embeddings, applies the UI theme, and
    launches the Gradio interface. Exits with status 1 when the embeddings
    file is missing or fails to load.
    """
    parser = argparse.ArgumentParser(description='Run the Product Categorization web app')
    parser.add_argument('--embeddings', default='data/ingredient_embeddings_voyageai.pkl',
                        help='Path to the ingredient embeddings pickle file')
    parser.add_argument('--share', action='store_true', help='Create a public link for sharing')

    args = parser.parse_args()

    # Check if embeddings file exists
    if not os.path.exists(args.embeddings):
        print(f"Error: Embeddings file {args.embeddings} not found!")
        print(f"Please ensure the file exists at {os.path.abspath(args.embeddings)}")
        sys.exit(1)

    # Load embeddings
    try:
        embeddings_data = load_embeddings(args.embeddings)
        # Update the embeddings in the ui_core module.
        # NOTE(review): this injects state by assigning a module attribute;
        # ui_core presumably reads `embeddings` at call time -- confirm.
        import ui_core
        ui_core.embeddings = embeddings_data
    except Exception as e:
        print(f"Error loading embeddings: {e}")
        sys.exit(1)

    # Set the application theme
    set_theme(UI_THEME)

    # Create and launch the interface
    demo = create_demo()

    # Launch with only supported parameters
    demo.launch(
        share=args.share,
        show_api=False
    )

if __name__ == "__main__":
    main()
|
openai_expansion.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
from typing import List, Dict, Any
|
3 |
+
from openai import OpenAI
|
4 |
+
import concurrent.futures
|
5 |
+
from utils import SafeProgress
|
6 |
+
from api_utils import get_openai_client
|
7 |
+
|
8 |
+
def expand_product_descriptions(products: List[str],
|
9 |
+
max_workers: int = 5,
|
10 |
+
progress=None) -> Dict[str, str]:
|
11 |
+
"""
|
12 |
+
Expand product descriptions using OpenAI's structured output
|
13 |
+
|
14 |
+
Args:
|
15 |
+
products: List of product names to expand
|
16 |
+
max_workers: Maximum number of concurrent API calls
|
17 |
+
progress: Optional progress tracking object
|
18 |
+
|
19 |
+
Returns:
|
20 |
+
Dictionary mapping original product names to expanded descriptions
|
21 |
+
"""
|
22 |
+
progress_tracker = SafeProgress(progress, desc="Expanding product descriptions")
|
23 |
+
|
24 |
+
# Set up OpenAI client
|
25 |
+
openai_client = get_openai_client()
|
26 |
+
|
27 |
+
expanded_descriptions = {}
|
28 |
+
|
29 |
+
def process_product(product):
|
30 |
+
try:
|
31 |
+
response = openai_client.responses.create(
|
32 |
+
# model="o3-mini",
|
33 |
+
model="gpt-4o-mini",
|
34 |
+
max_output_tokens=100,
|
35 |
+
# reasoning={"effort": "low"},
|
36 |
+
input=[
|
37 |
+
{"role": "system", "content": """You are a product description expert. Your task is to expand product names into descriptions that would help an embedding model categorize them correctly.
|
38 |
+
"""},
|
39 |
+
{"role": "user", "content": f'Describe "{product}" to an embedding model categorizing products'}
|
40 |
+
],
|
41 |
+
text={
|
42 |
+
"format": {
|
43 |
+
"type": "json_schema",
|
44 |
+
"name": "product_description",
|
45 |
+
"schema": {
|
46 |
+
"type": "object",
|
47 |
+
"properties": {
|
48 |
+
"expanded_description": {
|
49 |
+
"type": "string",
|
50 |
+
"description": "An expanded description of the product that includes its category, type, common ingredients or components, and typical use cases."
|
51 |
+
}
|
52 |
+
},
|
53 |
+
"required": ["expanded_description"],
|
54 |
+
"additionalProperties": False
|
55 |
+
},
|
56 |
+
"strict": True
|
57 |
+
}
|
58 |
+
}
|
59 |
+
)
|
60 |
+
|
61 |
+
# Parse the response
|
62 |
+
result = json.loads(response.output_text)
|
63 |
+
return product, result["expanded_description"]
|
64 |
+
except Exception as e:
|
65 |
+
print(f"Error expanding description for '{product}': {e}")
|
66 |
+
return product, f"{product} - No expanded description available."
|
67 |
+
|
68 |
+
# Process in batches for better parallelism
|
69 |
+
total_products = len(products)
|
70 |
+
progress_tracker(0.1, desc=f"Processing {total_products} products")
|
71 |
+
|
72 |
+
# Use thread pool for concurrent API calls
|
73 |
+
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
|
74 |
+
future_to_product = {executor.submit(process_product, product): i
|
75 |
+
for i, product in enumerate(products)}
|
76 |
+
|
77 |
+
for i, future in enumerate(concurrent.futures.as_completed(future_to_product)):
|
78 |
+
progress_percent = 0.1 + (0.8 * (i+1) / total_products)
|
79 |
+
product_index = future_to_product[future]
|
80 |
+
progress_tracker(progress_percent, desc=f"Expanded {i+1}/{total_products} products")
|
81 |
+
|
82 |
+
try:
|
83 |
+
original_product, expanded_description = future.result()
|
84 |
+
expanded_descriptions[original_product] = expanded_description
|
85 |
+
except Exception as e:
|
86 |
+
product = products[product_index]
|
87 |
+
print(f"Error processing expansion for '{product}': {e}")
|
88 |
+
expanded_descriptions[product] = product # Fallback to original product name
|
89 |
+
|
90 |
+
progress_tracker(1.0, desc="Expansion complete")
|
91 |
+
return expanded_descriptions
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
voyageai
|
2 |
+
numpy
|
3 |
+
gradio
|
4 |
+
openai
|
5 |
+
requests
|
6 |
+
tqdm
|
similarity.py
ADDED
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
from typing import Dict, List, Tuple, Any
|
3 |
+
import json
|
4 |
+
import voyageai
|
5 |
+
from openai import OpenAI
|
6 |
+
from api_utils import get_openai_client
|
7 |
+
|
8 |
+
def _collect_embedding_vectors(embeddings_dict):
    """Extract parallel (names, vectors) lists from an embeddings dict.

    Values may be raw vectors (list / np.ndarray) or dicts of the form
    {"embedding": vector, ...}; None and empty entries are skipped.
    """
    names = []
    vectors = []
    for name, data in embeddings_dict.items():
        if data is None:
            continue
        if isinstance(data, dict) and "embedding" in data:
            vector = data["embedding"]
            if vector is not None:
                names.append(name)
                vectors.append(vector)
        elif isinstance(data, (list, np.ndarray)):
            if len(data) == 0:
                continue
            names.append(name)
            vectors.append(data)
    return names, vectors


def _as_row_matrix(vectors, label):
    """Convert a list of vectors to a 2D float32 matrix, or None on failure."""
    matrix = np.array(vectors, dtype=np.float32)
    if matrix.ndim == 1:
        # A single flat vector came back as 1D; promote it to (1, dim).
        print(f"Warning: {label} embeddings have only 1 dimension, reshaping. Shape: {matrix.shape}")
        if len(matrix) == 0:
            print(f"Error: Empty {label} embeddings array")
            return None
        matrix = matrix.reshape(1, -1)
    return matrix


def _normalize_rows(matrix):
    """L2-normalize each row; zero rows are guarded with a tiny epsilon norm."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms = np.where(norms == 0, 1e-10, norms)  # avoid division by zero
    return matrix / norms


def compute_similarities(ingredients_dict, products_dict):
    """
    Compute cosine similarities between ingredient embeddings and product embeddings.

    Args:
        ingredients_dict: Dictionary of ingredient names to embeddings
            (raw vectors or {"embedding": vector} dicts)
        products_dict: Dictionary of product names to embeddings
            (raw vectors or {"embedding": vector} dicts)

    Returns:
        Dictionary mapping each product name to a list of
        (ingredient_name, similarity) tuples sorted by similarity descending.
        Returns {} on any invalid/empty input or on a dimension mismatch.
    """
    # Validate inputs
    if not ingredients_dict:
        print("Warning: ingredients_dict is empty")
        return {}
    if not products_dict:
        print("Warning: products_dict is empty")
        return {}

    # Ingredients: extract valid vectors, build matrix, normalize
    ingredient_names, ingredient_vectors = _collect_embedding_vectors(ingredients_dict)
    if not ingredient_names:
        print("Warning: No valid ingredient embeddings found")
        return {}
    ingredient_matrix = _as_row_matrix(ingredient_vectors, "Ingredient")
    if ingredient_matrix is None:
        return {}
    normalized_ingredients = _normalize_rows(ingredient_matrix)

    # Products: same extraction/normalization pipeline
    product_names, product_vectors = _collect_embedding_vectors(products_dict)
    if not product_names:
        print("Warning: No valid product embeddings found")
        return {}
    product_matrix = _as_row_matrix(product_vectors, "Product")
    if product_matrix is None:
        return {}

    # The dot product below requires matching embedding dimensions
    product_dim = product_matrix.shape[1]
    ingredient_dim = normalized_ingredients.shape[1]
    if product_dim != ingredient_dim:
        print(f"Warning: Dimension mismatch between product embeddings ({product_dim}) and ingredient embeddings ({ingredient_dim})")
        return {}

    normalized_products = _normalize_rows(product_matrix)

    # Cosine similarity = dot product of unit vectors
    similarity_matrix = np.dot(normalized_products, normalized_ingredients.T)

    # For each product, pair every ingredient with its score, best first
    results = {}
    for i, product_name in enumerate(product_names):
        scores = similarity_matrix[i]
        ranked = sorted(
            ((ingredient_names[j], float(scores[j])) for j in range(len(ingredient_names))),
            key=lambda pair: pair[1],
            reverse=True,
        )
        results[product_name] = ranked
    return results
|
137 |
+
|
138 |
+
def hybrid_ingredient_matching(products: List[str], ingredients_dict: Dict[str, Any],
                              embedding_top_n: int = 20, final_top_n: int = 5,
                              confidence_threshold: float = 0.5,
                              progress=None) -> Dict[str, List[Tuple]]:
    """
    Two-stage matching: first use embeddings to find candidate ingredients,
    then ask an OpenAI model to pick the single best candidate.

    Bug fix vs. the original: the re-ranking `try` carried two consecutive
    `except Exception as e:` clauses; the second one (which fell back to
    ``candidates[:final_top_n]``) was unreachable dead code and has been
    removed. Only the ``candidates[:1]`` fallback was ever executed.

    Args:
        products: List of product names to categorize
        ingredients_dict: Dictionary of ingredient names to embeddings
        embedding_top_n: Number of top ingredients to retrieve using embeddings
        final_top_n: Kept for interface compatibility; not used by the
            single-best-match re-ranking path
        confidence_threshold: Minimum relevance score for a match to be kept
        progress: Optional progress tracking object

    Returns:
        Dictionary mapping each product to a list with at most one
        (ingredient, score) tuple (empty list when no confident match).
    """
    # Local imports avoid circular dependencies between pipeline modules
    from utils import SafeProgress
    from embeddings import create_product_embeddings

    progress_tracker = SafeProgress(progress, desc="Hybrid ingredient matching")
    progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")

    # Stage 1: embed the products and score them against every ingredient
    product_embeddings = create_product_embeddings(products, progress=progress_tracker)
    similarities = compute_similarities(ingredients_dict, product_embeddings)

    # Keep only the top-N embedding candidates per product
    embedding_results = {}
    for product, product_similarities in similarities.items():
        embedding_results[product] = product_similarities[:embedding_top_n]

    progress_tracker(0.4, desc="Stage 2: Re-ranking candidates")

    # Initialize OpenAI client using the centralized function
    openai_client = get_openai_client()

    # Stage 2: re-rank the candidates for each product
    final_results = {}

    for i, product in enumerate(products):
        progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")

        # Products without embedding candidates get an empty result
        candidates = embedding_results.get(product, [])
        if not candidates:
            final_results[product] = []
            continue

        # Only the ingredient names are sent to the model for re-ranking
        candidate_ingredients = [c[0] for c in candidates]

        try:
            # Structured output: the schema forces a single best_match object
            response = openai_client.responses.create(
                model="o3-mini",
                # reasoning={"effort": "low"},
                input=[
                    {"role": "system", "content": "You are a food ingredient matching expert. Select the single best ingredient that matches the given product."},
                    {"role": "user", "content": f"Product: {product}\n\nPotential ingredients: {', '.join(candidate_ingredients)}"}
                ],
                text={
                    "format": {
                        "type": "json_schema",
                        "name": "ingredient_selection",
                        "schema": {
                            "type": "object",
                            "properties": {
                                "best_match": {
                                    "type": "object",
                                    "properties": {
                                        "ingredient": {
                                            "type": "string",
                                            "description": "The name of the best matching ingredient"
                                        },
                                        "explanation": {
                                            "type": "string",
                                            "description": "Brief explanation for the matching"
                                        },
                                        "relevance_score": {
                                            "type": "number",
                                            "description": "Score between 0 and 1 indicating relevance"
                                        }
                                    },
                                    "required": ["ingredient", "relevance_score", "explanation"],
                                    "additionalProperties": False
                                }
                            },
                            "required": ["best_match"],
                            "additionalProperties": False
                        },
                        "strict": True
                    }
                }
            )

            # Parse the structured response
            best_match = json.loads(response.output_text)["best_match"]

            # Only keep the result if it meets the confidence threshold
            if best_match["relevance_score"] >= confidence_threshold:
                final_results[product] = [(best_match["ingredient"], best_match["relevance_score"])]
            else:
                final_results[product] = []

        except Exception as e:
            print(f"Error during OpenAI re-ranking for '{product}': {e}")
            # Fall back to the top embedding result if re-ranking fails
            final_results[product] = candidates[:1]

    progress_tracker(1.0, desc="Hybrid ingredient matching complete")
    return final_results
|
ui.py
ADDED
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from comparison import compare_ingredient_methods_ui
|
3 |
+
|
4 |
+
# Import from our new UI modules
|
5 |
+
from ui_core import embeddings, get_css, load_examples
|
6 |
+
from ui_ingredient_matching import categorize_products
|
7 |
+
from ui_category_matching import categorize_products_by_category
|
8 |
+
from ui_hybrid_matching import categorize_products_hybrid, categorize_products_hybrid_ingredients
|
9 |
+
from ui_expanded_matching import categorize_products_with_expansion
|
10 |
+
from ui_formatters import get_formatted_css
|
11 |
+
|
12 |
+
def create_demo():
    """Create the Gradio interface.

    Builds a tabbed Blocks app with six tabs — plain ingredient matching,
    hybrid ingredient matching, category matching, hybrid category matching,
    a method-comparison view, and expanded-description matching — plus the
    click wiring that connects each tab's buttons to its handler.
    Returns the (un-launched) gr.Blocks demo.
    """
    with gr.Blocks(css=get_css()) as demo:
        gr.Markdown("# Product Categorization Tool\nAnalyze products by matching to ingredients or categories using AI embeddings.")
        
        with gr.Tabs() as tabs:
            # Original Ingredient Matching Tab
            with gr.TabItem("Ingredient Matching"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # Input section
                        text_input = gr.Textbox(
                            lines=10,
                            placeholder="Enter product names, one per line",
                            label="Product Names"
                        )
                        input_controls = gr.Row()
                        with input_controls:
                            top_n = gr.Slider(1, 25, 10, step=1, label="Top N Results")
                            confidence = gr.Slider(0.1, 0.9, 0.5, label="Similarity Threshold")
                        
                        with gr.Row():
                            examples_btn = gr.Button("Load Examples", variant="secondary")
                            categorize_btn = gr.Button("Find Similar Ingredients", variant="primary")
                    
                    with gr.Column(scale=1):
                        # Results section
                        text_output = gr.HTML(label="Similar Ingredients Results", elem_id="results-container")
            
            
            # New Hybrid Ingredient Matching Tab
            with gr.TabItem("Hybrid Ingredient Matching"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # Input section
                        hybrid_ing_text_input = gr.Textbox(
                            lines=10,
                            placeholder="Enter product names, one per line",
                            label="Product Names"
                        )
                        hybrid_ing_input_controls = gr.Row()
                        with hybrid_ing_input_controls:
                            ing_embedding_top_n = gr.Slider(1, 50, 20, step=1, label="Embedding Top N Results")
                            ing_final_top_n = gr.Slider(1, 10, 5, step=1, label="Final Top N Ingredients")
                            hybrid_ing_confidence = gr.Slider(0.1, 0.9, 0.5, label="Matching Threshold")
                        
                        with gr.Row():
                            hybrid_ing_examples_btn = gr.Button("Load Examples", variant="secondary")
                            hybrid_ing_match_btn = gr.Button("Match Ingredients using Hybrid Approach", variant="primary")
                    
                    with gr.Column(scale=1):
                        # Results section
                        hybrid_ing_output = gr.HTML(label="Hybrid Ingredient Matching Results", elem_id="results-container")
            # New Category Matching Tab
            with gr.TabItem("Category Matching"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # Input section
                        category_text_input = gr.Textbox(
                            lines=10,
                            placeholder="Enter product names, one per line",
                            label="Product Names"
                        )
                        category_input_controls = gr.Row()
                        with category_input_controls:
                            category_top_n = gr.Slider(1, 10, 5, step=1, label="Top N Categories")
                            category_confidence = gr.Slider(0.1, 0.9, 0.5, label="Matching Threshold")
                        
                        with gr.Row():
                            category_examples_btn = gr.Button("Load Examples", variant="secondary")
                            match_categories_btn = gr.Button("Match to Categories", variant="primary")
                    
                    with gr.Column(scale=1):
                        # Results section
                        category_output = gr.HTML(label="Category Matching Results", elem_id="results-container")
            
            # New Hybrid Matching Tab
            with gr.TabItem("Hybrid Category Matching"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # Input section
                        hybrid_text_input = gr.Textbox(
                            lines=10,
                            placeholder="Enter product names, one per line",
                            label="Product Names"
                        )
                        hybrid_input_controls = gr.Row()
                        with hybrid_input_controls:
                            embedding_top_n = gr.Slider(1, 50, 20, step=1, label="Embedding Top N Results")
                            final_top_n = gr.Slider(1, 10, 5, step=1, label="Final Top N Categories")
                            hybrid_confidence = gr.Slider(0.1, 0.9, 0.5, label="Matching Threshold")
                        
                        with gr.Row():
                            hybrid_examples_btn = gr.Button("Load Examples", variant="secondary")
                            hybrid_match_btn = gr.Button("Match using Hybrid Approach", variant="primary")
                    
                    with gr.Column(scale=1):
                        # Results section
                        hybrid_output = gr.HTML(label="Hybrid Matching Results", elem_id="results-container")
            
            
            # New Comparison Tab
            with gr.TabItem("Compare Methods"):
                with gr.Row():
                    with gr.Column():
                        compare_product_input = gr.Textbox(
                            label="Enter product names (one per line)",
                            placeholder="4 Tbsp sweet pickle relish\nchocolate chips\nfresh parsley",
                            lines=5
                        )
                        
                        with gr.Row():
                            compare_embedding_top_n = gr.Slider(
                                minimum=5, maximum=50, value=20, step=5,
                                label="Initial embedding candidates"
                            )
                            compare_final_top_n = gr.Slider(
                                minimum=1, maximum=10, value=3, step=1,
                                label="Final results per method"
                            )
                            compare_confidence_threshold = gr.Slider(
                                minimum=0.0, maximum=1.0, value=0.5, step=0.05,
                                label="Confidence threshold"
                            )
                        
                        compare_btn = gr.Button("Compare Methods", variant="primary")
                        compare_examples_btn = gr.Button("Load Examples", variant="secondary")
                    
                    with gr.Column():
                        comparison_output = gr.HTML(label="Results", elem_id="results-container")
                
                # Connect the compare button
                # gr.State(False) pins the "is_file" argument to text-input mode
                compare_btn.click(
                    fn=compare_ingredient_methods_ui,
                    inputs=[
                        compare_product_input,
                        gr.State(False),  # Always text input mode
                        compare_embedding_top_n,
                        compare_final_top_n,
                        compare_confidence_threshold
                    ],
                    outputs=comparison_output
                )
                
                # Add examples button functionality
                compare_examples_btn.click(
                    fn=load_examples,
                    inputs=[],
                    outputs=compare_product_input
                )
            
            # New Expanded Description Tab
            with gr.TabItem("Expanded Description Matching"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # Input section
                        expanded_text_input = gr.Textbox(
                            lines=10,
                            placeholder="Enter product names, one per line",
                            label="Product Names"
                        )
                        expanded_input_controls = gr.Row()
                        with expanded_input_controls:
                            expanded_top_n = gr.Slider(1, 20, 10, step=1, label="Top N Results")
                            expanded_confidence = gr.Slider(0.1, 0.9, 0.5, label="Matching Threshold")
                        
                        # Add toggle here for matching type
                        expanded_match_type = gr.Radio(
                            choices=["ingredients", "categories"],
                            value="ingredients",
                            label="Match Type",
                            info="Choose whether to match against ingredients or categories"
                        )
                        
                        with gr.Row():
                            expanded_match_btn = gr.Button("Match with Expanded Descriptions", variant="primary")
                            expanded_examples_btn = gr.Button("Load Examples")
                    
                    with gr.Column(scale=1):
                        # Results section
                        expanded_output = gr.HTML(label="Results with Expanded Descriptions", elem_id="results-container")
        
        # Connect buttons for ingredient matching
        # (gr.State(False) = text input, not file upload, for every handler)
        categorize_btn.click(
            fn=categorize_products,
            inputs=[text_input, gr.State(False), top_n, confidence],
            outputs=[text_output],
        )
        
        # Connect buttons for category matching
        match_categories_btn.click(
            fn=categorize_products_by_category,
            inputs=[category_text_input, gr.State(False), category_top_n, category_confidence],
            outputs=[category_output],
        )
        
        # Connect buttons for hybrid matching
        hybrid_match_btn.click(
            fn=categorize_products_hybrid,
            inputs=[hybrid_text_input, gr.State(False), embedding_top_n, final_top_n, hybrid_confidence],
            outputs=[hybrid_output],
        )
        
        # Connect buttons for hybrid ingredient matching
        hybrid_ing_match_btn.click(
            fn=categorize_products_hybrid_ingredients,
            inputs=[hybrid_ing_text_input, gr.State(False), ing_embedding_top_n, ing_final_top_n, hybrid_ing_confidence],
            outputs=[hybrid_ing_output],
        )
        
        hybrid_ing_examples_btn.click(
            fn=load_examples,  # Reuse the same examples
            inputs=[],
            outputs=hybrid_ing_text_input
        )
        
        # Connect buttons for expanded description matching
        expanded_match_btn.click(
            fn=categorize_products_with_expansion,
            inputs=[expanded_text_input, gr.State(False), expanded_top_n, expanded_confidence, expanded_match_type],
            outputs=[expanded_output],
        )
        
        expanded_examples_btn.click(
            fn=load_examples,  # Reuse the same examples
            inputs=[],
            outputs=expanded_text_input
        )
        
        # Examples buttons
        examples_btn.click(
            fn=load_examples,
            inputs=[],
            outputs=text_input
        )
        
        category_examples_btn.click(
            fn=load_examples,  # Reuse the same examples
            inputs=[],
            outputs=category_text_input
        )
        
        hybrid_examples_btn.click(
            fn=load_examples,  # Reuse the same examples
            inputs=[],
            outputs=hybrid_text_input
        )
        
        gr.Markdown("Powered by Voyage AI embeddings • Built with Gradio")
    
    return demo
|
ui_category_matching.py
ADDED
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from utils import SafeProgress
|
3 |
+
from category_matching import load_categories, match_products_to_categories
|
4 |
+
from ui_core import parse_input
|
5 |
+
from ui_formatters import format_categories_html
|
6 |
+
|
7 |
+
def categorize_products_by_category(product_input, is_file=False, top_n=5, confidence_threshold=0.5, progress=gr.Progress()):
    """Match each input product against the predefined category list.

    Parses the raw input, loads the categories, runs the matcher, and
    renders the per-product results as an HTML report.
    """
    tracker = SafeProgress(progress)
    tracker(0, desc="Starting categorization...")

    # Turn the raw textbox/file input into a list of product names
    product_names, error = parse_input(product_input, is_file)
    if error:
        return error

    # Load the category definitions
    tracker(0.2, desc="Loading categories...")
    categories = load_categories()

    # Run the product -> category matcher
    tracker(0.3, desc="Matching products to categories...")
    match_results = match_products_to_categories(
        product_names,
        categories,
        top_n=int(top_n),
        confidence_threshold=confidence_threshold,
        progress=progress
    )

    # Render the results as HTML
    tracker(0.9, desc="Formatting results...")
    if not match_results:
        tracker(1.0, desc="Done!")
        return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"

    pieces = [
        "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>",
        f"<p style='color: #555;'>Matched {len(product_names)} products to categories.</p>",
    ]
    for product, matched_categories in match_results.items():
        pieces.append(format_categories_html(product, matched_categories))
        pieces.append("<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>")
    pieces.append("</div>")

    tracker(1.0, desc="Done!")
    return "".join(pieces)
|
ui_core.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
import json
|
4 |
+
import pickle
|
5 |
+
import numpy as np
|
6 |
+
from typing import Tuple, List, Dict, Any, Optional
|
7 |
+
import gradio as gr
|
8 |
+
|
9 |
+
# Configure logging
|
10 |
+
logging.basicConfig(
|
11 |
+
level=logging.INFO,
|
12 |
+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
13 |
+
)
|
14 |
+
logger = logging.getLogger('ui_core')
|
15 |
+
|
16 |
+
# Global variables
|
17 |
+
embeddings = {}
|
18 |
+
# Update default path to point to the pickle file
|
19 |
+
EMBEDDINGS_PATH = os.environ.get('EMBEDDINGS_PATH', 'data/ingredient_embeddings_voyageai.pkl')
|
20 |
+
CATEGORY_EMBEDDINGS_PATH = os.environ.get('CATEGORY_EMBEDDINGS_PATH', 'data/category_embeddings.pickle')
|
21 |
+
|
22 |
+
def load_embeddings(filepath: str = EMBEDDINGS_PATH) -> Dict[str, Any]:
    """Load ingredient embeddings from disk.

    Accepts either a pickle (.pkl/.pickle) or a JSON file, chosen by file
    extension. If the given path does not exist, a few likely alternative
    locations are probed before giving up. List values are converted to
    numpy arrays for faster downstream similarity math.

    Args:
        filepath: Path to the embeddings file.

    Returns:
        Mapping of ingredient name -> embedding vector; empty dict on any
        failure (missing file, bad format, unpickling/JSON error).
    """
    try:
        logger.info(f"Attempting to load embeddings from: {filepath}")
        if not os.path.exists(filepath):
            logger.error(f"Embeddings file not found: {filepath}")
            # Try alternative file formats / known default locations.
            candidates = [
                filepath.replace('.pkl', '.pickle'),
                filepath.replace('.pickle', '.pkl'),
                'data/ingredient_embeddings_voyageai.pkl',
                'data/ingredient_embeddings.pickle'
            ]
            fallback = next(
                (p for p in candidates if p != filepath and os.path.exists(p)),
                None
            )
            if fallback is None:
                return {}
            logger.info(f"Found alternative embeddings file: {fallback}")
            filepath = fallback

        # Determine file type and load accordingly (extension-driven).
        if filepath.endswith(('.pkl', '.pickle')):
            logger.info(f"Loading pickle file: {filepath}")
            with open(filepath, 'rb') as fh:
                raw = pickle.load(fh)
        else:
            logger.info(f"Loading JSON file: {filepath}")
            with open(filepath, 'r') as fh:
                raw = json.load(fh)

        # Validate the loaded data: must be a non-empty dict.
        if not isinstance(raw, dict) or not raw:
            logger.error(f"Invalid embeddings format in {filepath}")
            return {}

        # Normalize values: plain lists become numpy arrays, anything else
        # (already an ndarray, etc.) passes through unchanged.
        processed = {
            name: np.array(vec) if isinstance(vec, list) else vec
            for name, vec in raw.items()
        }

        logger.info(f"Successfully loaded {len(processed)} ingredient embeddings")
        return processed

    except json.JSONDecodeError:
        logger.error(f"Invalid JSON format in embeddings file: {filepath}")
        return {}
    except pickle.UnpicklingError:
        logger.error(f"Invalid pickle format in embeddings file: {filepath}")
        return {}
    except Exception as e:
        logger.error(f"Error loading embeddings: {str(e)}")
        return {}
|
86 |
+
|
87 |
+
# Load embeddings at module import time
embeddings = load_embeddings()

# If embeddings is empty, try loading category embeddings
# (fallback so downstream matching still has something to compare against).
if not embeddings:
    logger.info("No ingredient embeddings found, trying category embeddings...")
    embeddings = load_embeddings(CATEGORY_EMBEDDINGS_PATH)

# Sample product names for the example button
# NOTE(review): the list contains duplicate entries — presumably intentional
# sample data for exercising the UI; confirm before de-duplicating.
EXAMPLE_PRODUCTS = """Nature's Promise Spring Water Multipack
Red's Burritos
Nature's Promise Spring Water Multipack
Schweppes Seltzer 12 Pack
Hunt's Pasta Sauce
Buitoni Filled Pasta
Buitoni Filled Pasta
Samuel Adams or Blue Moon 12 Pack
Mrs. T's Pierogies
Buitoni Filled Pasta
Pillsbury Dough
Nature's Promise Organic Celery Hearts
MorningStar Farms Meatless Nuggets, Patties or Crumbles
Nature's Promise Organic Celery Hearts
Boar's Head Mild Provolone Cheese
Athenos Feta Crumbles"""
|
112 |
+
|
113 |
+
def load_examples():
    """Return the canned example product names for the input textbox."""
    return EXAMPLE_PRODUCTS
|
116 |
+
|
117 |
+
from ui_formatters import get_formatted_css, THEME, set_theme
|
118 |
+
|
119 |
+
def get_css():
    """CSS for the Gradio interface, generated from the active theme."""
    css = get_formatted_css()
    return css
|
122 |
+
|
123 |
+
def parse_input(input_text, is_file=False) -> Tuple[List[str], Optional[str]]:
    """Parse user input into a list of product names.

    Args:
        input_text: Raw text (or file contents) with one product name per line.
        is_file: Kept for interface compatibility; file contents are parsed
            exactly like pasted text (one name per line).

    Returns:
        Tuple of (product_names, error_html). ``error_html`` is None on
        success; on failure the list is empty and ``error_html`` carries a
        user-facing message.
    """
    try:
        # Both the file path and the text path receive newline-separated
        # names, so a single parse covers both. (The original duplicated the
        # identical comprehension in each branch of `if is_file`.)
        product_names = [line.strip() for line in input_text.split('\n') if line.strip()]

        if not product_names:
            return [], "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No valid product names found. Please check your input.</div>"

        return product_names, None

    except Exception as e:
        logger.error(f"Error parsing input: {str(e)}")
        return [], f"<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error parsing input: {str(e)}</div>"
|
ui_expanded_matching.py
ADDED
@@ -0,0 +1,224 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from utils import SafeProgress
|
3 |
+
from embeddings import create_product_embeddings
|
4 |
+
from similarity import compute_similarities
|
5 |
+
from openai_expansion import expand_product_descriptions
|
6 |
+
from ui_core import embeddings, parse_input, CATEGORY_EMBEDDINGS_PATH
|
7 |
+
from ui_formatters import format_expanded_results_html, create_results_container
|
8 |
+
from api_utils import get_openai_client, process_in_parallel, rank_ingredients_openai, rank_categories_openai
|
9 |
+
from category_matching import load_categories, load_category_embeddings
|
10 |
+
import json
|
11 |
+
import os
|
12 |
+
|
13 |
+
|
14 |
+
def categorize_products_with_expansion(product_input, is_file=False, top_n=10, confidence_threshold=0.5, match_type="ingredients", progress=gr.Progress()):
    """
    Categorize products using expanded descriptions from OpenAI

    Pipeline: parse input -> expand each product name into a richer
    description via OpenAI -> embed products -> shortlist candidates by
    embedding similarity -> re-rank the shortlist with an OpenAI model ->
    render HTML result cards.

    NOTE(review): `progress=gr.Progress()` is a mutable default, but this is
    Gradio's documented pattern for progress injection — do not "fix" it.

    Args:
        product_input: Text input with product names
        is_file: Whether the input is a file
        top_n: Number of top results to show
        confidence_threshold: Confidence threshold for matches
        match_type: Either "ingredients" or "categories"
        progress: Progress tracking object

    Returns:
        HTML formatted results
    """
    progress_tracker = SafeProgress(progress)
    progress_tracker(0, desc="Starting...")

    # Parse input
    product_names, error = parse_input(product_input, is_file)
    if error:
        # parse_input already produced user-facing error HTML.
        return error

    # Validate embeddings are loaded if doing ingredient matching
    if match_type == "ingredients" and not embeddings:
        return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"

    # Expand product descriptions
    progress_tracker(0.2, desc="Expanding product descriptions...")
    expanded_descriptions = expand_product_descriptions(product_names, progress=progress)

    if not expanded_descriptions:
        return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to expand product descriptions. Please try again or check your OpenAI API key.</div>"

    # Get shared OpenAI client
    openai_client = get_openai_client()

    if match_type == "ingredients":
        # Generate product embeddings
        progress_tracker(0.4, desc="Generating product embeddings...")
        product_embeddings = create_product_embeddings(product_names, progress=progress)

        # Compute embedding similarities for ingredients
        progress_tracker(0.6, desc="Computing ingredient similarities...")
        all_similarities = compute_similarities(embeddings, product_embeddings)

        if not all_similarities:
            return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No similarities found. Please try different product names.</div>"

        # Setup for OpenAI reranking
        embedding_top_n = 20 # Number of candidates to consider from embeddings

        progress_tracker(0.7, desc="Re-ranking with expanded descriptions...")

        # Function for processing each product
        # (closure over all_similarities / expanded_descriptions / openai_client)
        def process_reranking(product):
            if product not in all_similarities:
                return product, []

            candidates = all_similarities[product][:embedding_top_n]
            if not candidates:
                return product, []

            candidate_ingredients = [c[0] for c in candidates]
            expanded_text = expanded_descriptions.get(product, "")

            try:
                # Use the shared utility function
                reranked_ingredients = rank_ingredients_openai(
                    product=product,
                    candidates=candidate_ingredients,
                    expanded_description=expanded_text,
                    client=openai_client,
                    model="o3-mini",
                    max_results=top_n,
                    confidence_threshold=confidence_threshold,
                    debug=True
                )

                return product, reranked_ingredients

            except Exception as e:
                print(f"Error reranking {product}: {e}")
                # Fall back to top embedding match
                # (candidates is non-empty here — checked above)
                return product, candidates[:1] if candidates[0][1] >= confidence_threshold else []

        # Process all products in parallel
        final_results = process_in_parallel(
            items=product_names,
            processor_func=process_reranking,
            max_workers=min(10, len(product_names)),
            progress_tracker=progress_tracker,
            progress_start=0.7,
            progress_end=0.9,
            progress_desc="Re-ranking"
        )

    else: # categories
        # Load category embeddings instead of JSON categories
        progress_tracker(0.5, desc="Loading category embeddings...")
        category_embeddings = load_category_embeddings()

        if not category_embeddings:
            return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No category embeddings found. Please check that the embeddings file exists at data/category_embeddings.pickle.</div>"

        # Generate product embeddings
        progress_tracker(0.6, desc="Generating product embeddings...")
        product_embeddings = create_product_embeddings(product_names, progress=progress)

        # Compute embedding similarities for categories
        progress_tracker(0.7, desc="Computing category similarities...")
        all_similarities = compute_similarities(category_embeddings, product_embeddings)

        if not all_similarities:
            return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No category similarities found. Please try different product names.</div>"

        embedding_top_n = min(20, top_n * 2) # Number of candidates to consider from embeddings

        # Collect all needed category IDs first
        # (so the JSON file only has to be scanned once, below)
        needed_category_ids = set()
        for product, similarities in all_similarities.items():
            for category_id, score in similarities[:embedding_top_n]:
                if score >= confidence_threshold:
                    needed_category_ids.add(category_id)

        # Load only the needed categories from JSON
        progress_tracker(0.75, desc="Loading category descriptions...")
        category_descriptions = {}
        if needed_category_ids:
            try:
                with open("categories.json", 'r') as f:
                    categories_list = json.load(f)
                    for item in categories_list:
                        if item["id"] in needed_category_ids:
                            category_descriptions[item["id"]] = item["text"]
            except Exception as e:
                print(f"Error loading category descriptions: {e}")

        # Function to process each product
        def process_category_matching(product):
            if product not in all_similarities:
                return product, []

            candidates = all_similarities[product][:embedding_top_n]
            if not candidates:
                return product, []

            # Get the expanded description
            expanded_text = expanded_descriptions.get(product, "")

            try:
                # Use rank_categories_openai instead of match_products_to_categories_with_description
                category_matches = rank_categories_openai(
                    product=product,
                    categories=category_descriptions,
                    expanded_description=expanded_text,
                    client=openai_client,
                    # model="o3-mini",
                    model="gpt-4o-mini",
                    # model="gpt-4o",
                    max_results=top_n,
                    confidence_threshold=confidence_threshold,
                    debug=True
                )

                # Format results with category descriptions if needed
                # -> produces (id, text, score) triples for the formatter
                formatted_matches = []
                for category_id, score in category_matches:
                    category_text = category_descriptions.get(category_id, "Unknown category")
                    formatted_matches.append((category_id, category_text, score))

                return product, formatted_matches
            except Exception as e:
                print(f"Error matching {product} to categories: {e}")
                return product, []

        # Process all products in parallel
        # NOTE(review): progress_start=0.7 although the tracker was already
        # advanced to 0.75 above — progress may appear to jump backwards.
        final_results = process_in_parallel(
            items=product_names,
            processor_func=process_category_matching,
            max_workers=min(10, len(product_names)),
            progress_tracker=progress_tracker,
            progress_start=0.7,
            progress_end=0.9,
            progress_desc="Category matching"
        )

    # Format results
    progress_tracker(0.9, desc="Formatting results...")

    result_elements = []
    for product, matches in final_results.items():
        result_elements.append(
            format_expanded_results_html(
                product=product,
                results=matches,
                expanded_description=expanded_descriptions.get(product, ""),
                match_type=match_type
            )
        )

    output_html = create_results_container(
        result_elements,
        header_text=f"Matched {len(product_names)} products to {match_type} using expanded descriptions."
    )

    if not final_results:
        output_html = "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"

    progress_tracker(1.0, desc="Done!")
    return output_html
|
ui_formatters.py
ADDED
@@ -0,0 +1,419 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Dict, Tuple, Any
|
2 |
+
from utils import get_confidence_color, get_confidence_bg_color
|
3 |
+
|
4 |
+
# Theme configuration (can be easily switched between light/dark)
THEME = "light" # Options: "light", "dark"

# Theme-specific colors
# Each theme maps the same set of role keys to concrete hex colors.
THEMES = {
    "light": {
        "background": "#ffffff",
        "card_bg": "#ffffff",
        "card_border": "#ddd",
        "header_bg": "#2c3e50",
        "header_text": "#ffffff",
        "text_primary": "#333333",
        "text_secondary": "#555555",
        "section_bg": "#f8f9fa",
    },
    "dark": {
        "background": "#121212",
        "card_bg": "#1e1e1e",
        "card_border": "#333",
        "header_bg": "#37474f",
        "header_text": "#ffffff",
        "text_primary": "#e0e0e0",
        "text_secondary": "#b0bec5",
        "section_bg": "#263238",
    }
}

# Get current theme colors
COLORS = THEMES[THEME]

# Base styling constants (adjusted based on theme)
# NOTE: evaluated once at import time; set_theme() rebinds these globals.
STYLES = {
    "card": f"margin-bottom: 20px; border: 1px solid {COLORS['card_border']}; border-radius: 8px; overflow: hidden; background-color: {COLORS['card_bg']};",
    "header": f"background-color: {COLORS['header_bg']}; padding: 12px 15px; border-bottom: 1px solid {COLORS['card_border']};",
    "header_text": f"margin: 0; font-size: 18px; color: {COLORS['header_text']};",
    "flex_container": "display: flex; flex-wrap: wrap;",
    "method_container": f"flex: 1; min-width: 200px; padding: 15px; border-right: 1px solid {COLORS['card_border']};",
    "method_title": f"margin-top: 0; color: {COLORS['text_primary']}; padding-bottom: 8px;",
    "item_list": "list-style-type: none; padding-left: 0;",
    "item": "margin-bottom: 8px; padding: 8px; border-radius: 4px;",
    "empty_message": "color: #7f8c8d; font-style: italic;",
    "info_panel": f"padding: 10px; background-color: {COLORS['section_bg']}; margin-bottom: 10px; border-radius: 4px;"
}

# Method colors (consistent across themes)
METHOD_COLORS = {
    "base": "#f39c12", # Orange
    "voyage": "#3498db", # Blue
    "chicory": "#9b59b6", # Purple
    "openai": "#2ecc71", # Green
    "expanded": "#e74c3c", # Red
    "hybrid": "#1abc9c", # Turquoise
    "categories": "#1abc9c" # Same as hybrid
}

# Method display names
METHOD_NAMES = {
    "base": "Base Embeddings",
    "voyage": "Voyage AI Reranker",
    "chicory": "Chicory Parser",
    "openai": "OpenAI o3-mini",
    "expanded": "Expanded Description",
    "hybrid": "Hybrid Matching",
    "categories": "Category Matches"
}
|
69 |
+
|
70 |
+
def format_method_results(method_key, results, color_hex=None):
    """
    Format results for a single method section

    Accepts heterogeneous result items — (name, score) pairs,
    (id, text, score) triples, or several dict shapes — and renders each as
    a list row with a confidence badge.

    NOTE(review): `name` is interpolated into HTML without escaping — safe
    only for trusted/internal strings.

    Args:
        method_key: Key identifying the method (base, voyage, etc.)
        results: List of (name, score) tuples or format-specific data structure
        color_hex: Optional color override (otherwise uses METHOD_COLORS)

    Returns:
        HTML string for the method section
    """
    # Get color from METHOD_COLORS if not provided
    if color_hex is None:
        color_hex = METHOD_COLORS.get(method_key, "#777777")

    # Get method name from METHOD_NAMES or use the key with capitalization
    method_name = METHOD_NAMES.get(method_key, method_key.replace('_', ' ').title())

    html = f"<div class='method-results' style='{STYLES['method_container']}'>"
    html += f"<h4 style='{STYLES['method_title']}; border-bottom: 2px solid {color_hex};'>{method_name}</h4>"

    if results:
        html += f"<ul style='{STYLES['item_list']}'>"

        # Handle different result formats
        for item in results:
            # Handle tuple with 2 elements (name, score)
            if isinstance(item, tuple) and len(item) == 2:
                name, score = item
            # Handle tuple with 3 elements (common in category results)
            elif isinstance(item, tuple) and len(item) == 3:
                id_val, text, score = item
                name = f"<strong>{id_val}</strong>: {text}" if text else id_val
            # Handle dictionary format
            elif isinstance(item, dict) and "name" in item and "score" in item:
                name = item["name"]
                score = item["score"]
            # Handle dictionary format with different keys
            elif isinstance(item, dict) and "category" in item and "confidence" in item:
                name = item["category"]
                score = item["confidence"]
            # Handle dictionary format for ingredients
            elif isinstance(item, dict) and "ingredient" in item and "relevance_score" in item:
                name = item["ingredient"]
                score = item["relevance_score"]
            # Default case - just convert to string
            else:
                name = str(item)
                score = 0.0

            # Ensure score is a float (non-numeric scores render as 0%)
            try:
                score = float(score)
            except (ValueError, TypeError):
                score = 0.0

            # int() truncates toward zero, so 0.999 renders as 99%
            confidence_percent = int(score * 100)
            confidence_color = get_confidence_color(score)
            bg_color = get_confidence_bg_color(score)

            # Improved layout with better contrast and labeled confidence
            html += f"<li style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 6px; padding: 6px; border-radius: 4px; background-color: rgba(240, 240, 240, 0.4);'>"
            html += f"<span style='font-weight: 500; flex: 1;'>{name}</span>"
            html += f"<span style='background-color: {bg_color}; border: 1px solid {confidence_color}; color: #000; font-weight: 600; padding: 2px 6px; border-radius: 4px; min-width: 70px; text-align: center; margin-left: 8px;'>Confidence: {confidence_percent}%</span>"
            html += "</li>"

        html += "</ul>"
    else:
        html += f"<p style='{STYLES['empty_message']}'>No results found</p>"

    html += "</div>"
    return html
|
143 |
+
|
144 |
+
def format_result_card(title, content, header_bg_color=None):
    """Wrap *content* in a styled card with a colored header bar.

    Args:
        title: Text shown in the card header.
        content: HTML body of the card.
        header_bg_color: Header background; defaults to the theme header color.

    Returns:
        HTML string for the complete card.
    """
    bg = COLORS['header_bg'] if header_bg_color is None else header_bg_color

    pieces = [
        f"<div class='result-card' style='{STYLES['card']}'>",
        f"<div class='card-header' style='{STYLES['header']}; background-color: {bg};'>",
        f"<h3 style='{STYLES['header_text']}'>{title}</h3>",
        "</div>",
        f"<div class='card-content'>{content}</div>",
        "</div>",
    ]
    return ''.join(pieces)
|
166 |
+
|
167 |
+
def format_comparison_html(product, method_results):
    """Render a side-by-side comparison card for one product.

    Args:
        product: Product name (card title).
        method_results: Mapping of method key -> result list.

    Returns:
        HTML string for the comparison card.
    """
    # One column per matching method, in fixed display order.
    columns = [
        format_method_results(method_key=key, results=method_results.get(key, []))
        for key in ("base", "voyage", "chicory", "openai")
    ]

    body = (
        f"<div class='methods-comparison' style='{STYLES['flex_container']}'>"
        + ''.join(columns)
        + "</div>"
    )
    return format_result_card(title=product, content=body)
|
192 |
+
|
193 |
+
def format_expanded_results_html(product, results, expanded_description, match_type="ingredients"):
    """
    Format results using expanded descriptions

    Renders the expanded description panel followed by the ranked matches,
    each with a percentage confidence badge.

    Args:
        product: Product name
        results: List of tuples - either (match, score) for ingredients or (id, text, score) for categories
        expanded_description: Expanded product description
        match_type: Either "ingredients" or "categories"

    Returns:
        HTML for the result card
    """
    content = ""

    # Add expanded description section
    content += f"<div style='{STYLES['info_panel']}'>"
    content += "<h4 style='margin-top: 0; border-bottom: 1px solid rgba(0,0,0,0.1); padding-bottom: 8px;'>Expanded Description</h4>"
    content += f"<p style='margin-bottom: 8px;'>{expanded_description}</p>"
    content += "</div>"

    # Format the results section - create custom section
    color_hex = METHOD_COLORS.get(match_type, "#1abc9c")

    # Add results section with custom title
    content += f"<div class='method-results' style='margin-top: 15px; border-left: 3px solid {color_hex}; padding-left: 15px;'>"

    title_text = "Ingredients" if match_type == "ingredients" else "Categories"
    content += f"<h4 style='margin-top: 0; color: {color_hex};'>{title_text}</h4>"

    if results:
        content += "<ul style='margin-top: 5px; padding-left: 20px;'>"
        for item in results:
            # Handle both 2-value (match, score) and 3-value (id, text, score) tuples
            if len(item) == 2:
                match, score = item
                display_text = match
            elif len(item) == 3:
                category_id, category_text, score = item # For categories, use both id and text
                display_text = f"<strong>{category_id}</strong>: {category_text}"
            else:
                continue # Skip any invalid formats

            # int() truncates toward zero (0.999 -> 99%)
            confidence_percent = int(score * 100)
            # Improved styling for confidence percentage - using black text for better contrast
            confidence_color = get_confidence_color(score)
            bg_color = get_confidence_bg_color(score)
            content += f"<li style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 4px;'>"
            content += f"<span style='font-weight: 500; flex: 1;'>{display_text}</span>"
            content += f"<span style='background-color: {bg_color}; border: 1px solid {confidence_color}; color: #000; font-weight: 600; padding: 2px 6px; border-radius: 4px; min-width: 70px; text-align: center; margin-left: 8px;'>Confidence: {confidence_percent}%</span>"
            content += "</li>"
        content += "</ul>"
    else:
        content += "<p style='color: #777; font-style: italic; margin: 5px 0;'>No matches found above confidence threshold.</p>"

    content += "</div>"

    return format_result_card(title=product, content=content)
|
251 |
+
|
252 |
+
def format_hybrid_results_html(product, results, summary=None):
    """Render hybrid-matching results for one product as a card.

    Args:
        product: Product name (card title).
        results: List of (ingredient, score) tuples.
        summary: Optional explanatory text shown above the results.

    Returns:
        HTML string for the card.
    """
    parts = []

    # Optional summary banner above the ingredient list.
    if summary:
        parts.append(
            f"<div class='matching-summary' style='{STYLES['info_panel']}'>"
            f"<p style='margin: 0; font-style: italic; color: {COLORS['text_secondary']};'>{summary}</p>"
            "</div>"
        )

    parts.append(format_method_results(method_key="hybrid", results=results))

    return format_result_card(title=product, content=''.join(parts))
|
279 |
+
|
280 |
+
def create_results_container(html_elements, header_text=None):
    """Concatenate result cards into a single results container.

    Args:
        html_elements: Iterable of HTML fragments (one per product card).
        header_text: Optional summary line rendered above the results.

    Returns:
        HTML string for the container.
    """
    header = (
        f"<p style='color: {COLORS['text_secondary']};'>{header_text}</p>"
        if header_text
        else ""
    )
    return (
        "<div class='results-container' style='font-family: Arial, sans-serif;'>"
        + header
        + ''.join(html_elements)
        + "</div>"
    )
|
300 |
+
|
301 |
+
def format_categories_html(product, categories, chicory_result=None, header_color=None):
    """
    Format category matching results as HTML

    Optionally prefixes the category list with a Chicory-parser panel when
    a result dict is supplied.

    Args:
        product: Product name
        categories: List of (category, score) tuples
        chicory_result: Optional chicory parser result for the product
        header_color: Optional header background color

    Returns:
        HTML string
    """
    content = ""

    # Add Chicory results if available
    if chicory_result:
        content += f"<div style='{STYLES['info_panel']}'>"
        content += "<h4 style='margin-top: 0; border-bottom: 1px solid rgba(0,0,0,0.1); padding-bottom: 8px;'>Chicory Parser Results</h4>"

        if isinstance(chicory_result, dict):
            # Missing keys fall back to a placeholder name / zero confidence.
            ingredient = chicory_result.get("ingredient", "Not found")
            confidence = chicory_result.get("confidence", 0)
            confidence_percent = int(confidence * 100)

            content += f"<div style='display: flex; justify-content: space-between; align-items: center; padding: 8px; border-radius: 4px;'>"
            content += f"<span style='font-weight: bold;'>{ingredient}</span>"
            content += f"<span style='background-color: {get_confidence_bg_color(confidence)}; border: 1px solid {get_confidence_color(confidence)}; color: #000; font-weight: 600; padding: 2px 6px; border-radius: 4px; min-width: 70px; text-align: center;'>Confidence: {confidence_percent}%</span>"
            content += "</div>"
        else:
            # Truthy but non-dict chicory_result — render a placeholder.
            content += f"<p style='{STYLES['empty_message']}'>No Chicory results available</p>"

        content += "</div>"

    # Add the category results
    content += format_method_results(
        method_key="categories",
        results=categories,
        color_hex=header_color or METHOD_COLORS.get("categories", "#1abc9c")
    )

    return format_result_card(title=product, content=content)
|
343 |
+
|
344 |
+
def get_formatted_css():
    """
    Generate CSS for the UI based on the currently active theme.

    Returns:
        CSS string ready to use in Gradio
    """
    # Built at call time so a preceding set_theme() call is reflected in COLORS
    css = f"""
    .gradio-container .prose {{
        max-width: 100%;
    }}
    #results-container {{
        height: 600px !important;
        overflow-y: auto !important;
        overflow-x: hidden !important;
        padding: 15px !important;
        border: 1px solid {COLORS['card_border']} !important;
        background-color: {COLORS['background']} !important;
        color: {COLORS['text_primary']} !important;
    }}
    /* Style for method columns */
    .methods-comparison {{
        display: flex;
        flex-wrap: wrap;
    }}
    .method-results {{
        flex: 1;
        min-width: 200px;
        padding: 15px;
        border-right: 1px solid {COLORS['card_border']};
    }}
    /* Make the product header more visible */
    .product-header {{
        background-color: {COLORS['header_bg']} !important;
        padding: 12px 15px !important;
        border-bottom: 1px solid {COLORS['card_border']} !important;
    }}
    .product-header h3 {{
        margin: 0 !important;
        font-size: 18px !important;
        color: {COLORS['header_text']} !important;
        background-color: transparent !important;
    }}
    /* Remove all nested scrollbars */
    #results-container * {{
        overflow: visible !important;
        height: auto !important;
        max-height: none !important;
    }}
    """
    return css
|
394 |
+
|
395 |
+
def set_theme(theme_name):
    """
    Switch the UI theme (light or dark).

    Args:
        theme_name: 'light' or 'dark'

    Returns:
        None - updates the module-level THEME/COLORS/STYLES globals
    """
    global THEME, COLORS, STYLES

    # Unknown theme names are silently ignored, keeping the current theme
    if theme_name not in THEMES:
        return

    THEME = theme_name
    COLORS = THEMES[THEME]

    # Rebuild the inline-style fragments that embed theme colors
    STYLES.update({
        "card": f"margin-bottom: 20px; border: 1px solid {COLORS['card_border']}; border-radius: 8px; overflow: hidden; background-color: {COLORS['card_bg']};",
        "header": f"background-color: {COLORS['header_bg']}; padding: 12px 15px; border-bottom: 1px solid {COLORS['card_border']};",
        "header_text": f"margin: 0; font-size: 18px; color: {COLORS['header_text']};",
        "method_container": f"flex: 1; min-width: 200px; padding: 15px; border-right: 1px solid {COLORS['card_border']};",
        "method_title": f"margin-top: 0; color: {COLORS['text_primary']}; padding-bottom: 8px;",
        "info_panel": f"padding: 10px; background-color: {COLORS['section_bg']}; margin-bottom: 10px; border-radius: 4px;",
    })
ui_hybrid_matching.py
ADDED
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from utils import SafeProgress
|
3 |
+
from category_matching import load_categories, hybrid_category_matching
|
4 |
+
from similarity import hybrid_ingredient_matching
|
5 |
+
from ui_core import embeddings, parse_input
|
6 |
+
from ui_formatters import format_hybrid_results_html, create_results_container
|
7 |
+
|
8 |
+
def categorize_products_hybrid_generic(product_input, is_file=False, embedding_top_n=20,
                                       final_top_n=5, confidence_threshold=0.5,
                                       match_type="categories",
                                       progress=gr.Progress()):
    """Generic hybrid matching for either categories or ingredients"""
    tracker = SafeProgress(progress)
    tracker(0, desc=f"Starting hybrid {match_type} matching...")

    # Parse raw text / uploaded file into a list of product names
    product_names, error = parse_input(product_input, is_file)
    if error:
        return error

    # Keyword arguments shared by both hybrid matchers
    matcher_kwargs = dict(
        embedding_top_n=int(embedding_top_n),
        final_top_n=int(final_top_n),
        confidence_threshold=confidence_threshold,
        progress=progress,
    )

    if match_type == "categories":
        tracker(0.2, desc="Loading categories...")
        categories = load_categories()

        tracker(0.3, desc="Finding and re-ranking categories...")
        match_results = hybrid_category_matching(product_names, categories, **matcher_kwargs)
    else:
        # Ingredient matching runs against the preloaded ingredient embeddings
        tracker(0.3, desc="Finding and re-ranking ingredients...")
        match_results = hybrid_ingredient_matching(product_names, embeddings, **matcher_kwargs)

    tracker(0.9, desc="Formatting results...")

    # One formatted card per product, assembled through the shared formatter
    cards = [
        format_hybrid_results_html(
            product=product,
            results=matches,
            summary=f"{match_type.capitalize()} matches using hybrid approach.",
        )
        for product, matches in match_results.items()
    ]

    output_html = create_results_container(
        cards,
        header_text=f"Matched {len(product_names)} products to {match_type} using hybrid approach.",
    )

    # With no matches at all, replace the container with an error banner
    if not match_results:
        output_html = "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"

    tracker(1.0, desc="Done!")
    return output_html
71 |
+
# Then use it like this:
|
72 |
+
def categorize_products_hybrid(product_input, is_file=False, embedding_top_n=20,
                               final_top_n=5, confidence_threshold=0.5,
                               progress=gr.Progress()):
    """Hybrid category matching: thin wrapper over the generic hybrid path."""
    return categorize_products_hybrid_generic(
        product_input,
        is_file=is_file,
        embedding_top_n=embedding_top_n,
        final_top_n=final_top_n,
        confidence_threshold=confidence_threshold,
        match_type="categories",
        progress=progress,
    )
79 |
+
|
80 |
+
def categorize_products_hybrid_ingredients(product_input, is_file=False, embedding_top_n=20,
                                           final_top_n=5, confidence_threshold=0.5,
                                           progress=gr.Progress()):
    """Hybrid ingredient matching: thin wrapper over the generic hybrid path."""
    return categorize_products_hybrid_generic(
        product_input,
        is_file=is_file,
        embedding_top_n=embedding_top_n,
        final_top_n=final_top_n,
        confidence_threshold=confidence_threshold,
        match_type="ingredients",
        progress=progress,
    )
ui_ingredient_matching.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from utils import SafeProgress
|
3 |
+
from embeddings import create_product_embeddings
|
4 |
+
from similarity import compute_similarities
|
5 |
+
from chicory_api import call_chicory_parser
|
6 |
+
from ui_core import embeddings, parse_input
|
7 |
+
from ui_formatters import format_categories_html, create_results_container
|
8 |
+
|
9 |
+
def categorize_products(product_input, is_file=False, top_n=10, confidence_threshold=0.5, progress=gr.Progress()):
    """Categorize products from text input or file"""
    tracker = SafeProgress(progress)
    tracker(0, desc="Starting...")

    # Parse raw text / uploaded file into a list of product names
    product_names, error = parse_input(product_input, is_file)
    if error:
        return error

    # Guard: the module-level ingredient embeddings must have loaded
    if not embeddings:
        return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"

    tracker(0.2, desc="Generating product embeddings...")
    products_embeddings = create_product_embeddings(product_names, progress=progress)

    if not products_embeddings:
        return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: Failed to generate product embeddings. Please try again with different product names.</div>"

    tracker(0.5, desc="Calling Chicory Parser API...")
    chicory_results = call_chicory_parser(product_names, progress=progress)

    tracker(0.7, desc="Computing similarities...")
    all_similarities = compute_similarities(embeddings, products_embeddings)

    tracker(0.9, desc="Formatting results...")
    pieces = [
        "<div style='font-family: Arial, sans-serif; max-width: 100%; overflow-x: auto;'>",
        f"<p style='color: #555;'>Processing {len(product_names)} products.</p>",
    ]

    for product, similarities in all_similarities.items():
        # Keep only matches above the confidence floor, then cap at top_n
        top_matches = [
            (ingredient, score) for ingredient, score in similarities
            if score >= confidence_threshold
        ][:int(top_n)]

        # Chicory payload for this product (falsy list when absent)
        chicory_data = chicory_results.get(product, [])

        pieces.append(format_categories_html(product, top_matches, chicory_result=chicory_data))
        pieces.append("<hr style='margin: 15px 0; border: 0; border-top: 1px solid #eee;'>")

    pieces.append("</div>")
    output_html = "".join(pieces)

    # With no similarities at all, replace everything with an error banner
    if not all_similarities:
        output_html = "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"

    tracker(1.0, desc="Done!")
    return output_html
utils.py
ADDED
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Optional, Iterator, Any
|
2 |
+
from tqdm import tqdm as tqdm_original
|
3 |
+
import sys
|
4 |
+
import pickle
|
5 |
+
import json
|
6 |
+
import os
|
7 |
+
|
8 |
+
class SafeProgress:
    """Wrapper for progress tracking that handles both tqdm (console) and Gradio progress"""

    def __init__(self, progress_obj=None, desc="Processing", track_tqdm=True):
        # progress_obj: optional Gradio progress callable; None means console-only
        self.progress = progress_obj
        self.desc = desc
        self.track_tqdm = track_tqdm
        # Lazily created console tqdm bar; None while no bar is active
        self.console_progress = None

    def __call__(self, value, desc=None):
        """Update the progress indicators directly with a fraction in [0, 1]."""
        desc = self.desc if desc is None else desc

        # Forward to Gradio first; its failures must never break console output
        if self.progress is not None:
            try:
                self.progress(value, desc=desc)
            except Exception as e:
                print(f"Progress update error: {e}")

        if value < 1.0:
            if self.console_progress is None:
                # First partial update: create a 0-100 console bar
                self.console_progress = tqdm_original(total=100, desc=desc, file=sys.stdout)
                self.console_progress.update(int(value * 100))
            else:
                # Advance only forward; tqdm's .n tracks the current position
                delta = int(value * 100) - self.console_progress.n
                if delta > 0:
                    self.console_progress.update(delta)
                self.console_progress.set_description(desc)
        elif self.console_progress is not None:
            # value >= 1.0: top the bar off, close it, and reset for reuse
            self.console_progress.update(100 - self.console_progress.n)
            self.console_progress.close()
            self.console_progress = None

    def tqdm(self, iterable, desc=None, total=None):
        """Wrap an iterable with a progress bar for iteration (generator)."""
        desc = self.desc if desc is None else desc

        # Prefer Gradio's own tqdm integration when the progress object has one
        if self.progress is not None and hasattr(self.progress, 'tqdm'):
            yield from self.progress.tqdm(iterable, desc=desc, total=total)
            return

        # Console path; mirror progress into Gradio manually when length is known
        length = total if total is not None else len(iterable) if hasattr(iterable, "__len__") else None

        with tqdm_original(iterable, desc=desc, total=length, file=sys.stdout) as bar:
            for index, item in enumerate(bar):
                if self.progress is not None and length:
                    self.progress((index + 1) / length, desc=desc)
                yield item
+
|
72 |
+
def load_embeddings(embeddings_path):
    """
    Load ingredient embeddings from a pickle file.

    Args:
        embeddings_path: Path to the pickled ingredient-embeddings mapping

    Returns:
        The unpickled embeddings object (mapping of ingredient -> vector)
    """
    print(f"Loading ingredient embeddings from {embeddings_path}")
    with open(embeddings_path, "rb") as handle:
        ingredients_embeddings = pickle.load(handle)
    # Report the count so startup logs show whether the file loaded fully
    print(f"Loaded {len(ingredients_embeddings)} ingredient embeddings")
    return ingredients_embeddings
|
80 |
+
|
81 |
+
def preprocess_product_for_matching(product, progress=None, description=None):
    """
    Normalize a raw product dict into the shape used for ingredient matching.

    Args:
        product (dict): Product dictionary containing at minimum 'name' and 'ingredients'
        progress (SafeProgress, optional): Progress bar to update
        description (str, optional): Description for progress update

    Returns:
        dict: Processed product with normalized fields, or None when the
        product has no ingredients or raises during processing.
    """
    try:
        # Pull out just the fields matching needs, with safe defaults
        cleaned = {
            'id': product.get('id', ''),
            'name': product.get('name', '').strip(),
            'ingredients': product.get('ingredients', '').strip(),
            'image_url': product.get('image_url', ''),
            'url': product.get('url', ''),
        }

        # Products without ingredient text cannot be matched at all
        if not cleaned['ingredients']:
            if progress:
                # NOTE(review): SafeProgress exposes __call__, not update() — confirm
                # which progress object callers actually pass here
                progress.update(1, description=f"{description}: Skipping product without ingredients")
            return None

        # Collapse newlines so the ingredient text is a single line
        cleaned['ingredients'] = cleaned['ingredients'].replace('\n', ' ').strip()

        if progress:
            progress.update(1, description=f"{description}: Processed {cleaned['name']}")

        return cleaned
    except Exception as e:
        # Best-effort pipeline: report the failure via progress and drop the product
        if progress:
            progress.update(1, description=f"{description}: Error processing product: {str(e)}")
        return None
123 |
+
# Keep these color utility functions in utils.py as they're generic helpers:
|
124 |
+
def get_confidence_color(score):
    """Return the hex border/accent color for a confidence score."""
    # Thresholds checked highest-first; first match wins
    for threshold, color in ((0.8, "#1a8a38"),   # strong green
                             (0.65, "#4caf50"),  # medium green
                             (0.5, "#8bc34a")):  # light green
        if score >= threshold:
            return color
    return "#9e9e9e"  # gray for low confidence
|
135 |
+
def get_confidence_bg_color(score):
    """Return the hex background color for a confidence badge."""
    # Thresholds checked highest-first; first match wins
    for threshold, color in ((0.8, "#2e7d32"),   # dark green
                             (0.65, "#558b2f"),  # medium green
                             (0.5, "#9e9d24")):  # light green/yellow
        if score >= threshold:
            return color
    return "#757575"  # gray for low confidence
+
|
146 |
+
def get_confidence_text_color(score):
    """Return a text color readable on the confidence badge background."""
    # White on the green badges (>= 0.5), light gray on the gray badge
    return "#ffffff" if score >= 0.5 else "#f5f5f5"
+
|
153 |
+
# Remove any UI formatting-specific functions that now exist in ui_formatters.py:
|
154 |
+
# - format_categories_html
|
155 |
+
# - create_results_container
|
156 |
+
# - Any other UI formatting functions
|