product_ingredient_demo / embeddings.py
esilver's picture
Initial commit
31ebc8b
raw
history blame
4.96 kB
from typing import List, Dict, Any, Optional
from utils import SafeProgress
import os
import voyageai
import time
import numpy as np
from concurrent.futures import ThreadPoolExecutor
# Configure the voyageai module with the key read from the VOYAGE_API_KEY
# environment variable (the value is None when the variable is not set).
voyageai.api_key = os.environ.get("VOYAGE_API_KEY")
def get_embeddings_batch(
    texts: List[str],
    model: str = "voyage-3-large",
    batch_size: int = 100,
) -> List[Optional[Any]]:
    """Get embeddings for a list of texts in batches.

    Newlines in every text are replaced with spaces before the request is
    sent. A batch whose request raises, or whose response does not contain
    exactly one embedding per input, is reported on stdout and contributes
    one ``None`` per text, so the result always lines up index-for-index
    with ``texts``.

    Args:
        texts: Strings to embed.
        model: Model name passed to ``voyageai.Embedding.create``.
        batch_size: Maximum number of texts sent per request; must be >= 1.

    Returns:
        A list with the same length as ``texts``. Each entry is the
        ``'embedding'`` value of the matching response item, or ``None``
        when the batch containing that text failed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A zero step makes range() raise anyway; a negative step would
        # silently skip every text and return a list that no longer matches
        # the input, so reject both up front with a clear message.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Pre-process all texts to replace newlines
    cleaned = [text.replace("\n", " ") for text in texts]
    all_embeddings: List[Optional[Any]] = []

    for start in range(0, len(cleaned), batch_size):
        batch = cleaned[start:start + batch_size]
        try:
            response = voyageai.Embedding.create(input=batch, model=model)
            batch_embeddings = [item['embedding'] for item in response['data']]
            if len(batch_embeddings) != len(batch):
                # Extending with a short/long result would shift every later
                # embedding onto the wrong text; treat it as a failed batch.
                raise ValueError(
                    f"expected {len(batch)} embeddings, got {len(batch_embeddings)}"
                )
        except Exception as e:
            # Best-effort: keep going with the remaining batches and mark
            # this one as missing.
            print(f"Error in batch {start // batch_size + 1}: {e}")
            all_embeddings.extend([None] * len(batch))
            continue

        all_embeddings.extend(batch_embeddings)
        # Sleep briefly to avoid rate limits (not needed after the last batch)
        if start + batch_size < len(cleaned):
            time.sleep(0.5)

    return all_embeddings
def create_product_embeddings(products: List[str], batch_size: int = 100, progress=None) -> Dict[str, Any]:
    """
    Create embeddings for product names, embedding each distinct name only once.

    Args:
        products: List of product names to create embeddings for
        batch_size: Maximum number of products to process in one batch
        progress: Optional progress tracking object (Gradio progress bar);
            it is wrapped in SafeProgress and updated at each stage

    Returns:
        Dictionary mapping each product name to ``{"embedding": <embedding>}``.
        Products whose embedding could not be obtained are left out. An empty
        dictionary is returned when ``products`` is empty, and whatever was
        collected so far is returned if embedding generation raises.
    """
    progress_tracker = SafeProgress(progress, desc="Generating embeddings")
    total_products = len(products)

    # Initialize results dictionary
    product_embeddings: Dict[str, Any] = {}

    # Use the same model as for ingredients (voyage-3-large)
    model = "voyage-3-large"

    progress_tracker(0.1, desc=f"Starting embeddings for {total_products} products")

    # De-duplication step: dict.fromkeys keeps the first occurrence of each
    # name in its original order. The result is keyed by name, so no
    # per-index bookkeeping is needed to fan the embeddings back out.
    unique_products = list(dict.fromkeys(products))

    progress_tracker(0.2, desc=f"Found {len(unique_products)} unique products out of {total_products} total")

    if not unique_products:
        progress_tracker(1.0, desc="No valid products to process")
        return {}

    # Get embeddings in batches for unique products only
    try:
        progress_tracker(0.3, desc=f"Calling VoyageAI API for {len(unique_products)} unique products")

        # get_embeddings_batch replaces newlines itself, so the names are
        # passed through unchanged.
        unique_embeddings = get_embeddings_batch(unique_products, model=model, batch_size=batch_size)

        # Map embeddings back to product names
        progress_tracker(0.8, desc="Mapping embeddings back to all products")
        for product, embedding in zip(unique_products, unique_embeddings):
            # None marks a product whose batch failed; skip it.
            if embedding is not None:
                # Store as dictionary with 'embedding' key for consistent format
                product_embeddings[product] = {"embedding": embedding}

        progress_tracker(0.9, desc="Processing embeddings completed")
    except Exception as e:
        # Best-effort: report the failure and return what was collected.
        progress_tracker(0.9, desc=f"Error generating embeddings: {str(e)}")
        print(f"Error generating product embeddings: {e}")

    progress_tracker(1.0, desc=f"Completed embeddings for {len(product_embeddings)} products")
    return product_embeddings
def _generate_embeddings_for_batch(batch: List[str]) -> Dict[str, Any]:
    """
    Generate placeholder embeddings for a batch of products.

    This is a stub, not a real embedding call: every product is mapped to
    the same fixed dummy vector. Replace the body with the actual embedding
    generation logic.

    Args:
        batch: Product names to generate embeddings for

    Returns:
        Dictionary mapping each product name to
        ``{"embedding": [0.1, 0.2, 0.3]}``; duplicate names collapse to a
        single entry.
    """
    # Each entry gets its own list object so callers can mutate one result
    # without affecting the others.
    return {product: {"embedding": [0.1, 0.2, 0.3]} for product in batch}