from typing import List, Dict, Any, Optional
from utils import SafeProgress
import os
import voyageai
import time
import numpy as np
from concurrent.futures import ThreadPoolExecutor

# Set the Voyage AI API key from the environment
voyageai.api_key = os.getenv("VOYAGE_API_KEY")


def get_embeddings_batch(texts, model="voyage-3-large", batch_size=100):
    """Get embeddings for a list of texts in batches."""
    all_embeddings = []

    # Pre-process all texts to replace newlines
    texts = [text.replace("\n", " ") for text in texts]

    # The voyageai SDK exposes embeddings through a Client object; it picks up
    # the API key from voyageai.api_key or the VOYAGE_API_KEY environment variable
    client = voyageai.Client()

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        try:
            response = client.embed(batch, model=model)
            all_embeddings.extend(response.embeddings)

            # Sleep briefly to avoid rate limits
            if i + batch_size < len(texts):
                time.sleep(0.5)
        except Exception as e:
            print(f"Error in batch {i // batch_size + 1}: {e}")
            # Add placeholder embeddings for the failed batch so indices stay aligned
            all_embeddings.extend([None] * len(batch))

    return all_embeddings


def create_product_embeddings(products: List[str], batch_size: int = 100, progress=None) -> Dict[str, Any]:
    """
    Create embeddings for product names with optimization for duplicates.

    Args:
        products: List of product names to create embeddings for
        batch_size: Maximum number of products to process in one batch
        progress: Optional progress tracking object (Gradio progress bar)

    Returns:
        Dictionary mapping product names to their embeddings
    """
    progress_tracker = SafeProgress(progress, desc="Generating embeddings")
    total_products = len(products)

    # Initialize results dictionary
    product_embeddings = {}

    # Use the same model as for ingredients (voyage-3-large)
    model = "voyage-3-large"

    # Process in batches with de-duplication
    progress_tracker(0.1, desc=f"Starting embeddings for {total_products} products")

    # De-duplication step
    unique_products = []
    product_to_index = {}
    index_map = {}  # Maps original index to index in unique_products

    for i, product in enumerate(products):
        if product in product_to_index:
            # Product already seen, just store the mapping
            index_map[i] = product_to_index[product]
        else:
            # New unique product
            product_to_index[product] = len(unique_products)
            index_map[i] = len(unique_products)
            unique_products.append(product)

    progress_tracker(0.2, desc=f"Found {len(unique_products)} unique products out of {total_products} total")

    if len(unique_products) == 0:
        progress_tracker(1.0, desc="No valid products to process")
        return {}

    # Get embeddings in batches for unique products only
    try:
        # Pre-process all texts to replace newlines
        clean_products = [product.replace("\n", " ") for product in unique_products]

        progress_tracker(0.3, desc=f"Calling VoyageAI API for {len(clean_products)} unique products")

        # Process in smaller batches for better reliability
        unique_embeddings = get_embeddings_batch(clean_products, model=model, batch_size=batch_size)

        # Map embeddings back to all products
        progress_tracker(0.8, desc="Mapping embeddings back to all products")
        for i, product in enumerate(products):
            unique_idx = index_map[i]
            if unique_idx < len(unique_embeddings) and unique_embeddings[unique_idx] is not None:
                # Store as dictionary with 'embedding' key for consistent format
                product_embeddings[product] = {
                    "embedding": unique_embeddings[unique_idx]
                }

        progress_tracker(0.9, desc="Processing embeddings completed")
    except Exception as e:
        progress_tracker(0.9, desc=f"Error generating embeddings: {str(e)}")
        print(f"Error generating product embeddings: {e}")
    progress_tracker(1.0, desc=f"Completed embeddings for {len(product_embeddings)} products")
    return product_embeddings


def _generate_embeddings_for_batch(batch: List[str]) -> Dict[str, Any]:
    """
    Generate embeddings for a single batch of products.

    Returns a dictionary mapping each product name to {"embedding": [...]},
    the same format used by create_product_embeddings.
    """
    embeddings = {}
    batch_embeddings = get_embeddings_batch(batch)

    for product, embedding in zip(batch, batch_embeddings):
        # Skip products whose batch failed (None placeholders)
        if embedding is not None:
            embeddings[product] = {"embedding": embedding}

    return embeddings
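

# Minimal usage sketch (illustrative only): the product names below are made-up
# examples, and a valid VOYAGE_API_KEY must be set in the environment for the
# API call to succeed. The duplicate entry shows that de-duplication still
# returns an embedding for every input name.
if __name__ == "__main__":
    sample_products = [
        "Organic Whole Milk 1L",
        "Organic Whole Milk 1L",  # duplicate, embedded only once
        "Dark Chocolate Bar 70%",
    ]
    results = create_product_embeddings(sample_products, batch_size=100)
    for name, data in results.items():
        print(name, len(data["embedding"]))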