Spaces:
Sleeping
Sleeping
from typing import List, Dict, Any, Optional | |
from utils import SafeProgress | |
import os | |
import voyageai | |
import time | |
import numpy as np | |
from concurrent.futures import ThreadPoolExecutor | |
# Configure the Voyage AI API key from the VOYAGE_API_KEY environment variable.
# os.getenv returns None when the variable is unset, so the key may be None here.
voyageai.api_key = os.getenv("VOYAGE_API_KEY")
def get_embeddings_batch(texts, model="voyage-3-large", batch_size=100):
    """Get embeddings for a list of texts in batches.

    Args:
        texts: List of strings to embed. Newlines in each text are replaced
            with spaces before the request is sent.
        model: Name of the embedding model passed to the Voyage AI API.
        batch_size: Maximum number of texts sent per API call; must be a
            positive integer.

    Returns:
        A list with exactly one entry per input text, in input order. Each
        entry is the embedding returned by the API, or None when the batch
        containing that text failed.

    Raises:
        ValueError: If batch_size is not positive.
    """
    # A zero step makes range() fail with an unrelated message, and a negative
    # step would silently yield no batches (and therefore no results at all).
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Pre-process all texts to replace newlines
    texts = [text.replace("\n", " ") for text in texts]
    total = len(texts)
    all_embeddings = []

    for start in range(0, total, batch_size):
        batch = texts[start:start + batch_size]
        try:
            response = voyageai.Embedding.create(input=batch, model=model)
            batch_embeddings = [item['embedding'] for item in response['data']]
            # A short or long response would shift every later embedding onto
            # the wrong text, so treat a count mismatch as a failed batch.
            if len(batch_embeddings) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} embeddings, got {len(batch_embeddings)}"
                )
        except Exception as e:
            # Best-effort: report the failure and keep positions aligned by
            # padding the failed batch with None placeholders.
            print(f"Error in batch {start//batch_size + 1}: {e}")
            all_embeddings.extend([None] * len(batch))
        else:
            all_embeddings.extend(batch_embeddings)
            # Sleep briefly to avoid rate limits (skipped after the last batch)
            if start + batch_size < total:
                time.sleep(0.5)

    return all_embeddings
def create_product_embeddings(products: List[str], batch_size: int = 100, progress=None) -> Dict[str, Any]:
    """
    Build an embedding lookup for product names, embedding each distinct name once.

    Args:
        products: Product names to embed; repeated names are sent to the API only once
        batch_size: Upper bound on how many names go into a single embedding request
        progress: Optional progress object handed to SafeProgress (Gradio progress bar)

    Returns:
        Dictionary mapping each product name to {"embedding": <vector>}; names for
        which no embedding was obtained are left out
    """
    report = SafeProgress(progress, desc="Generating embeddings")
    n_total = len(products)

    # Same model as used for ingredients (voyage-3-large)
    embedding_model = "voyage-3-large"

    report(0.1, desc=f"Starting embeddings for {n_total} products")

    # Collapse duplicates while keeping first-occurrence order
    distinct = list(dict.fromkeys(products))
    report(0.2, desc=f"Found {len(distinct)} unique products out of {n_total} total")

    if not distinct:
        report(1.0, desc="No valid products to process")
        return {}

    result = {}
    try:
        # Flatten newlines before sending the names to the API
        cleaned = [name.replace("\n", " ") for name in distinct]
        report(0.3, desc=f"Calling VoyageAI API for {len(cleaned)} unique products")

        vectors = get_embeddings_batch(cleaned, model=embedding_model, batch_size=batch_size)

        report(0.8, desc="Mapping embeddings back to all products")
        # Key the result by the original (uncleaned) name; skip failed entries
        for name, vector in zip(distinct, vectors):
            if vector is not None:
                result[name] = {"embedding": vector}

        report(0.9, desc="Processing embeddings completed")
    except Exception as exc:
        # Any failure is reported and whatever was collected so far is returned
        report(0.9, desc=f"Error generating embeddings: {str(exc)}")
        print(f"Error generating product embeddings: {exc}")

    report(1.0, desc=f"Completed embeddings for {len(result)} products")
    return result
def _generate_embeddings_for_batch(batch: List[str]) -> Dict[str, Any]: | |
""" | |
Generate embeddings for a batch of products | |
""" | |
# This is a placeholder for your actual embedding generation logic | |
# Replace with your actual implementation | |
import time | |
# Your existing embedding code should go here instead of this placeholder | |
embeddings = {} | |
for product in batch: | |
# Replace with actual embedding creation | |
embeddings[product] = {"embedding": [0.1, 0.2, 0.3]} | |
return embeddings | |