product_ingredient_demo / embeddings.py
esilver's picture
refactored
a318724
raw
history blame
2.65 kB
import os
import time

import numpy as np
import voyageai
# Read the Voyage AI API key from the environment instead of committing a
# secret to source control. Export VOYAGE_API_KEY before importing this module;
# when it is unset this assigns None.
# NOTE(review): a literal key was previously hard-coded on this line and has
# been published with the file -- it should be revoked and rotated.
voyageai.api_key = os.environ.get("VOYAGE_API_KEY")
def get_embeddings_batch(texts, model="voyage-3-large", batch_size=100):
    """Get embeddings for a list of texts in batches.

    Args:
        texts: Sequence of strings to embed. Newlines in each text are
            replaced with spaces before the request is sent.
        model: Model name passed to ``voyageai.Embedding.create``.
        batch_size: Maximum number of texts sent per request; must be
            positive.

    Returns:
        A list with exactly one entry per input text, in input order. Each
        entry is the ``'embedding'`` value from the response, or ``None`` for
        every text of a batch that failed.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # range() rejects a zero step, and a negative step would silently skip
        # every batch and return fewer results than there are inputs.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Pre-process all texts to replace newlines
    cleaned = [text.replace("\n", " ") for text in texts]
    total = len(cleaned)
    all_embeddings = []

    for start in range(0, total, batch_size):
        batch = cleaned[start:start + batch_size]
        try:
            response = voyageai.Embedding.create(input=batch, model=model)
            batch_embeddings = [item['embedding'] for item in response['data']]
            if len(batch_embeddings) != len(batch):
                # A short or long response cannot be aligned with its inputs;
                # treat the whole batch as failed rather than shifting results.
                raise ValueError(
                    f"expected {len(batch)} embeddings, got {len(batch_embeddings)}"
                )
        except Exception as e:
            # Deliberate best effort: report the failure and keep going so the
            # remaining batches are still processed.
            print(f"Error in batch {start // batch_size + 1}: {e}")
            # Add empty embeddings for failed batch
            all_embeddings.extend([None] * len(batch))
        else:
            all_embeddings.extend(batch_embeddings)
            # Sleep briefly to avoid rate limits (skipped after the last batch)
            if start + batch_size < total:
                time.sleep(0.5)

    return all_embeddings
def create_product_embeddings(products, batch_size=100):
    """Embed each distinct product once and map products to their embeddings.

    Args:
        products: Sequence of hashable product values; repeats are allowed.
        batch_size: Forwarded to ``get_embeddings_batch``.

    Returns:
        A dict keyed by product, holding the embedding returned for it.
        Products whose embedding came back as ``None`` are left out. An empty
        dict is returned when there are no products.
    """
    # dict.fromkeys drops repeats while keeping first-seen order.
    distinct = list(dict.fromkeys(products))
    print(f"Found {len(distinct)} unique products out of {len(products)} total")

    if not distinct:
        return {}

    # Only the distinct products are sent for embedding.
    print(f"Processing {len(distinct)} unique products")
    vectors = get_embeddings_batch(distinct, batch_size=batch_size)

    # Pair each distinct product with its embedding, skipping failed ones.
    embedded = {
        name: vector
        for name, vector in zip(distinct, vectors)
        if vector is not None
    }

    print(f"Created embeddings for {len(embedded)} products")
    return embedded