Spaces:
Sleeping
Sleeping
import os
import time

import numpy as np
import voyageai
# Read the Voyage AI API key from the environment rather than hard-coding it.
# SECURITY: a literal key was previously committed on this line; treat that
# key as compromised and revoke it. Supply the replacement through the
# VOYAGE_API_KEY environment variable (never through source control).
voyageai.api_key = os.environ.get("VOYAGE_API_KEY")
def get_embeddings_batch(texts, model="voyage-3-large", batch_size=100):
    """Embed a list of texts with Voyage AI, one API request per batch.

    Newlines in every text are replaced with spaces before the request is
    made. A batch whose request fails, or whose response does not contain
    exactly one embedding per input, is reported on stdout and contributes
    ``None`` placeholders, so the result always lines up index-for-index
    with ``texts``.

    Args:
        texts: Strings to embed.
        model: Name of the embedding model passed to the API.
        batch_size: Maximum number of texts per request; must be >= 1.

    Returns:
        A list with one entry per input text: the embedding returned by the
        API, or ``None`` when the batch containing that text failed.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A non-positive step would either crash range() or silently yield no
    # batches at all (returning [] for non-empty input), so reject it early.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    texts = [text.replace("\n", " ") for text in texts]
    all_embeddings = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        try:
            response = voyageai.Embedding.create(input=batch, model=model)
            batch_embeddings = [item['embedding'] for item in response['data']]
            # A short or long response would shift every later embedding onto
            # the wrong text; treat it like any other failed batch.
            if len(batch_embeddings) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} embeddings, got {len(batch_embeddings)}"
                )
        except Exception as e:
            # Deliberately best-effort: one bad batch must not abort the run.
            print(f"Error in batch {start//batch_size + 1}: {e}")
            all_embeddings.extend([None] * len(batch))
        else:
            all_embeddings.extend(batch_embeddings)

        # Pause between requests to stay under rate limits. This runs after a
        # failed batch too, since a failure is when backing off matters most.
        if start + batch_size < len(texts):
            time.sleep(0.5)

    return all_embeddings
def create_product_embeddings(products, batch_size=100):
    """Return a dict mapping each distinct product to its embedding.

    Repeated products are collapsed (keeping first-seen order) so that each
    distinct product is embedded only once via ``get_embeddings_batch``.
    Products whose embedding came back as ``None`` are left out of the
    result. An empty input yields an empty dict without requesting any
    embeddings.

    Args:
        products: Sequence of hashable products to embed.
        batch_size: Forwarded to ``get_embeddings_batch``.

    Returns:
        Dict of product -> embedding for every product that was embedded.
    """
    # dict.fromkeys drops repeats while preserving first-occurrence order.
    distinct = list(dict.fromkeys(products))
    print(f"Found {len(distinct)} unique products out of {len(products)} total")
    if not distinct:
        return {}

    print(f"Processing {len(distinct)} unique products")
    vectors = get_embeddings_batch(distinct, batch_size=batch_size)

    # zip stops at the shorter sequence, so products beyond the end of a
    # short result list are simply omitted.
    embedded = {
        product: vector
        for product, vector in zip(distinct, vectors)
        if vector is not None
    }
    print(f"Created embeddings for {len(embedded)} products")
    return embedded