Spaces:
Sleeping
Sleeping
File size: 2,645 Bytes
a318724 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
import os
import time

import numpy as np
import voyageai
# Read the Voyage AI API key from the environment instead of embedding it in
# the source: a key committed to a shared file is readable by anyone with
# access to that file. The key that was previously hard-coded here should be
# treated as compromised and rotated.
# NOTE(review): when VOYAGE_API_KEY is unset this assigns None; confirm how
# the installed voyageai version reports a missing key.
voyageai.api_key = os.environ.get("VOYAGE_API_KEY")
def get_embeddings_batch(
    texts,
    model="voyage-3-large",
    batch_size=100,
    pause_seconds=0.5,
):
    """Get embeddings for a list of texts in batches.

    Newlines in every text are replaced with spaces, then the texts are sent
    to ``voyageai.Embedding.create`` in consecutive slices of at most
    ``batch_size`` items.

    Args:
        texts: Sequence of strings to embed.
        model: Model name passed through to the embedding call.
        batch_size: Maximum number of texts per request; must be at least 1.
        pause_seconds: Time to wait between consecutive requests.

    Returns:
        A list with exactly one entry per input text, in input order. Each
        entry is the ``'embedding'`` value returned for that text, or ``None``
        if the request for its batch failed.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # A negative step would make range() empty and silently return [] for a
    # non-empty input, breaking the one-entry-per-text contract.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Pre-process all texts to replace newlines
    cleaned = [text.replace("\n", " ") for text in texts]
    total = len(cleaned)

    all_embeddings = []
    for start in range(0, total, batch_size):
        batch = cleaned[start:start + batch_size]
        try:
            response = voyageai.Embedding.create(input=batch, model=model)
            batch_embeddings = [item['embedding'] for item in response['data']]
            # Callers map results back by position, so a short or long
            # response must not be allowed to shift every later index.
            if len(batch_embeddings) != len(batch):
                raise ValueError(
                    f"expected {len(batch)} embeddings, "
                    f"got {len(batch_embeddings)}"
                )
        except Exception as e:
            # Best effort: report the failure and keep going, using None
            # placeholders so the output stays aligned with the input.
            print(f"Error in batch {start // batch_size + 1}: {e}")
            batch_embeddings = [None] * len(batch)
        all_embeddings.extend(batch_embeddings)

        # Pause between requests to avoid rate limits. This also runs after a
        # failed batch, since a failure may itself be a rate-limit response.
        if start + batch_size < total:
            time.sleep(pause_seconds)
    return all_embeddings
def create_product_embeddings(products, batch_size=100):
    """Embed each distinct product once and return a product -> embedding dict.

    Repeated products are collapsed before calling ``get_embeddings_batch`` so
    every distinct value is sent only once. Products whose embedding came back
    as ``None`` are left out of the result. An empty input yields ``{}``.
    """
    # dict.fromkeys drops repeats while keeping first-seen order.
    distinct = list(dict.fromkeys(products))
    print(f"Found {len(distinct)} unique products out of {len(products)} total")
    if not distinct:
        return {}

    print(f"Processing {len(distinct)} unique products")
    vectors = get_embeddings_batch(distinct, batch_size=batch_size)

    # zip stops at the shorter sequence, so a product without a corresponding
    # embedding entry is skipped rather than raising.
    embedded = {
        name: vector
        for name, vector in zip(distinct, vectors)
        if vector is not None
    }
    print(f"Created embeddings for {len(embedded)} products")
    return embedded
|