import os
import time
from typing import Any, Dict, List, Optional

import numpy as np
import voyageai

# SECURITY: an API key was committed directly in this file. It should be
# rotated and supplied via the VOYAGE_API_KEY environment variable instead.
# The literal is kept only as a fallback so existing deployments keep working;
# remove it once the environment variable is configured everywhere.
voyageai.api_key = os.environ.get(
    "VOYAGE_API_KEY",
    "pa-DvIuCX_5TrCyxS6y74sUYpyWWGd4gN0Kf52y642y6k0",
)


def get_embeddings_batch(
    texts: List[str],
    model: str = "voyage-3-large",
    batch_size: int = 100,
) -> List[Optional[Any]]:
    """Get embeddings for a list of texts, sending them in batches.

    Newlines in every text are replaced with a single space before the
    request is made.

    Args:
        texts: Strings to embed.
        model: Model name forwarded to ``voyageai.Embedding.create``.
        batch_size: Maximum number of texts per request; must be >= 1.

    Returns:
        A list with exactly one entry per input text, in input order. Each
        entry is the ``'embedding'`` value from the API response, or ``None``
        for every text of a batch whose request failed.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A zero step makes range() raise and a negative step silently yields
        # no batches at all; reject both explicitly.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    cleaned = [text.replace("\n", " ") for text in texts]
    total = len(cleaned)
    all_embeddings: List[Optional[Any]] = []

    for start in range(0, total, batch_size):
        batch = cleaned[start:start + batch_size]
        try:
            response = voyageai.Embedding.create(input=batch, model=model)
            batch_embeddings = [item['embedding'] for item in response['data']]
            if len(batch_embeddings) != len(batch):
                # A short or long response would shift every later embedding
                # onto the wrong text, so treat it as a failed batch.
                raise ValueError(
                    f"expected {len(batch)} embeddings, "
                    f"got {len(batch_embeddings)}"
                )
        except Exception as e:
            # Best effort: report the failure and keep positions aligned by
            # filling the failed batch with None placeholders.
            print(f"Error in batch {start//batch_size + 1}: {e}")
            all_embeddings.extend([None] * len(batch))
            continue

        all_embeddings.extend(batch_embeddings)

        # Sleep briefly between successful batches to avoid rate limits;
        # there is nothing to wait for after the last one.
        if start + batch_size < total:
            time.sleep(0.5)

    return all_embeddings


def create_product_embeddings(
    products: List[str],
    batch_size: int = 100,
) -> Dict[str, Any]:
    """Create embeddings for products, embedding each distinct product once.

    Args:
        products: Product strings; duplicates are allowed and are only sent
            for embedding once.
        batch_size: Batch size forwarded to ``get_embeddings_batch``.

    Returns:
        A dict mapping each distinct product to its embedding, ordered by
        first occurrence in ``products``. Products whose batch failed (and
        therefore have no embedding) are omitted. An empty input yields an
        empty dict.
    """
    # dict.fromkeys keeps first-occurrence order while dropping duplicates.
    unique_products = list(dict.fromkeys(products))
    print(f"Found {len(unique_products)} unique products out of {len(products)} total")

    if not unique_products:
        return {}

    print(f"Processing {len(unique_products)} unique products")
    unique_embeddings = get_embeddings_batch(unique_products, batch_size=batch_size)

    # Every duplicate shares its unique product's embedding, so keying the
    # result by product covers all original entries; failed ones are skipped.
    all_products_dict = {
        product: embedding
        for product, embedding in zip(unique_products, unique_embeddings)
        if embedding is not None
    }

    print(f"Created embeddings for {len(all_products_dict)} products")
    return all_products_dict