File size: 2,645 Bytes
a318724
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os

import voyageai
import time
import numpy as np

# SECURITY FIX: an API secret was previously hardcoded here in plain text.
# Read it from the environment instead; the old key should be revoked and
# rotated, since it has been committed to source control.
voyageai.api_key = os.environ.get("VOYAGE_API_KEY", "")

def get_embeddings_batch(texts, model="voyage-3-large", batch_size=100):
    """Get embeddings for a list of texts in batches.

    Args:
        texts: Strings to embed. Newlines are replaced with spaces before
            submission.
        model: Voyage AI embedding model name.
        batch_size: Number of texts sent per API request.

    Returns:
        A list of embeddings aligned one-to-one with ``texts``. Every entry
        belonging to a failed batch is ``None`` so callers can filter them.
    """
    all_embeddings = []

    # Pre-process all texts to replace newlines before batching.
    texts = [text.replace("\n", " ") for text in texts]

    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]

        try:
            # NOTE(review): `voyageai.Embedding.create` is the legacy
            # OpenAI-style interface — confirm it matches the installed
            # voyageai client version (newer versions use a Client object).
            response = voyageai.Embedding.create(input=batch, model=model)
            batch_embeddings = [item['embedding'] for item in response['data']]
            all_embeddings.extend(batch_embeddings)

            # Sleep briefly to avoid rate limits; skipped after the final batch.
            if i + batch_size < len(texts):
                time.sleep(0.5)

        except Exception as e:
            # Best-effort: report the failure but keep positional alignment
            # with the input by padding the failed batch with None.
            print(f"Error in batch {i//batch_size + 1}: {e}")
            all_embeddings.extend([None] * len(batch))

    return all_embeddings

def create_product_embeddings(products, batch_size=100):
    """Create embeddings for products using batch processing with deduplication"""
    # Collapse duplicates while recording, for every original position,
    # where its product landed in the deduplicated list.
    position_of = {}        # product -> index within the deduped list
    deduped = []            # unique products, first-seen order
    origin_to_unique = {}   # original index -> index within the deduped list

    for pos, item in enumerate(products):
        if item not in position_of:
            position_of[item] = len(deduped)
            deduped.append(item)
        origin_to_unique[pos] = position_of[item]

    print(f"Found {len(deduped)} unique products out of {len(products)} total")

    if not deduped:
        return {}

    # Only the unique products are sent to the embedding API.
    print(f"Processing {len(deduped)} unique products")

    vectors = get_embeddings_batch(deduped, batch_size=batch_size)

    # Fan the unique embeddings back out over every original product,
    # skipping entries whose batch failed (None placeholders).
    result = {}
    for pos, item in enumerate(products):
        u = origin_to_unique[pos]
        if u < len(vectors) and vectors[u] is not None:
            result[item] = vectors[u]

    print(f"Created embeddings for {len(result)} products")

    return result