Spaces:

eliago
/

product_ingredient_demo

Sleeping

App Files Files Community

product_ingredient_demo / embeddings.py

esilver

Initial commit

31ebc8b 3 months ago

raw

history blame

4.96 kB

	from typing import List, Dict, Any, Optional
	from utils import SafeProgress
	import os
	import voyageai
	import time
	import numpy as np
	from concurrent.futures import ThreadPoolExecutor

	# Read the Voyage AI API key from the VOYAGE_API_KEY environment variable;
	# the attribute ends up as None when that variable is not set.
	voyageai.api_key = os.environ.get("VOYAGE_API_KEY")

	def get_embeddings_batch(texts, model="voyage-3-large", batch_size=100):
	    """Get embeddings for a list of texts, one API request per batch.

	    Newlines in every text are replaced with spaces before anything is sent.
	    A batch whose request raises is reported on stdout and contributes one
	    ``None`` per text, so later batches keep their positions in the result.

	    Args:
	        texts: Sized sequence of strings to embed.
	        model: Model name passed through to ``voyageai.Embedding.create``.
	        batch_size: Number of texts sent per request; must be positive.

	    Returns:
	        A list built batch by batch: the ``'embedding'`` values from each
	        successful response, or ``None`` placeholders (one per text) for a
	        batch that failed.

	    Raises:
	        ValueError: If ``batch_size`` is zero or negative.
	    """
	    # A negative step would make range() yield nothing and silently drop
	    # every input; a zero step fails with an unhelpful range() message.
	    if batch_size <= 0:
	        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

	    # Pre-process all texts to replace newlines
	    cleaned = [text.replace("\n", " ") for text in texts]
	    total = len(cleaned)
	    all_embeddings = []

	    for start in range(0, total, batch_size):
	        batch = cleaned[start:start + batch_size]

	        try:
	            response = voyageai.Embedding.create(input=batch, model=model)
	            # Build the full list first so a malformed response cannot leave
	            # a partially-extended result behind.
	            batch_embeddings = [item['embedding'] for item in response['data']]
	        except Exception as e:
	            print(f"Error in batch {start // batch_size + 1}: {e}")
	            # Keep positions aligned: one placeholder per text in the failed batch
	            all_embeddings.extend([None] * len(batch))
	        else:
	            all_embeddings.extend(batch_embeddings)

	            # Sleep briefly to avoid rate limits, but only if more batches remain
	            if start + batch_size < total:
	                time.sleep(0.5)

	    return all_embeddings

	def create_product_embeddings(products: List[str], batch_size: int = 100, progress=None) -> Dict[str, Any]:
	    """
	    Embed product names, sending each distinct name to the API only once.

	    Args:
	        products: List of product names; repeated names are embedded once
	        batch_size: Batch size forwarded to get_embeddings_batch
	        progress: Optional progress tracking object (Gradio progress bar),
	            wrapped in SafeProgress

	    Returns:
	        Dictionary mapping each product name to {"embedding": <vector>}.
	        Names whose embedding came back as None are left out. If embedding
	        generation raises, the error is printed and whatever was collected
	        so far (possibly nothing) is returned.
	    """
	    report = SafeProgress(progress, desc="Generating embeddings")
	    product_count = len(products)
	    results = {}

	    report(0.1, desc=f"Starting embeddings for {product_count} products")

	    # dict.fromkeys drops repeats while keeping first-seen order
	    distinct = list(dict.fromkeys(products))
	    report(0.2, desc=f"Found {len(distinct)} unique products out of {product_count} total")

	    if not distinct:
	        report(1.0, desc="No valid products to process")
	        return {}

	    try:
	        # Newlines are flattened to spaces before the names are sent out
	        flattened = [name.replace("\n", " ") for name in distinct]
	        report(0.3, desc=f"Calling VoyageAI API for {len(flattened)} unique products")

	        # Same model as used for ingredients (voyage-3-large)
	        vectors = get_embeddings_batch(flattened, model="voyage-3-large", batch_size=batch_size)

	        report(0.8, desc="Mapping embeddings back to all products")
	        # Keys are the original names, not the newline-flattened ones
	        for name, vector in zip(distinct, vectors):
	            if vector is not None:
	                results[name] = {"embedding": vector}

	        report(0.9, desc="Processing embeddings completed")
	    except Exception as e:
	        report(0.9, desc=f"Error generating embeddings: {str(e)}")
	        print(f"Error generating product embeddings: {e}")

	    report(1.0, desc=f"Completed embeddings for {len(results)} products")
	    return results

	def _generate_embeddings_for_batch(batch: List[str]) -> Dict[str, Any]:
	"""
	Generate embeddings for a batch of products
	"""
	# This is a placeholder for your actual embedding generation logic
	# Replace with your actual implementation
	import time

	# Your existing embedding code should go here instead of this placeholder
	embeddings = {}
	for product in batch:
	# Replace with actual embedding creation
	embeddings[product] = {"embedding": [0.1, 0.2, 0.3]}

	return embeddings