# product_ingredient_demo / api_utils.py
# (Initial commit 31ebc8b)
import os
import concurrent.futures
from typing import List, Dict, Callable, Any, Tuple
from openai import OpenAI
import voyageai
from utils import SafeProgress
import json
# Centralized API clients
def get_openai_client():
    """Build an OpenAI client authenticated via the OPENAI_API_KEY env var."""
    return OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
def get_voyage_client():
    """Build a Voyage AI client (credentials are resolved by the voyageai library)."""
    client = voyageai.Client()
    return client
# General batch processing utilities
def process_batch(items_batch: List[Any], processor_func: Callable) -> Dict:
    """
    Run processor_func over every item in a batch, collecting results.

    Args:
        items_batch: Items to process sequentially
        processor_func: Callable taking one item and returning a (key, value) pair

    Returns:
        Dict mapping each returned key to its value. Items whose processing
        raised are logged and mapped (keyed by the item itself) to an empty
        list, so one failure never aborts the batch.
    """
    collected: Dict = {}
    for entry in items_batch:
        try:
            result_key, result_value = processor_func(entry)
            collected[result_key] = result_value
        except Exception as e:
            # Best-effort: record the failure and keep going
            print(f"Error processing batch item '{entry}': {e}")
            collected[entry] = []
    return collected
def process_in_parallel(
    items: List[Any],
    processor_func: Callable,
    max_workers: int = 10,
    progress_tracker: Any = None,
    progress_start: float = 0.0,
    progress_end: float = 1.0,
    progress_desc: str = "Processing in parallel"
) -> Dict:
    """
    Process items in parallel using a thread pool.

    Args:
        items: List of items to process
        processor_func: Function that processes a single item and returns (key, value)
        max_workers: Maximum number of threads
        progress_tracker: Optional callable invoked as tracker(fraction, desc=...)
        progress_start: Starting progress fraction (0.0-1.0)
        progress_end: Ending progress fraction (0.0-1.0)
        progress_desc: Description prefix for the progress tracker

    Returns:
        Combined results dictionary merged from all batches
    """
    # Fix: an empty item list previously crashed — min(max_workers, 0) made
    # ThreadPoolExecutor(max_workers=0) raise ValueError (and the batch-size
    # integer division would divide by zero). Short-circuit instead.
    if not items:
        return {}
    # Never spin up more workers than there are items
    max_workers = min(max_workers, len(items))
    # Split items into roughly equal batches, about one per worker
    batch_size = max(1, len(items) // max_workers)
    batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_batch = {executor.submit(process_batch, batch, processor_func): i
                           for i, batch in enumerate(batches)}
        for i, future in enumerate(concurrent.futures.as_completed(future_to_batch)):
            batch_index = future_to_batch[future]
            # Report completed-batch count scaled into [progress_start, progress_end]
            if progress_tracker:
                progress_percent = progress_start + ((progress_end - progress_start) * (i+1) / len(batches))
                progress_tracker(progress_percent, desc=f"{progress_desc}: batch {batch_index+1}/{len(batches)}")
            try:
                batch_results = future.result()
                results.update(batch_results)
            except Exception as e:
                # A whole batch failing is logged but does not abort the rest
                print(f"Error processing batch {batch_index}: {e}")
    return results
def openai_structured_query(
    prompt: str,
    system_message: str = "You are a helpful assistant.",
    schema: dict = None,
    model: str = "o3-mini",
    client=None,
    schema_name: str = "structured_output"
) -> dict:
    """
    Issue an OpenAI Responses API call that enforces a strict JSON-schema output.

    Args:
        prompt: The user prompt
        system_message: System message guiding the model
        schema: JSON schema the response must conform to
        model: OpenAI model identifier
        client: Pre-configured OpenAI client; created on demand when None
        schema_name: Identifier attached to the schema

    Returns:
        The model output parsed from JSON into a dictionary

    Raises:
        Exception: Re-raises any API or JSON-parsing failure after logging it
    """
    if client is None:
        client = get_openai_client()
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt}
    ]
    text_config = {
        "format": {
            "type": "json_schema",
            "name": schema_name,
            "schema": schema,
            "strict": True
        }
    }
    try:
        response = client.responses.create(
            model=model,
            input=conversation,
            text=text_config
        )
        return json.loads(response.output_text)
    except Exception as e:
        print(f"Error in OpenAI structured query: {e}")
        raise
def rank_ingredients_openai(
    product: str,
    candidates: List[str],
    expanded_description: str = None,
    client=None,
    model: str = "o3-mini",
    max_results: int = 3,
    confidence_threshold: float = 0.5,
    debug: bool = False
) -> List[Tuple[str, float]]:
    """
    Rank candidate ingredients for a product via an OpenAI structured query.

    Args:
        product: Product name
        candidates: List of candidate ingredients
        expanded_description: Optional expanded product description
        client: Optional pre-configured client; created on demand when None
        model: OpenAI model to use
        max_results: Maximum number of results requested from the model
        confidence_threshold: Minimum confidence threshold (enforced via prompt)
        debug: Whether to print debug info

    Returns:
        List of (ingredient, confidence) tuples; empty list when there are no
        candidates or the API call fails.
    """
    if not candidates:
        return []
    if client is None:
        client = get_openai_client()
    if debug:
        print(f"Ranking for product: {product} with {len(candidates)} candidates")
    # Assemble the prompt; the expanded description is optional
    prompt_parts = [f"Product: {product}"]
    if expanded_description:
        prompt_parts.append(f"\n\nExpanded description: {expanded_description}")
    prompt_parts.append(f"\n\nPotential ingredients: {', '.join(candidates)}")
    prompt = "".join(prompt_parts)
    # Strict output schema for the rankings array
    ranking_schema = {
        "type": "object",
        "properties": {
            "rankings": {
                "type": "array",
                "description": f"Only the top {max_results} most relevant ingredients with scores >= {confidence_threshold}",
                "items": {
                    "type": "object",
                    "properties": {
                        "ingredient": {
                            "type": "string",
                            "description": "The name of the ingredient"
                        },
                        "relevance_score": {
                            "type": "number",
                            "description": "Score between 0 and 1 indicating relevance"
                        },
                        "explanation": {
                            "type": "string",
                            "description": "Brief explanation for the matching"
                        }
                    },
                    "required": ["ingredient", "relevance_score", "explanation"],
                    "additionalProperties": False
                }
            }
        },
        "required": ["rankings"],
        "additionalProperties": False
    }
    try:
        # Make the API call directly for more control
        response = client.responses.create(
            model=model,
            reasoning={"effort": "low"},  # Include effort parameter from ui_expanded_matching
            input=[
                {"role": "system", "content": f"You are a food ingredient matching expert. Rank the top {max_results} ingredient based on how well they match the given product. Only include ingredients with relevance score >= {confidence_threshold}."},
                {"role": "user", "content": prompt}
            ],
            text={
                "format": {
                    "type": "json_schema",
                    "name": "ingredient_ranking",
                    "schema": ranking_schema,
                    "strict": True
                }
            }
        )
        parsed = json.loads(response.output_text)
        # Flatten the structured rankings into (ingredient, score) pairs
        ingredients = [(entry["ingredient"], float(entry["relevance_score"]))
                       for entry in parsed["rankings"]]
        if debug:
            print(f"Ranking results for {product}: {len(ingredients)} ingredients")
            if ingredients:
                print(f"Top match: {ingredients[0]}")
        return ingredients
    except Exception as e:
        print(f"Error ranking ingredients for '{product}': {e}")
        return []
def rank_categories_openai(
    product: str,
    categories: dict,
    expanded_description: str = None,
    client=None,
    model: str = "o3-mini",
    max_results: int = 5,
    confidence_threshold: float = 0.5,
    debug: bool = False
) -> List[Tuple[str, float]]:
    """
    Rank food categories for a product using OpenAI.

    Args:
        product: Product name
        categories: Mapping of category id -> either a description string, a
            dict with a 'description' key, or anything else (id alone is used)
        expanded_description: Accepted for interface compatibility, but
            currently NOT included in the prompt
        client: Optional pre-configured client; created on demand when None
        model: OpenAI model to use
        max_results: Maximum number of results (not currently sent to the model;
            the schema asks for the single top category)
        confidence_threshold: Minimum confidence threshold (enforced via prompt)
        debug: Whether to print debug info

    Returns:
        List of (category, confidence) tuples; empty list when there are no
        categories or the API call fails.
    """
    if not categories:
        return []
    if client is None:
        client = get_openai_client()
    if debug:
        print(f"Category ranking for product: {product}")
    # Format categories for the prompt - handle both string and dict formats
    categories_text = ""
    for category_id, category_data in categories.items():
        if isinstance(category_data, str):
            # Simple string description
            categories_text += f"- {category_id}: {category_data}\n"
        elif isinstance(category_data, dict) and 'description' in category_data:
            # Dictionary with description field
            categories_text += f"- {category_id}: {category_data['description']}\n"
        else:
            # Default case - just use the ID
            categories_text += f"- {category_id}\n"
    prompt = f"Product: {product}"
    prompt += f"\n\nAvailable food categories:\n{categories_text}"
    # Structured-output schema; 'reasoning' comes first to encourage the model
    # to think before committing to a category.
    ranking_schema = {
        "type": "object",
        "properties": {
            "rankings": {
                "type": "array",
                "description": f"Only the top most relevant category with scores >= {confidence_threshold}",
                "items": {
                    "type": "object",
                    "properties": {
                        "reasoning": {
                            "type": "string",
                            # Fixed malformed text (was "Reasoning, , step by step, ...")
                            "description": "Reasoning, step by step: first weigh options, then consider the best match"
                        },
                        "category": {
                            "type": "string",
                            "description": "The name of the food category"
                        },
                        "relevance_score": {
                            "type": "number",
                            "description": "Score between 0 and 1 indicating relevance"
                        },
                    },
                    "required": ["category", "relevance_score", "reasoning"],
                    "additionalProperties": False
                }
            }
        },
        "required": ["rankings"],
        "additionalProperties": False
    }
    try:
        # Make the API call
        response = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": f"You are a food categorization expert. Think this through step by step: Rank the top category based on how well it match the given product. Only include categories with relevance score >= {confidence_threshold}."},
                {"role": "user", "content": prompt}
            ],
            text={
                "format": {
                    "type": "json_schema",
                    "name": "category_ranking",
                    "schema": ranking_schema,
                    "strict": True
                }
            }
        )
        result = json.loads(response.output_text)
        # Distinct name: the original reused (shadowed) the 'categories'
        # parameter for this results list
        ranked = []
        for item in result["rankings"]:
            ranked.append((item["category"], float(item["relevance_score"])))
        if debug:
            print(f"Category results for {product}: {len(ranked)} categories")
            if ranked:
                print(f"Top match: {ranked[0]}")
        return ranked
    except Exception as e:
        print(f"Error categorizing {product}: {e}")
        if debug:
            import traceback
            traceback.print_exc()
        return []