Spaces:
Sleeping
Sleeping
import os | |
import concurrent.futures | |
from typing import List, Dict, Callable, Any, Tuple | |
from openai import OpenAI | |
import voyageai | |
from utils import SafeProgress | |
import json | |
# Centralized API clients
def get_openai_client():
    """Build an OpenAI client from the ``OPENAI_API_KEY`` environment variable.

    The key is looked up with ``os.environ.get``, so when the variable is
    unset ``None`` is handed to the ``OpenAI`` constructor unchanged.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    client = OpenAI(api_key=api_key)
    return client
def get_voyage_client():
    """Create a Voyage AI client by calling ``voyageai.Client()`` with no arguments.

    No credentials are passed explicitly here; presumably the library resolves
    them itself (e.g. from the environment) - confirm against the voyageai docs.
    """
    client = voyageai.Client()
    return client
# General batch processing utilities
def process_batch(items_batch: List[Any], processor_func: Callable) -> Dict:
    """
    Process a batch of items sequentially, isolating failures per item.

    Args:
        items_batch: List of items to process
        processor_func: Function that processes a single item and returns (key, value)

    Returns:
        Dictionary mapping every returned key to its value. An item whose
        processing raised is reported on stdout and recorded with an empty
        list, keyed by the item itself, or by ``repr(item)`` when the item is
        unhashable and therefore cannot be a dictionary key.
    """
    results = {}
    for item in items_batch:
        try:
            key, value = processor_func(item)
            results[key] = value
        except Exception as e:
            print(f"Error processing batch item '{item}': {e}")
            try:
                results[item] = []
            except TypeError:
                # Unhashable items (lists, dicts, ...) would otherwise raise
                # from inside this handler and abort the whole batch.
                results[repr(item)] = []
    return results
def process_in_parallel(
    items: List[Any],
    processor_func: Callable,
    max_workers: int = 10,
    progress_tracker: Any = None,
    progress_start: float = 0.0,
    progress_end: float = 1.0,
    progress_desc: str = "Processing in parallel"
) -> Dict:
    """
    Process items in parallel using a thread pool while preserving input order.

    Args:
        items: List of items to process
        processor_func: Function that processes a single item and returns (key, value)
        max_workers: Maximum number of threads (clamped to 1..len(items))
        progress_tracker: Optional callable invoked as
            ``progress_tracker(fraction, desc=...)`` after each completed item
        progress_start: Starting progress fraction (0.0-1.0)
        progress_end: Ending progress fraction (0.0-1.0)
        progress_desc: Description prefix for the progress tracker

    Returns:
        Dictionary of ``key -> value`` in the order of the input items. Items
        whose processing raised, or whose value is ``None``, are omitted.
        An empty input yields an empty dictionary.
    """
    total_count = len(items)
    if total_count == 0:
        # ThreadPoolExecutor rejects max_workers=0, so never build a pool
        # for an empty workload.
        return {}

    # No point in more threads than items; never fewer than one.
    max_workers = max(1, min(max_workers, total_count))

    def process_with_index(idx, item):
        """Run the processor and tag the outcome with the item's position."""
        try:
            key, value = processor_func(item)
        except Exception as e:
            print(f"Error processing item '{item}': {e}")
            # A None value marks the item as failed; it is dropped below.
            return (idx, item, None)
        return (idx, key, value)

    indexed_results = []  # (idx, key, value) tuples, in completion order
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_with_index, idx, item)
            for idx, item in enumerate(items)
        ]
        completed = concurrent.futures.as_completed(futures)
        for completed_count, future in enumerate(completed, start=1):
            # Update progress if tracker provided
            if progress_tracker:
                span = progress_end - progress_start
                progress_percent = progress_start + (span * completed_count / total_count)
                progress_tracker(
                    progress_percent,
                    desc=f"{progress_desc}: {completed_count}/{total_count}",
                )
            try:
                result = future.result()
            except Exception as e:
                print(f"Error processing item: {e}")
                continue
            if result[2] is not None:
                indexed_results.append(result)

    # Restore the original input order before building the result mapping.
    indexed_results.sort(key=lambda entry: entry[0])
    return {key: value for _, key, value in indexed_results}
def openai_structured_query(
    prompt: str,
    system_message: str = "You are a helpful assistant.",
    schema: dict = None,
    model: str = "gpt-4o-mini",
    client=None,
    schema_name: str = "structured_output"
) -> dict:
    """
    Send one system + user exchange to OpenAI and return the JSON it produces.

    Args:
        prompt: The user prompt
        system_message: The system message to guide the model
        schema: JSON schema passed as the strict ``json_schema`` output format
        model: OpenAI model to use
        client: Optional pre-configured client; one is created when omitted
        schema_name: Name attached to the schema in the request

    Returns:
        The response's ``output_text`` parsed with ``json.loads``

    Raises:
        Exception: Any error from the API call or JSON parsing is printed
            and then re-raised unchanged.
    """
    api_client = get_openai_client() if client is None else client

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]
    output_format = {
        "type": "json_schema",
        "name": schema_name,
        "schema": schema,
        "strict": True,
    }

    try:
        response = api_client.responses.create(
            model=model,
            input=messages,
            text={"format": output_format},
        )
        # The structured payload arrives as a JSON string.
        return json.loads(response.output_text)
    except Exception as e:
        print(f"Error in OpenAI structured query: {e}")
        raise
def rank_ingredients_openai(
    product: str,
    candidates: List[str],
    expanded_description: str = None,
    client=None,
    model: str = "gpt-4o-mini",
    max_results: int = 3,
    confidence_threshold: float = 0.5,
    debug: bool = False
) -> List[Tuple[str, float]]:
    """
    Rank candidate ingredients for a product using OpenAI.

    Args:
        product: Product name
        candidates: List of candidate ingredients
        expanded_description: Optional expanded product description
        client: Optional pre-configured client; one is created when omitted
        model: OpenAI model to use
        max_results: Maximum number of results to return
        confidence_threshold: Minimum relevance score a result must reach
        debug: Whether to print debug info

    Returns:
        List of (ingredient, confidence) tuples sorted by confidence, highest
        first, containing at most ``max_results`` entries that all score at
        least ``confidence_threshold``. Returns an empty list when there are
        no candidates or when the request or parsing fails (the error is
        printed).
    """
    if not candidates:
        return []
    if client is None:
        client = get_openai_client()
    if debug:
        print(f"Ranking for product: {product} with {len(candidates)} candidates")

    # Format prompt with expanded description if available
    prompt_parts = [f"Product: {product}"]
    if expanded_description:
        prompt_parts.append(f"Expanded description: {expanded_description}")
    prompt_parts.append(f"Potential ingredients: {', '.join(candidates)}")
    prompt = "\n\n".join(prompt_parts)

    # Define the ranking schema
    ranking_schema = {
        "type": "object",
        "properties": {
            "rankings": {
                "type": "array",
                "description": f"Only the top {max_results} most relevant ingredients with scores >= {confidence_threshold}",
                "items": {
                    "type": "object",
                    "properties": {
                        "ingredient": {
                            "type": "string",
                            "description": "The name of the ingredient"
                        },
                        "relevance_score": {
                            "type": "number",
                            "description": "Score between 0 and 1 indicating relevance"
                        },
                        "explanation": {
                            "type": "string",
                            "description": "Brief explanation for the matching"
                        }
                    },
                    "required": ["ingredient", "relevance_score", "explanation"],
                    "additionalProperties": False
                }
            }
        },
        "required": ["rankings"],
        "additionalProperties": False
    }

    system_message = "\n".join([
        "",
        "You are a product categorization expert. Your task is to match product descriptions to the most relevant categories from the PROVIDED LIST ONLY.",
        "CRITICAL RULES:",
        '1. You MUST ONLY select from the exact items listed in "Potential ingredients" - DO NOT create or invent new categories',
        "2. Do not combine items from the list or add any words to them",
        "3. Choose the items from the list that best match what the product IS or CONTAINS",
        "4. If none of the items perfectly match, choose the closest matches from the provided list",
        "For the rankings:",
        '- Select ONLY from the exact items in the "Potential ingredients" list',
        "- Assign relevance scores from 0.0 to 1.0",
        f"- Rank the top {max_results} matching ingredients.",
        "- Provide brief explanations for why each item is relevant",
        "- Do not suggest alternatives outside the provided list",
        "Aim to identify the specific product category a consumer would look for when shopping for this exact item.",
        f"Only include ingredients with relevance score >= {confidence_threshold}.",
        'Remember: Your ONLY options are the exact items listed in "Potential ingredients" - no additions, modifications, or combinations.',
        "",
    ])

    try:
        # Make the API call directly for more control
        response = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt}
            ],
            text={
                "format": {
                    "type": "json_schema",
                    "name": "ingredient_ranking",
                    "schema": ranking_schema,
                    "strict": True
                }
            }
        )
        # Parse the response
        result = json.loads(response.output_text)
        scored = [
            (item["ingredient"], float(item["relevance_score"]))
            for item in result["rankings"]
        ]
        # The prompt only *asks* the model to honour the limits; enforce them
        # here so the documented contract holds whatever the model returns.
        scored = [pair for pair in scored if pair[1] >= confidence_threshold]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        ingredients = scored[:max(max_results, 0)]
        if debug:
            print(f"Ranking results for {product}: {len(ingredients)} ingredients")
            if ingredients:
                print(f"Top match: {ingredients[0]}")
        return ingredients
    except Exception as e:
        print(f"Error ranking ingredients for '{product}': {e}")
        return []
def rank_categories_openai(
    product: str,
    categories: dict,
    expanded_description: str = None,
    client=None,
    model: str = "gpt-4o-mini",
    max_results: int = 5,
    confidence_threshold: float = 0.5,
    debug: bool = False
) -> List[Tuple[str, float]]:
    """
    Rank food categories for a product using OpenAI.

    Args:
        product: Product name
        categories: Dictionary keyed by category ID; only the keys are sent
            to the model
        expanded_description: Optional expanded product description
        client: Optional pre-configured client; one is created when omitted
        model: OpenAI model to use
        max_results: Maximum number of results to return
        confidence_threshold: Minimum relevance score a result must reach
        debug: Whether to print debug info (and a traceback on failure)

    Returns:
        List of (category, confidence) tuples sorted by confidence, highest
        first, containing at most ``max_results`` entries that all score at
        least ``confidence_threshold``. Returns an empty list when there are
        no categories or when the request or parsing fails (the error is
        printed).
    """
    if not categories:
        return []
    if client is None:
        client = get_openai_client()
    if debug:
        print(f"Category ranking for product: {product}")

    # Format categories for the prompt. Only the category ID is sent; entries
    # with a plain-string description are listed bare, all others as bullets.
    category_lines = []
    for category_id, category_data in categories.items():
        if isinstance(category_data, str):
            # f-string rather than "+" so non-string IDs cannot raise TypeError
            category_lines.append(f"{category_id}\n")
        else:
            category_lines.append(f"- {category_id}\n")
    categories_text = "".join(category_lines)

    # Format prompt with expanded description if available
    prompt = f"Product: {product}"
    if expanded_description:
        prompt += f"\n\nExpanded description: {expanded_description}"
    prompt += f"\n\nAvailable food categories:\n{categories_text}"

    # Define the ranking schema
    ranking_schema = {
        "type": "object",
        "properties": {
            "rankings": {
                "type": "array",
                "description": f"Only the top most relevant category with scores >= {confidence_threshold}",
                "items": {
                    "type": "object",
                    "properties": {
                        "reasoning": {
                            "type": "string",
                            "description": "Reasoning, , step by step, first weigh options, then consider the best match"
                        },
                        "category": {
                            "type": "string",
                            "description": "The name of the food category"
                        },
                        "relevance_score": {
                            "type": "number",
                            "description": "Score between 0 and 1 indicating relevance"
                        },
                    },
                    "required": ["category", "relevance_score", "reasoning"],
                    "additionalProperties": False
                }
            }
        },
        "required": ["rankings"],
        "additionalProperties": False
    }

    # The system prompt must name the list exactly as it appears in the user
    # prompt ("Available food categories"); pointing the model at a list
    # title that is not in the conversation undermines the "provided list
    # only" rules.
    system_message = "\n".join([
        "",
        "You are a product categorization expert. Your task is to match product descriptions to the most relevant categories from the PROVIDED LIST ONLY.",
        "CRITICAL RULES:",
        '1. You MUST ONLY select from the exact items listed in "Available food categories" - DO NOT create or invent new categories',
        "2. Do not combine items from the list or add any words to them",
        "3. Choose the items from the list that best match what the product IS or CONTAINS",
        "4. If none of the items perfectly match, choose the closest matches from the provided list",
        "For the rankings:",
        '- Select ONLY from the exact items in the "Available food categories" list',
        "- Assign relevance scores from 0.0 to 1.0",
        f"- Rank the top {max_results} matching categories.",
        "- Provide brief explanations for why each item is relevant",
        "- Do not suggest alternatives outside the provided list",
        "Aim to identify the specific product category a consumer would look for when shopping for this exact item.",
        f"Only include categories with relevance score >= {confidence_threshold}.",
        'Remember: Your ONLY options are the exact items listed in "Available food categories" - no additions, modifications, or combinations.',
        "",
    ])

    try:
        # Make the API call
        response = client.responses.create(
            model=model,
            input=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt}
            ],
            text={
                "format": {
                    "type": "json_schema",
                    "name": "category_ranking",
                    "schema": ranking_schema,
                    "strict": True
                }
            }
        )
        # Parse the response
        result = json.loads(response.output_text)
        scored = [
            (item["category"], float(item["relevance_score"]))
            for item in result["rankings"]
        ]
        # The prompt only *asks* the model to honour the limits; enforce them
        # here so the documented contract holds whatever the model returns.
        scored = [pair for pair in scored if pair[1] >= confidence_threshold]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        ranked_categories = scored[:max(max_results, 0)]
        if debug:
            print(f"Category results for {product}: {len(ranked_categories)} categories")
            if ranked_categories:
                print(f"Top match: {ranked_categories[0]}")
        return ranked_categories
    except Exception as e:
        print(f"Error categorizing {product}: {e}")
        if debug:
            import traceback
            traceback.print_exc()
        return []