Spaces:
Sleeping
Sleeping
# import gradio as gr # Removed Gradio import | |
# from utils import SafeProgress # Removed SafeProgress import | |
from category_matching import load_categories, hybrid_category_matching | |
from similarity import hybrid_ingredient_matching, compute_similarities | |
from ui_core import embeddings, parse_input | |
from ui_formatters import format_reranking_results_html | |
from openai_expansion import expand_product_descriptions | |
from api_utils import get_voyage_client | |
def categorize_products_with_voyage_reranking(product_input, is_file=False, use_expansion=False,
                                              embedding_top_n=20, final_top_n=5, confidence_threshold=0.5,
                                              match_type="categories"):
    """Match products to categories or ingredients and render the result as HTML.

    Args:
        product_input: Raw product input, handed to ``parse_input``.
        is_file: Forwarded unchanged to ``parse_input``.
        use_expansion: When true, ``expand_product_descriptions`` is called and
            its output is passed to the matcher and shown as the explanation.
        embedding_top_n: Candidate count for the matcher (coerced to ``int``).
        final_top_n: Matches kept per product (coerced to ``int``).
        confidence_threshold: Not applied while matching (the matchers always
            receive ``0.0``); forwarded to the HTML formatter instead.
        match_type: ``"categories"`` selects category matching; any other
            value takes the ingredient matching path.

    Returns:
        The HTML produced by ``format_reranking_results_html``; or the error
        value reported by ``parse_input``; or an HTML error snippet when no
        ingredient embeddings are loaded or the matcher returned nothing.
    """
    product_names, error = parse_input(product_input, is_file)
    if error:
        return error

    # Expansion is optional; an empty mapping stands in when it is disabled.
    expanded_descriptions = expand_product_descriptions(product_names) if use_expansion else {}

    # Pick the matcher and its reference data once, then share a single call.
    if match_type == "categories":
        matcher = hybrid_category_matching
        reference = load_categories()
    else:
        # Ingredient matching is impossible without the preloaded embeddings.
        if not embeddings:
            return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted.</div>"
        matcher = hybrid_ingredient_matching
        reference = embeddings

    match_results = matcher(
        product_names,
        reference,
        embedding_top_n=int(embedding_top_n),
        final_top_n=int(final_top_n),
        confidence_threshold=0.0,  # thresholding is deferred to the formatter
        expanded_descriptions=expanded_descriptions if use_expansion else None,
    )

    # Convert the matcher output into the unified shape the formatter expects.
    # Every product is kept, including those without any match.
    wants_ingredients = match_type == "ingredients"
    rows = []
    for product, matches in match_results.items():
        confidence = max(m[-1] for m in matches) if matches else 0
        explanation = expanded_descriptions.get(product, "") if use_expansion else ""

        if wants_ingredients:
            labels = [m[0] for m in matches]
            scores = [m[1] for m in matches]
        else:
            labels, scores = [], []
            for match in matches:
                if len(match) < 2:
                    continue
                # A middle element is only treated as descriptive text when
                # the tuple has more than two fields and it is not numeric.
                text = match[1] if len(match) > 2 else ""
                if isinstance(text, (int, float)):
                    text = ""
                labels.append(f"{match[0]}: {text}" if text else f"{match[0]}")
                scores.append(match[-1])

        rows.append({
            "product_name": product,
            "confidence": confidence,
            "matching_items": labels,
            "item_scores": scores,
            "explanation": explanation,
        })

    if not rows:
        return "<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>No results found. Please check your input or try different products.</div>"

    print(f"FORMAT RESULTS: {rows}")

    return format_reranking_results_html(
        results=rows,
        match_type=match_type,
        show_scores=True,
        include_explanation=use_expansion,
        method="voyage",
        confidence_threshold=confidence_threshold,
    )
# Update the function in ui_hybrid_matching.py | |
def hybrid_ingredient_matching_voyage(products, ingredients_dict,
                                      embedding_top_n=20, final_top_n=5,
                                      confidence_threshold=0.5,
                                      expanded_descriptions=None,
                                      ):
    """Match products to ingredients: embedding shortlist, then Voyage rerank.

    Args:
        products: Product names; each one becomes a key of the returned dict.
        ingredients_dict: Ingredient data passed to ``compute_similarities``.
        embedding_top_n: Number of embedding candidates sent to the reranker.
        final_top_n: Maximum number of reranked matches kept per product.
        confidence_threshold: Accepted for signature compatibility only; no
            threshold is applied here, filtering is left to the caller.
        expanded_descriptions: Optional mapping of product name to expanded
            text. When given, it is used for the embeddings and for the
            rerank query of the products it contains.

    Returns:
        dict mapping each product to a list of ``(ingredient, score)`` tuples
        from the reranker, truncated to ``final_top_n``. Products without
        embedding candidates map to ``[]``. If reranking fails for a product,
        it maps to its first embedding candidate only.
    """
    from embeddings import create_product_embeddings

    # Stage 1: shortlist candidates with embeddings.
    if expanded_descriptions:
        products_for_embedding = [expanded_descriptions.get(name, name) for name in products]
        temp_embeddings = create_product_embeddings(products_for_embedding, original_products=products)
        # Re-key by the original product name so later lookups stay consistent.
        product_embeddings = {
            name: temp_embeddings[text]
            for name, text in zip(products, products_for_embedding)
            if text in temp_embeddings
        }
    else:
        product_embeddings = create_product_embeddings(products)

    similarities = compute_similarities(ingredients_dict, product_embeddings)
    embedding_results = {
        product: ranked[:embedding_top_n] for product, ranked in similarities.items()
    }

    voyage_client = get_voyage_client()

    # Stage 2: rerank each product's shortlist with Voyage.
    final_results = {}
    for product in products:
        candidates = embedding_results.get(product)
        if not candidates:
            final_results[product] = []
            continue

        documents = [c[0] for c in candidates]

        try:
            product_text = product
            if expanded_descriptions and product in expanded_descriptions:
                product_text = expanded_descriptions[product]

            reranked = voyage_client.rerank(
                query=f"Which ingredient best matches: {product_text}",
                documents=documents,
                model="rerank-2"
            )

            # The official voyageai client returns an object exposing
            # `.results` whose items have `.document` / `.relevance_score`;
            # dict-shaped payloads (e.g. from a wrapper) are accepted too.
            # Subscripting the SDK object would raise and silently force the
            # fallback below on every call.
            results = reranked["results"] if isinstance(reranked, dict) else reranked.results
            voyage_results = []
            for result in results:
                if isinstance(result, dict):
                    voyage_results.append((result["document"], result["relevance_score"]))
                else:
                    voyage_results.append((result.document, result.relevance_score))

            # No threshold filtering here; only the top-N cut is applied.
            final_results[product] = voyage_results[:final_top_n]
        except Exception as e:
            # Best effort: a failed rerank must not abort the whole batch.
            print(f"Error during Voyage reranking for '{product}': {e}")
            # Fall back to the first embedding candidate
            # (presumably the highest-scoring one -- confirm ordering of
            # compute_similarities).
            final_results[product] = candidates[:1]

    return final_results
# Add this function to ui_hybrid_matching.py | |
def hybrid_category_matching_voyage(products, categories_dict,
                                    embedding_top_n=20, final_top_n=5,
                                    confidence_threshold=0.5,
                                    expanded_descriptions=None,
                                    ):
    """Match products to categories: embedding shortlist, then Voyage rerank.

    Args:
        products: Product names; each one becomes a key of the returned dict.
        categories_dict: Category data passed to ``compute_similarities``.
        embedding_top_n: Number of embedding candidates sent to the reranker.
        final_top_n: Maximum number of reranked matches kept per product.
        confidence_threshold: Accepted for signature compatibility only; no
            threshold is applied here, filtering is left to the caller.
        expanded_descriptions: Optional mapping of product name to expanded
            text. When given, it is used for the embeddings and for the
            rerank query of the products it contains.

    Returns:
        dict mapping each product to a list of ``(category, score)`` tuples
        from the reranker, truncated to ``final_top_n``. Products without
        embedding candidates map to ``[]``. If reranking fails for a product,
        it maps to its first embedding candidate only.
    """
    from embeddings import create_product_embeddings

    # Stage 1: shortlist candidate categories with embeddings.
    if expanded_descriptions:
        products_for_embedding = [expanded_descriptions.get(name, name) for name in products]
        temp_embeddings = create_product_embeddings(products_for_embedding, original_products=products)
        # Re-key by the original product name so later lookups stay consistent.
        product_embeddings = {
            name: temp_embeddings[text]
            for name, text in zip(products, products_for_embedding)
            if text in temp_embeddings
        }
    else:
        product_embeddings = create_product_embeddings(products)

    # compute_similarities comes from the file-level import, as in the
    # ingredient variant of this function.
    similarities = compute_similarities(categories_dict, product_embeddings)
    embedding_results = {
        product: ranked[:embedding_top_n] for product, ranked in similarities.items()
    }

    voyage_client = get_voyage_client()

    # Stage 2: rerank each product's shortlist with Voyage.
    final_results = {}
    for product in products:
        candidates = embedding_results.get(product)
        if not candidates:
            final_results[product] = []
            continue

        documents = [c[0] for c in candidates]

        try:
            product_text = product
            if expanded_descriptions and product in expanded_descriptions:
                product_text = expanded_descriptions[product]

            reranked = voyage_client.rerank(
                query=f"Which food category best matches: {product_text}",
                documents=documents,
                model="rerank-2"
            )

            # The official voyageai client returns an object exposing
            # `.results` whose items have `.document` / `.relevance_score`;
            # dict-shaped payloads (e.g. from a wrapper) are accepted too.
            # Subscripting the SDK object would raise and silently force the
            # fallback below on every call.
            results = reranked["results"] if isinstance(reranked, dict) else reranked.results
            voyage_results = []
            for result in results:
                if isinstance(result, dict):
                    voyage_results.append((result["document"], result["relevance_score"]))
                else:
                    voyage_results.append((result.document, result.relevance_score))

            # No threshold filtering here; only the top-N cut is applied.
            final_results[product] = voyage_results[:final_top_n]
        except Exception as e:
            # Best effort: a failed rerank must not abort the whole batch.
            print(f"Error during Voyage category reranking for '{product}': {e}")
            # Fall back to the first embedding candidate
            # (presumably the highest-scoring one -- confirm ordering of
            # compute_similarities).
            final_results[product] = candidates[:1]

    return final_results