Spaces:

eliago
/

product_ingredient_demo

Sleeping

App Files Files Community

product_ingredient_demo / ui_expanded_matching.py

esilver

Use streamlit

164730f 3 months ago

raw

history blame

13.2 kB

	# import gradio as gr # Removed Gradio import
	# from utils import SafeProgress # Removed SafeProgress import
	from embeddings import create_product_embeddings
	from similarity import compute_similarities
	from openai_expansion import expand_product_descriptions
	from ui_core import embeddings, parse_input, CATEGORY_EMBEDDINGS_PATH
	from ui_formatters import format_reranking_results_html
	from api_utils import get_openai_client, process_in_parallel, rank_ingredients_openai, rank_categories_openai
	from category_matching import load_categories, load_category_embeddings
	import json

	def categorize_products_with_openai_reranking(product_input, is_file=False, use_expansion=False,
	                                              embedding_top_n=20, top_n=10, confidence_threshold=0.5,
	                                              match_type="ingredients"):
	    """Categorize products using embedding retrieval followed by OpenAI reranking.

	    Steps performed, in order:
	      1. Parse ``product_input`` into product names via ``parse_input``.
	      2. Optionally expand each product name into a longer description.
	      3. Embed the products and compute similarities against either the
	         ingredient embeddings or the category embeddings.
	      4. For every product, send its top ``embedding_top_n`` candidates to the
	         OpenAI reranker (run in parallel over products).
	      5. Render the collected results as HTML.

	    Args:
	        product_input: Raw input handed unchanged to ``parse_input``.
	        is_file: Forwarded to ``parse_input`` together with ``product_input``.
	        use_expansion: When true, ``expand_product_descriptions`` is called and
	            its output is used both for embedding and as reranker context, and
	            is included as the per-product "explanation".
	        embedding_top_n: Number of leading similarity candidates per product
	            that are passed on to the reranker.
	        top_n: Passed to the reranker as ``max_results``.
	        confidence_threshold: Only forwarded to the HTML formatter; the
	            rerankers are always called with a threshold of 0.0 so filtering
	            happens at display time.
	        match_type: ``"ingredients"`` selects ingredient matching; any other
	            value takes the category-matching path.

	    Returns:
	        The HTML produced by ``format_reranking_results_html`` on success.
	        On failure, either the ``error`` value returned by ``parse_input`` or
	        an HTML ``<div>`` containing an error message.
	    """
	    product_names, error = parse_input(product_input, is_file)
	    if error:
	        return error

	    # Ingredient matching is impossible without the preloaded ingredient embeddings.
	    if match_type == "ingredients" and not embeddings:
	        return _error_html(
	            "Error: No ingredient embeddings loaded. Please check that the embeddings file exists and is properly formatted."
	        )

	    expanded_descriptions = {}
	    if use_expansion:
	        # Coerce a falsy result to {} so the later .get() lookups cannot fail.
	        expanded_descriptions = expand_product_descriptions(product_names) or {}

	    openai_client = get_openai_client()
	    rerank_model = "gpt-4o-mini"

	    # Texts actually sent to the embedding model; None means "embed the raw names".
	    embedding_texts = None
	    if use_expansion and expanded_descriptions:
	        embedding_texts = [expanded_descriptions.get(name, name) for name in product_names]

	    def describe(product):
	        """Return the text given to the reranker as the product's description."""
	        return expanded_descriptions.get(product, product) if use_expansion else product

	    if match_type == "ingredients":
	        product_embeddings = _embed_products(product_names, embedding_texts)
	        all_similarities = compute_similarities(embeddings, product_embeddings)

	        print(f"product_names: {product_names}")
	        # Prints an empty value when no expansion was applied (unchanged debug format).
	        print(f"products_for_embedding: {embedding_texts if embedding_texts is not None else ''}")

	        if not all_similarities:
	            return _error_html("Error: No similarities found. Please try different product names.")

	        def process_reranking(product):
	            """Rerank one product's ingredient candidates; returns (product, matches)."""
	            if product not in all_similarities:
	                return product, []

	            candidates = all_similarities[product][:embedding_top_n]
	            if not candidates:
	                return product, []

	            candidate_ingredients = [c[0] for c in candidates]
	            expanded_text = describe(product)

	            try:
	                # Threshold 0.0: keep every result, filtering is done at display time.
	                reranked_ingredients = rank_ingredients_openai(
	                    product=product,
	                    candidates=candidate_ingredients,
	                    expanded_description=expanded_text,
	                    client=openai_client,
	                    model=rerank_model,
	                    max_results=top_n,
	                    confidence_threshold=0.0,
	                    debug=True
	                )
	                return product, reranked_ingredients
	            except Exception as e:
	                print(f"Error reranking {product}: {e}")
	                # Best effort: fall back to the single top embedding match.
	                return product, candidates[:1]

	        final_results = process_in_parallel(
	            items=product_names,
	            processor_func=process_reranking,
	            # Clamp to >= 1: a worker count of 0 is rejected by standard thread pools.
	            max_workers=max(1, min(10, len(product_names)))
	        )

	    else:  # categories
	        # Loaded before embedding the products, so a missing file fails fast.
	        category_embeddings = load_category_embeddings()
	        if not category_embeddings:
	            return _error_html(
	                "Error: No category embeddings found. Please check that the embeddings file exists at data/category_embeddings.pickle."
	            )

	        product_embeddings = _embed_products(product_names, embedding_texts)
	        all_similarities = compute_similarities(category_embeddings, product_embeddings)

	        if not all_similarities:
	            return _error_html("Error: No category similarities found. Please try different product names.")

	        # Every category id that appears among any product's top candidates
	        # (no threshold filtering at this stage).
	        needed_category_ids = {
	            category_id
	            for similarities in all_similarities.values()
	            for category_id, _ in similarities[:embedding_top_n]
	        }
	        category_descriptions = _load_category_descriptions(needed_category_ids)

	        def process_category_matching(product):
	            """Rerank one product's category candidates; returns (product, matches)."""
	            if product not in all_similarities:
	                return product, []

	            candidates = all_similarities[product][:embedding_top_n]

	            print(f"candidates: {candidates}")

	            if not candidates:
	                return product, []

	            expanded_text = describe(product)

	            try:
	                # Offer the reranker only this product's own candidate categories.
	                filtered_categories = {
	                    cat_id: category_descriptions[cat_id]
	                    for cat_id, _ in candidates
	                    if cat_id in category_descriptions
	                }

	                # Threshold 0.0: keep every result, filtering is done at display time.
	                category_matches = rank_categories_openai(
	                    product=product,
	                    categories=filtered_categories,
	                    expanded_description=expanded_text,
	                    client=openai_client,
	                    model=rerank_model,
	                    max_results=top_n,
	                    confidence_threshold=0.0,
	                    debug=True
	                )

	                formatted_matches = [
	                    (category_id, category_descriptions.get(category_id, "Unknown category"), score)
	                    for category_id, score in category_matches
	                ]
	                return product, formatted_matches
	            except Exception as e:
	                print(f"Error matching {product} to categories: {e}")
	                return product, []

	        final_results = process_in_parallel(
	            items=product_names,
	            processor_func=process_category_matching,
	            # Clamp to >= 1: a worker count of 0 is rejected by standard thread pools.
	            max_workers=max(1, min(10, len(product_names)))
	        )

	    # Build one result dict per product; products without matches are kept.
	    formatted_results = []
	    for product, matches in final_results.items():
	        # The score is always the last element of a match tuple in both modes.
	        confidence = max(item[-1] for item in matches) if matches else 0

	        if match_type == "ingredients":
	            matching_items = [item for item, _ in matches]
	            item_scores = [score for _, score in matches]
	        else:  # categories
	            matching_items = []
	            item_scores = []
	            for cat_id, cat_desc, score in matches:
	                matching_items.append(f"{cat_id}: {cat_desc}" if cat_desc else f"{cat_id}")
	                item_scores.append(score)

	        formatted_results.append({
	            "product_name": product,
	            "confidence": confidence,
	            "matching_items": matching_items,
	            "item_scores": item_scores,
	            "explanation": expanded_descriptions.get(product, "") if use_expansion else ""
	        })

	    if not formatted_results:
	        return _error_html("No results found. Please check your input or try different products.")

	    return format_reranking_results_html(
	        results=formatted_results,
	        match_type=match_type,
	        show_scores=True,
	        include_explanation=use_expansion,
	        method="openai",
	        confidence_threshold=confidence_threshold
	    )


	def _error_html(message):
	    """Wrap ``message`` in the red, bold <div> returned for user-facing errors."""
	    return f"<div style='color: #d32f2f; font-weight: bold; padding: 20px;'>{message}</div>"


	def _embed_products(product_names, embedding_texts=None):
	    """Create product embeddings keyed by the original product names.

	    Args:
	        product_names: The original product names.
	        embedding_texts: Optional list, parallel to ``product_names``, of the
	            texts to embed instead of the names (e.g. expanded descriptions).
	            ``None`` embeds the names directly.

	    Returns:
	        A dict mapping original product name -> embedding. When
	        ``embedding_texts`` is given, products whose embedding cannot be found
	        in the result of ``create_product_embeddings`` are omitted.
	    """
	    if embedding_texts is None:
	        return create_product_embeddings(product_names)

	    raw_embeddings = create_product_embeddings(embedding_texts, original_products=product_names)

	    keyed_by_name = {}
	    for name, text in zip(product_names, embedding_texts):
	        if text in raw_embeddings:
	            keyed_by_name[name] = raw_embeddings[text]
	        elif name in raw_embeddings:
	            # NOTE(review): original_products is passed above, so the callee may
	            # already key its result by original name - confirm in embeddings.py.
	            # Without this fallback such products would be silently dropped.
	            keyed_by_name[name] = raw_embeddings[name]
	    return keyed_by_name


	def _load_category_descriptions(needed_category_ids, path="categories.json"):
	    """Load the description text of the requested category ids from a JSON file.

	    The file is read as a list of objects with "id" and "text" keys. Loading is
	    best effort: any error is printed and whatever was collected so far is
	    returned.

	    Args:
	        needed_category_ids: Collection of category ids to look up.
	        path: Location of the categories JSON file.

	    Returns:
	        A dict mapping category id -> description text (possibly empty).
	    """
	    descriptions = {}
	    if not needed_category_ids:
	        return descriptions
	    try:
	        # Explicit UTF-8 so decoding does not depend on the platform default.
	        with open(path, 'r', encoding="utf-8") as f:
	            categories_list = json.load(f)
	        for item in categories_list:
	            if item["id"] in needed_category_ids:
	                descriptions[item["id"]] = item["text"]
	    except Exception as e:
	        print(f"Error loading category descriptions: {e}")
	    return descriptions