import numpy as np
from typing import Dict, List, Tuple, Any
import json
import voyageai
from openai import OpenAI
from api_utils import get_openai_client

def _is_empty_vector(vector):
    """Return True when *vector* is a list, tuple or ndarray with no elements."""
    if isinstance(vector, np.ndarray):
        return vector.size == 0
    return isinstance(vector, (list, tuple)) and len(vector) == 0


def _collect_embeddings(entries):
    """Split a name -> embedding mapping into parallel lists of names and vectors.

    Each value may be a bare vector (list or ndarray) or a dict holding the
    vector under the "embedding" key. Entries that are None, of any other
    type, or whose vector is None or empty are skipped.
    """
    names = []
    vectors = []
    for name, value in entries.items():
        if isinstance(value, dict):
            vector = value.get("embedding")
        elif isinstance(value, (list, np.ndarray)):
            vector = value
        else:
            continue
        if vector is None or _is_empty_vector(vector):
            continue
        names.append(name)
        vectors.append(vector)
    return names, vectors


def _unit_rows(vectors, label):
    """Stack *vectors* into a float32 matrix and scale every row to unit length."""
    matrix = np.array(vectors, dtype=np.float32)
    if matrix.ndim == 1:
        # A 1-D result means every entry was a scalar; keep one row per entry
        # so that row indices stay aligned with the collected names.
        print(f"Warning: {label} embeddings have only 1 dimension, reshaping. Shape: {matrix.shape}")
        matrix = matrix.reshape(len(vectors), -1)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    # Avoid division by zero: an all-zero row stays all-zero after scaling
    norms = np.where(norms == 0, 1e-10, norms)
    return matrix / norms


def compute_similarities(ingredients_dict, products_dict):
    """
    Compute cosine similarities between ingredient embeddings and product embeddings

    Values in both dictionaries may be either a bare vector (list or numpy
    array) or a dictionary that stores the vector under the "embedding" key.
    Entries without a usable vector (None, empty, or of another type) are
    skipped on both sides.

    Args:
        ingredients_dict: Dictionary of ingredient names to embeddings
        products_dict: Dictionary of product names to embeddings

    Returns:
        Dictionary mapping each product name to a list of
        (ingredient_name, score) tuples sorted by score, highest first.
        An empty dictionary is returned (after printing a warning) when
        either input has no usable embeddings or when product and ingredient
        embeddings have different dimensions.

    Raises:
        ValueError: If the vectors inside one input cannot be stacked into a
            single numeric array (for example, vectors of different lengths).
    """
    # Validate inputs
    if not ingredients_dict:
        print("Warning: ingredients_dict is empty")
        return {}

    if not products_dict:
        print("Warning: products_dict is empty")
        return {}

    ingredient_names, ingredient_vectors = _collect_embeddings(ingredients_dict)
    if not ingredient_names:
        print("Warning: No valid ingredient embeddings found")
        return {}
    normalized_ingredients = _unit_rows(ingredient_vectors, "Ingredient")

    product_names, product_vectors = _collect_embeddings(products_dict)
    if not product_names:
        print("Warning: No valid product embeddings found")
        return {}
    normalized_products = _unit_rows(product_vectors, "Product")

    # Both matrices are at least 2-D here, so axis 1 is the embedding dimension
    product_dim = normalized_products.shape[1]
    ingredient_dim = normalized_ingredients.shape[1]
    if product_dim != ingredient_dim:
        print(f"Warning: Dimension mismatch between product embeddings ({product_dim}) and ingredient embeddings ({ingredient_dim})")
        # Return empty results if dimensions don't match
        return {}

    # Rows are unit length, so the dot product is the cosine similarity
    similarity_matrix = np.dot(normalized_products, normalized_ingredients.T)

    results = {}
    for product_name, row in zip(product_names, similarity_matrix):
        scored = [(name, float(score)) for name, score in zip(ingredient_names, row)]
        # Stable sort: ingredients with equal scores keep their input order
        scored.sort(key=lambda pair: pair[1], reverse=True)
        results[product_name] = scored

    return results

def _ingredient_selection_format():
    """Return the ``text`` option that constrains the reply to the ``ingredient_selection`` JSON schema."""
    return {
        "format": {
            "type": "json_schema",
            "name": "ingredient_selection",
            "schema": {
                "type": "object",
                "properties": {
                    "best_match": {
                        "type": "object",
                        "properties": {
                            "ingredient": {
                                "type": "string",
                                "description": "The name of the best matching ingredient"
                            },
                            "explanation": {
                                "type": "string",
                                "description": "Brief explanation for the matching"
                            },
                            "relevance_score": {
                                "type": "number",
                                "description": "Score between 0 and 1 indicating relevance"
                            }
                        },
                        "required": ["ingredient", "relevance_score", "explanation"],
                        "additionalProperties": False
                    }
                },
                "required": ["best_match"],
                "additionalProperties": False
            },
            "strict": True
        }
    }


def _match_candidate(name, candidates):
    """Map the model's answer onto a canonical candidate name, or return None if it is not a candidate."""
    if name in candidates:
        return name
    wanted = name.strip().casefold()
    for candidate in candidates:
        if candidate.strip().casefold() == wanted:
            return candidate
    return None


def hybrid_ingredient_matching(products: List[str], ingredients_dict: Dict[str, Any],
                            embedding_top_n: int = 20, final_top_n: int = 5,
                            confidence_threshold: float = 0.5,
                            expanded_descriptions: Dict[str, str] = None,
                            progress=None) -> Dict[str, List[Tuple]]:
    """
    Two-stage matching: first use embeddings to find candidate ingredients, then apply re-ranking

    Stage 1 embeds the products and keeps the ``embedding_top_n`` most similar
    ingredients per product. Stage 2 asks an OpenAI model to pick the single
    best ingredient among those candidates. The model's answer is only
    accepted when it names one of the candidates (compared exactly, then
    ignoring case and surrounding whitespace); the canonical candidate name
    is what gets returned.

    Args:
        products: List of product names to categorize
        ingredients_dict: Dictionary of ingredient names to embeddings
        embedding_top_n: Number of top ingredients to retrieve using embeddings
        final_top_n: Accepted for interface compatibility; this function never
            reads it, since the re-ranker selects a single ingredient
        confidence_threshold: Minimum relevance score the model must report
            for its pick to be kept
        expanded_descriptions: Optional dict mapping products to their expanded
            descriptions; when present for a product, the description is sent
            to the model instead of the product name
        progress: Optional progress tracking object

    Returns:
        Dictionary mapping every product to a list holding at most one
        (ingredient, score) tuple. The list is empty when the product has no
        embedding candidates or the model's score is below
        ``confidence_threshold``. If re-ranking fails (request or parsing
        error, or an answer outside the candidate list), the top embedding
        candidate is returned with its cosine similarity as the score; that
        fallback is not compared against ``confidence_threshold``.
    """
    from utils import SafeProgress
    from embeddings import create_product_embeddings

    progress_tracker = SafeProgress(progress, desc="Hybrid ingredient matching")
    progress_tracker(0.1, desc="Stage 1: Finding candidates with embeddings")

    # Stage 1: embed the products and rank ingredients by cosine similarity
    product_embeddings = create_product_embeddings(products, progress=progress_tracker)
    similarities = compute_similarities(ingredients_dict, product_embeddings)

    # Keep only the top N candidates per product
    embedding_results = {
        product: ranked[:embedding_top_n]
        for product, ranked in similarities.items()
    }

    progress_tracker(0.4, desc="Stage 2: Re-ranking candidates")

    # Initialize OpenAI client using the centralized function
    openai_client = get_openai_client()

    # Stage 2: Re-rank the candidates for each product
    final_results = {}

    for i, product in enumerate(products):
        progress_tracker((0.4 + 0.5 * i / len(products)), desc=f"Re-ranking: {product}")

        # Products without any embedding candidates get an empty result
        candidates = embedding_results.get(product)
        if not candidates:
            final_results[product] = []
            continue

        # Extract just the ingredient names for re-ranking
        candidate_ingredients = [c[0] for c in candidates]

        try:
            # Use expanded description if available
            product_text = product
            if expanded_descriptions and product in expanded_descriptions:
                product_text = expanded_descriptions[product]

            # Apply re-ranking using OpenAI's structured output
            response = openai_client.responses.create(
                model="gpt-4o-mini",
                input=[
                    {"role": "system", "content": "You are a food ingredient matching expert. Select the single best ingredient that matches the given product."},
                    {"role": "user", "content": f"Product: {product_text}\n\nPotential ingredients: {', '.join(candidate_ingredients)}"}
                ],
                text=_ingredient_selection_format()
            )

            # Parse the response
            best_match = json.loads(response.output_text)["best_match"]

            # Only include the result if it meets the confidence threshold
            if best_match["relevance_score"] >= confidence_threshold:
                # The schema cannot restrict the name to the candidate list,
                # so an off-list answer is treated as a re-ranking failure.
                chosen = _match_candidate(best_match["ingredient"], candidate_ingredients)
                if chosen is None:
                    raise ValueError(
                        f"model returned an ingredient outside the candidate list: {best_match['ingredient']!r}"
                    )
                final_results[product] = [(chosen, best_match["relevance_score"])]
            else:
                final_results[product] = []

        except Exception as e:
            # Deliberate best-effort boundary: any failure in the re-ranking
            # step degrades to the embedding ranking instead of aborting the batch
            print(f"Error during OpenAI re-ranking for '{product}': {e}")
            # Fall back to embedding results if re-ranking fails
            final_results[product] = candidates[:1]  # Select the top embedding result as fallback

    progress_tracker(1.0, desc="Hybrid ingredient matching complete")
    return final_results