"""
Preprocessing utilities for polymer classification app.
Adapted from the original scripts/preprocess_dataset.py for Hugging Face Spaces deployment.
Supports both Raman and FTIR spectroscopy modalities.
"""

from __future__ import annotations
import numpy as np
from numpy.typing import DTypeLike
from scipy.interpolate import interp1d
from scipy.signal import savgol_filter
from typing import Tuple, Literal, Optional

# Default number of points a spectrum is resampled to.
# Frozen default per PREPROCESSING_BASELINE.
TARGET_LENGTH = 500

# Wavenumber windows (cm⁻¹) used when validating a spectrum against a modality.
_RAMAN_RANGE = (200, 4000)  # Typical Raman range
_FTIR_RANGE = (400, 4000)  # FTIR wavenumber range
MODALITY_RANGES = {"raman": _RAMAN_RANGE, "ftir": _FTIR_RANGE}

# Default preprocessing settings, one dict per modality.
_RAMAN_PARAMS = {
    "baseline_degree": 2,
    "smooth_window": 11,
    "smooth_polyorder": 2,
    "cosmic_ray_removal": False,
}
_FTIR_PARAMS = {
    "baseline_degree": 2,
    # Slightly larger smoothing window than Raman.
    "smooth_window": 13,
    "smooth_polyorder": 2,
    "cosmic_ray_removal": False,
    # Placeholder for future implementation; disabled by default.
    "atmospheric_correction": False,
}
MODALITY_PARAMS = {"raman": _RAMAN_PARAMS, "ftir": _FTIR_PARAMS}


def _ensure_1d_equal(x: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Coerce *x* and *y* to float arrays and check they form a valid 1D pair.

    Returns:
        The two inputs converted to float NumPy arrays.

    Raises:
        ValueError: If either array is not one-dimensional, the sizes differ,
            or fewer than two points are supplied.
    """
    xs = np.asarray(x, dtype=float)
    ys = np.asarray(y, dtype=float)
    both_1d = xs.ndim == 1 and ys.ndim == 1
    if both_1d and xs.size == ys.size and xs.size >= 2:
        return xs, ys
    raise ValueError("x and y must be 1D arrays of equal length >= 2")


def resample_spectrum(
    x: np.ndarray, y: np.ndarray, target_len: int = TARGET_LENGTH
) -> tuple[np.ndarray, np.ndarray]:
    """Resample a spectrum onto an evenly spaced axis by linear interpolation.

    The input points are first sorted by ascending x, so the axis may be
    supplied in any order. The new axis spans the smallest to the largest
    input x value.

    Args:
        x: Sample positions.
        y: Values at those positions; same length as ``x``.
        target_len: Number of points on the new axis.

    Returns:
        Tuple of (new uniform axis, interpolated values).

    Raises:
        ValueError: If ``x`` and ``y`` are not 1D arrays of equal length >= 2.
    """
    x_arr, y_arr = _ensure_1d_equal(x, y)

    ascending = np.argsort(x_arr)
    x_arr = x_arr[ascending]
    y_arr = y_arr[ascending]

    grid = np.linspace(x_arr[0], x_arr[-1], int(target_len))
    interpolate = interp1d(x_arr, y_arr, kind="linear", assume_sorted=True)
    return grid, interpolate(grid)


def remove_baseline(y: np.ndarray, degree: int = 2) -> np.ndarray:
    """Subtract a least-squares polynomial baseline from a signal.

    The polynomial is fitted against the sample index (0, 1, 2, ...), not
    against the wavenumber axis.

    Args:
        y: Signal values.
        degree: Degree of the fitted polynomial (default 2).

    Returns:
        Float array holding ``y`` minus the fitted polynomial.
    """
    signal = np.asarray(y, dtype=float)
    positions = np.arange(signal.size, dtype=float)
    fit = np.polyfit(positions, signal, deg=int(degree))
    return signal - np.polyval(fit, positions)


def smooth_spectrum(
    y: np.ndarray, window_length: int = 11, polyorder: int = 2
) -> np.ndarray:
    """Savitzky-Golay smoothing with safe/odd window enforcement.

    The requested window is adjusted so the filter always receives valid
    arguments:

    * an even window is bumped up to the next odd value;
    * the window is raised to the smallest odd value greater than
      ``polyorder``;
    * the window is capped at the largest odd value that fits in the signal,
      because ``mode="interp"`` rejects windows longer than the data.

    Args:
        y: Signal values.
        window_length: Requested filter window, in samples.
        polyorder: Order of the local polynomial fit.

    Returns:
        The smoothed signal as a float array. If the signal is too short for
        even the minimum window required by ``polyorder``, an unsmoothed
        float copy of the input is returned instead.
    """
    y = np.asarray(y, dtype=float)
    window_length = int(window_length)
    polyorder = int(polyorder)
    n_points = y.shape[-1] if y.ndim else 0

    # === window must be odd and >= polyorder+1 ===
    if window_length % 2 == 0:
        window_length += 1
    min_win = polyorder + 1
    if min_win % 2 == 0:
        min_win += 1
    window_length = max(window_length, min_win)

    # === window must also fit inside the signal (required by mode="interp") ===
    if window_length > n_points:
        window_length = n_points if n_points % 2 == 1 else n_points - 1

    # Too few samples to fit a polynomial of this order: leave the data as is.
    if window_length < min_win:
        return y.copy()

    return savgol_filter(
        y, window_length=window_length, polyorder=polyorder, mode="interp"
    )


def normalize_spectrum(y: np.ndarray) -> np.ndarray:
    """Rescale a signal linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        y: Signal values.

    Returns:
        Float array of the rescaled signal. When the spread between maximum
        and minimum is (numerically) zero, an all-zero array of the same
        shape is returned to avoid dividing by zero.
    """
    arr = np.asarray(y, dtype=float)
    lo = float(arr.min())
    hi = float(arr.max())
    span = hi - lo
    if not np.isclose(span, 0.0):
        return (arr - lo) / span
    # Constant signal: nothing to scale.
    return np.zeros_like(arr)


def validate_spectrum_range(x: np.ndarray, modality: str = "raman") -> bool:
    """Validate that spectrum wavenumbers are within expected range for modality.

    Args:
        x: Wavenumber values (cm⁻¹). Any numeric array-like is accepted,
            including plain lists and tuples.
        modality: Key into ``MODALITY_RANGES``.

    Returns:
        True when at least 70% of the points lie inside the modality's range
        (bounds inclusive), False otherwise.

    Raises:
        ValueError: If ``modality`` is unknown or ``x`` is empty.
    """
    if modality not in MODALITY_RANGES:
        raise ValueError(
            f"Unknown modality '{modality}'. Supported: {list(MODALITY_RANGES.keys())}"
        )

    # Coerce first so the element-wise comparisons below also work for
    # lists/tuples, not only for ndarrays.
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        raise ValueError("x must contain at least one wavenumber value")

    min_range, max_range = MODALITY_RANGES[modality]

    # Check if majority of data points are within range
    in_range = np.sum((x >= min_range) & (x <= max_range))

    return bool((in_range / x.size) >= 0.7)  # At least 70% should be in range


def validate_spectrum_modality(
    x_data: np.ndarray, y_data: np.ndarray, selected_modality: str
) -> Tuple[bool, list[str]]:
    """
    Validate that spectrum characteristics match the selected modality.

    Only the wavenumber axis is inspected; problems are reported as
    human-readable messages rather than raised.

    Args:
        x_data: Wavenumber array (cm⁻¹)
        y_data: Intensity array (accepted for interface symmetry; its values
            are not currently inspected)
        selected_modality: Selected modality ('raman' or 'ftir')

    Returns:
        Tuple of (is_valid, list_of_issues)
    """
    issues: list[str] = []

    if selected_modality not in MODALITY_RANGES:
        issues.append(f"Unknown modality: {selected_modality}")
        return False, issues

    x_data = np.asarray(x_data, dtype=float)

    # NaN/inf would make every comparison below False and let a broken axis
    # pass as valid, so run the range checks on the finite values only.
    finite_x = x_data[np.isfinite(x_data)]
    if finite_x.size == 0:
        issues.append("Spectrum contains no finite wavenumber values")
        return False, issues
    if finite_x.size != x_data.size:
        issues.append("Wavenumber axis contains non-finite values (NaN or inf)")

    expected_min, expected_max = MODALITY_RANGES[selected_modality]
    actual_min, actual_max = np.min(finite_x), np.max(finite_x)

    # Check wavenumber range
    if actual_min < expected_min * 0.8:  # Allow 20% tolerance
        issues.append(
            f"Minimum wavenumber ({actual_min:.0f} cm⁻¹) is below typical {selected_modality.upper()} range (>{expected_min} cm⁻¹)"
        )

    if actual_max > expected_max * 1.2:  # Allow 20% tolerance
        issues.append(
            f"Maximum wavenumber ({actual_max:.0f} cm⁻¹) is above typical {selected_modality.upper()} range (<{expected_max} cm⁻¹)"
        )

    # Check for reasonable data range coverage
    data_range = actual_max - actual_min
    expected_range = expected_max - expected_min
    if data_range < expected_range * 0.3:  # Should cover at least 30% of expected range
        issues.append(
            f"Data range ({data_range:.0f} cm⁻¹) seems narrow for {selected_modality.upper()} spectroscopy"
        )

    # FTIR-specific check: the axis should reach down into the fingerprint region
    if selected_modality == "ftir" and actual_min > 1000:
        issues.append(
            "FTIR data should typically include fingerprint region (400-1500 cm⁻¹)"
        )

    # Raman-specific check: the axis should extend past 1000 cm⁻¹
    if selected_modality == "raman" and actual_max < 1000:
        issues.append(
            "Raman data typically extends to higher wavenumbers (>1000 cm⁻¹)"
        )

    return len(issues) == 0, issues


def preprocess_spectrum(
    x: np.ndarray,
    y: np.ndarray,
    *,
    target_len: int = TARGET_LENGTH,
    modality: str = "raman",  # New parameter for modality-specific processing
    do_baseline: bool = True,
    degree: int | None = None,  # Will use modality default if None
    do_smooth: bool = True,
    window_length: int | None = None,  # Will use modality default if None
    polyorder: int | None = None,  # Will use modality default if None
    do_normalize: bool = True,
    out_dtype: DTypeLike = np.float32,
    validate_range: bool = True,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Modality-aware preprocessing: resample -> baseline -> smooth -> normalize

    Args:
        x, y: Input spectrum data (array-likes of equal length >= 2)
        target_len: Target length for resampling
        modality: 'raman' or 'ftir' for modality-specific processing
        do_baseline: Enable baseline correction
        degree: Polynomial degree for baseline (uses modality default if None)
        do_smooth: Enable smoothing
        window_length: Smoothing window length (uses modality default if None)
        polyorder: Polynomial order for smoothing (uses modality default if None)
        do_normalize: Enable normalization
        out_dtype: Output data type (applied to both returned arrays)
        validate_range: Check if wavenumbers are in expected range for modality;
            a failed check only prints a warning, processing continues

    Returns:
        Tuple of (resampled_x, processed_y)

    Raises:
        ValueError: If ``modality`` is not supported, or if ``x`` and ``y``
            are not 1D arrays of equal length >= 2.
    """
    # Validate modality
    if modality not in MODALITY_PARAMS:
        raise ValueError(
            f"Unsupported modality '{modality}'. Supported: {list(MODALITY_PARAMS.keys())}"
        )

    # Get modality-specific parameters
    modality_config = MODALITY_PARAMS[modality]

    # Use modality defaults if parameters not specified
    if degree is None:
        degree = modality_config["baseline_degree"]
    if window_length is None:
        window_length = modality_config["smooth_window"]
    if polyorder is None:
        polyorder = modality_config["smooth_polyorder"]

    # Coerce and shape-check the inputs before anything touches them, so that
    # list/tuple input behaves the same with and without range validation and
    # malformed input fails with one clear error.
    x, y = _ensure_1d_equal(x, y)

    # Validate spectrum range if requested
    if validate_range and not validate_spectrum_range(x, modality):
        print(
            f"Warning: Spectrum wavenumbers may not be optimal for {modality.upper()} analysis"
        )

    # Standard preprocessing pipeline
    x_rs, y_rs = resample_spectrum(x, y, target_len=target_len)

    if do_baseline:
        y_rs = remove_baseline(y_rs, degree=degree)

    if do_smooth:
        y_rs = smooth_spectrum(y_rs, window_length=window_length, polyorder=polyorder)

    # FTIR-specific processing. Both steps are driven by the modality config
    # and only run when the corresponding key is present and truthy there.
    if modality == "ftir":
        if modality_config.get("atmospheric_correction", False):
            y_rs = remove_atmospheric_interference(y_rs)
        if modality_config.get("water_correction", False):
            y_rs = remove_water_vapor_bands(y_rs, x_rs)

    if do_normalize:
        y_rs = normalize_spectrum(y_rs)

    # === Coerce to a real dtype to satisfy static checkers & runtime ===
    out_dt = np.dtype(out_dtype)
    return x_rs.astype(out_dt, copy=False), y_rs.astype(out_dt, copy=False)


def remove_atmospheric_interference(y: np.ndarray) -> np.ndarray:
    """Dampen sharp, narrow lines such as atmospheric CO2/H2O in FTIR spectra.

    This is a basic implementation: the signal is blended with a 5-point
    median-filtered copy of itself (70% original, 30% filtered). A rigorous
    correction would subtract reference spectra instead.

    Args:
        y: Signal values.

    Returns:
        Float array with the blended signal.
    """
    from scipy.signal import medfilt

    original_weight = 0.7  # share of the unfiltered signal kept in the blend

    signal = np.asarray(y, dtype=float)
    # The median filter suppresses narrow spikes; mixing it back with the
    # original keeps most of the genuine peak structure.
    despiked = medfilt(signal, kernel_size=5)
    return original_weight * signal + (1 - original_weight) * despiked


def remove_water_vapor_bands(y: np.ndarray, x: np.ndarray) -> np.ndarray:
    """Remove water vapor interference bands in FTIR spectra.

    Every point whose wavenumber falls inside a water-vapor region
    (3500-3800 and 1300-1800 cm⁻¹, bounds inclusive) is replaced by the
    straight line joining the two samples immediately outside the region.
    The line is evaluated at each point's actual x position, so the result
    is correct for non-uniform spacing and for ascending or descending axes.

    A region is left untouched when it holds two or fewer samples, or when it
    touches either end of the array (no outside neighbour to anchor to).

    Args:
        y: Intensity values.
        x: Wavenumber axis matching ``y``; expected to be monotonic so that
            each region maps to one contiguous run of samples.

    Returns:
        A corrected float copy of ``y``; the input is not modified.
    """
    y = np.asarray(y, dtype=float)
    x = np.asarray(x, dtype=float)

    # Common water vapor regions in FTIR (cm⁻¹)
    water_regions = [(3500, 3800), (1300, 1800)]

    corrected_y = y.copy()

    for low, high in water_regions:
        mask = (x >= low) & (x <= high)
        indices = np.where(mask)[0]
        if len(indices) <= 2:
            continue

        start_idx, end_idx = indices[0], indices[-1]
        if start_idx <= 0 or end_idx >= len(y) - 1:
            continue  # region touches an edge: no anchor point on that side

        # Anchor points: the samples just outside the region on either side.
        x0, x1 = x[start_idx - 1], x[end_idx + 1]
        y0, y1 = y[start_idx - 1], y[end_idx + 1]
        if x0 == x1:
            continue  # degenerate axis, interpolation is undefined

        # Interpolate strictly *between* the anchors instead of copying the
        # anchor values onto the first/last in-region samples.
        fraction = (x[mask] - x0) / (x1 - x0)
        corrected_y[mask] = y0 + fraction * (y1 - y0)

    return corrected_y


def apply_ftir_specific_processing(
    x: np.ndarray,
    y: np.ndarray,
    atmospheric_correction: bool = False,
    water_correction: bool = False,
) -> tuple[np.ndarray, np.ndarray]:
    """Run the optional FTIR-only correction steps on a spectrum.

    Args:
        x: Wavenumber axis; returned unchanged.
        y: Intensity values; a copy is processed, the input is left alone.
        atmospheric_correction: Apply ``remove_atmospheric_interference``.
        water_correction: Apply ``remove_water_vapor_bands`` (runs after the
            atmospheric step when both are enabled).

    Returns:
        Tuple of (x, processed y).
    """
    result = y.copy()
    result = (
        remove_atmospheric_interference(result) if atmospheric_correction else result
    )
    result = remove_water_vapor_bands(result, x) if water_correction else result
    return x, result


def get_modality_info(modality: str) -> dict:
    """Look up the validation range and default parameters of a modality.

    Args:
        modality: Key into ``MODALITY_PARAMS`` / ``MODALITY_RANGES``.

    Returns:
        Dict with ``"range"`` (the modality's entry in ``MODALITY_RANGES``)
        and ``"params"`` (a shallow copy of its entry in ``MODALITY_PARAMS``,
        so callers can modify it without touching the module defaults).

    Raises:
        ValueError: If the modality is not a key of ``MODALITY_PARAMS``.
    """
    if modality in MODALITY_PARAMS:
        wavenumber_range = MODALITY_RANGES[modality]
        defaults = dict(MODALITY_PARAMS[modality])
        return {"range": wavenumber_range, "params": defaults}
    raise ValueError(f"Unknown modality '{modality}'")