devjas1 committed · Commit 6373c5a · 1 Parent(s): 8013c07

(SYNC): bring parity backend (utils/ scripts/ models/ tests/) from feat/ui-parity-rebuild; no UI changes

models/registry.py CHANGED
@@ -21,4 +21,15 @@ def build(name: str, input_length: int):
21
  raise ValueError(f"Unknown model '{name}'. Choices: {choices()}")
22
  return _REGISTRY[name](input_length)
23
 
24
- __all__ = ["choices", "build"]
21
  raise ValueError(f"Unknown model '{name}'. Choices: {choices()}")
22
  return _REGISTRY[name](input_length)
23
 
24
+ def spec(name: str):
25
+ """Return expected input length and number of classes for a model key."""
26
+ if name == "figure2":
27
+ return {"input_length": 500, "num_classes": 2}
28
+ if name == "resnet":
29
+ return {"input_length": 500, "num_classes": 2}
30
+ if name == "resnet18vision":
31
+ return {"input_length": 500, "num_classes": 2}
32
+ raise KeyError(f"Unknown model '{name}'")
33
+
34
+
35
+ __all__ = ["choices", "build"]
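A minimal usage sketch for the new spec() helper alongside build(); it assumes, consistent with run_inference.py later in this commit, that build() returns a torch nn.Module expecting input of shape (batch, 1, input_length). The snippet is illustrative only and not part of the commit:

import torch
from models.registry import build, spec

meta = spec("figure2")                           # {"input_length": 500, "num_classes": 2}
model = build("figure2", meta["input_length"])   # build the registered architecture
dummy = torch.zeros(1, 1, meta["input_length"])  # (batch, channel, length)
with torch.no_grad():
    logits = model(dummy)                        # expected shape: (1, meta["num_classes"])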
scripts/preprocess_dataset.py CHANGED
@@ -1,86 +1,42 @@
1
- """
2
- This script preprocesses a dataset of spectra by resampling and labeling the data.
3
-
4
- Functions:
5
- - resample_spectrum(x, y, target_len): Resamples a spectrum to a fixed number of points.
6
- - preprocess_dataset(...): Loads, resamples, and applies optional preprocessing steps:
7
- - baseline correction
8
- - Savitzky-Golay smoothing
9
- - min-max normalization
10
-
11
- The script expects the dataset directory to contain text files representing spectra.
12
- Each file is:
13
- 1. Listed using `list_txt_files()`
14
- 2. Labeled using `label_file()`
15
- 3. Loaded using `load_spectrum()`
16
- 4. Resampled and optionally cleaned
17
- 5. Returned as arrays suitable for ML training
18
-
19
- Dependencies:
20
- - numpy
21
- - scipy.interpolate, scipy.signal
22
- - sklearn.preprocessing
23
- - list_spectra (custom)
24
- - plot_spectrum (custom)
25
  """
26
 
27
  import os
28
  import sys
29
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
30
  import numpy as np
31
- from scipy.interpolate import interp1d
32
- from scipy.signal import savgol_filter
33
- from sklearn.preprocessing import minmax_scale
34
- from scripts.discover_raman_files import list_txt_files, label_file
35
- from scripts.plot_spectrum import load_spectrum
36
-
37
- # Default resample target
38
- TARGET_LENGTH = 500
39
-
40
- # Optional preprocessing steps
41
- def remove_baseline(y):
42
- """Simple baseline correction using polynomial fitting (order 2)"""
43
- x = np.arange(len(y))
44
- coeffs = np.polyfit(x, y, deg=2)
45
- baseline = np.polyval(coeffs, x)
46
- return y - baseline
47
-
48
- def normalize_spectrum(y):
49
- """Min-max normalization to [0, 1]"""
50
- return minmax_scale(y)
51
 
52
- def smooth_spectrum(y, window_length=11, polyorder=2):
53
- """Apply Savitzky-Golay smoothing."""
54
- return savgol_filter(y, window_length, polyorder)
 
55
 
56
- def resample_spectrum(x, y, target_len=TARGET_LENGTH):
57
- """Resample a spectrum to a fixed number of points."""
58
- f_interp = interp1d(x, y, kind='linear', fill_value='extrapolate')
59
- x_uniform = np.linspace(min(x), max(x), target_len)
60
- y_uniform = f_interp(x_uniform)
61
- return y_uniform
62
 
63
  def preprocess_dataset(
64
- dataset_dir,
65
- target_len=500,
66
- baseline_correction=False,
67
- apply_smoothing=False,
68
- normalize=False
 
69
  ):
70
  """
71
- Load, resample, and preprocess all valid spectra in the dataset.
72
-
73
- Args:
74
- dataset_dir (str): Path to the dataset
75
- target_len (int): Number of points to resample to
76
- baseline_correction (bool): Whether to apply baseline removal
77
- apply_smoothing (bool): Whether to apply Savitzky-Golay smoothing
78
- normalize (bool): Whether to apply min-max normalization
79
-
80
- Returns:
81
- X (np.ndarray): Preprocessed spectra
82
- y (np.ndarray): Corresponding labels
83
  """
 
84
  txt_paths = list_txt_files(dataset_dir)
85
  X, y_labels = [], []
86
 
@@ -93,29 +49,41 @@ def preprocess_dataset(
93
  if len(x_raw) < 10:
94
  continue # Skip files with too few points
95
 
96
- # Resample
97
- y_processed = resample_spectrum(x_raw, y_raw, target_len=target_len)
98
-
99
- # Optional preprocessing
100
- if baseline_correction:
101
- y_processed = remove_baseline(y_processed)
102
- if apply_smoothing:
103
- y_processed = smooth_spectrum(y_processed)
104
- if normalize:
105
- y_processed = normalize_spectrum(y_processed)
106
107
  X.append(y_processed)
108
- y_labels.append(label)
109
 
110
- return np.array(X), np.array(y_labels)
111
 
112
- # Optional: Run directly for testing
113
  if __name__ == "__main__":
114
- dataset_dir = os.path.join(
115
- "datasets", "rdwp"
116
- )
117
- X, y = preprocess_dataset(dataset_dir)
118
-
119
- print(f"X shape: {X.shape}")
120
- print(f"y shape: {y.shape}")
121
- print(f"Label distribution: {np.bincount(y)}")
 
 
 
 
1
+ """preprocess_dataset.py
2
+
3
+ Canonical Raman preprocessing for dataset splits.
4
+ Uses the single source of truth in utils.preprocessing:
5
+ resample → baseline (deg=2) → smooth (w=11,o=2) → normalize.
6
  """
7
 
8
  import os
9
  import sys
10
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
11
  import numpy as np
12
 
13
+ from utils.preprocessing import (
14
+ TARGET_LENGTH,
15
+ preprocess_spectrum
16
+ )
17
 
18
+ from scripts.discover_raman_files import list_txt_files, label_file
19
+ from scripts.plot_spectrum import load_spectrum
20
 
21
  def preprocess_dataset(
22
+ dataset_dir: str,
23
+ target_len: int = TARGET_LENGTH,
24
+ baseline_correction: bool = True,
25
+ apply_smoothing: bool = True,
26
+ normalize: bool = True,
27
+ out_dtype: str = "float32",
28
  ):
29
  """
30
+ Load, preprocess, and label Raman spectra in dataset_dir.
31
+
32
+ Returns
33
+ -------
34
+ X : np.ndarray, shape (N, target_len), dtype=out_dtype
35
+ Preprocessed spectra (resampled and transformed).
36
+ y : np.ndarray, shape (N,), dtype=int64
37
+ Integer labels (e.g., 0 = stable, 1 = weathered).
38
  """
39
+
40
  txt_paths = list_txt_files(dataset_dir)
41
  X, y_labels = [], []
42
 
 
49
  if len(x_raw) < 10:
50
  continue # Skip files with too few points
51
52
 
53
+ # === Single-source-of-truth path ===
54
+ _, y_processed = preprocess_spectrum(
55
+ np.asarray(x_raw),
56
+ np.asarray(y_raw),
57
+ target_len=target_len,
58
+ do_baseline=baseline_correction,
59
+ do_smooth=apply_smoothing,
60
+ do_normalize=normalize,
61
+ out_dtype=out_dtype,  # str is OK (DTypeLike)
62
+ )
63
+
64
+ # === Collect ===
65
  X.append(y_processed)
66
+ y_labels.append(int(label))
67
+
68
+ if not X:
69
+ # === No valid samples ===
70
+ return np.empty((0, target_len), dtype=out_dtype), np.empty((0,), dtype=np.int64)
71
+
72
+ X_arr = np.asarray(X, dtype=np.dtype(out_dtype))
73
+ Y_arr = np.asarray(y_labels, dtype=np.int64)
74
 
75
+ return X_arr, Y_arr
76
 
77
+ # === Optional: Run directly for quick smoke test ===
78
  if __name__ == "__main__":
79
+ test_dataset_dir = os.path.join("datasets", "rdwp")
80
+ X, y = preprocess_dataset(test_dataset_dir)
81
+
82
+ print(f"X shape: {X.shape} dtype={X.dtype}")
83
+ print(f"y shape: {y.shape} dtype={y.dtype}")
84
+ if y.size:
85
+ try:
86
+ counts = np.bincount(y, minlength=2)
87
+ print(f"Label distribution: {counts} (stable, weathered)")
88
+ except Exception as e:
89
+ print(f"Could not compute label distribution {e}")
scripts/run_inference.py CHANGED
@@ -1,142 +1,161 @@
1
- import sys
2
  import os
 
3
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
4
- from pathlib import Path
5
 
6
  import argparse
7
- import warnings
8
  import logging
 
 
 
9
 
10
  import numpy as np
11
  import torch
12
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13
 
14
- from scripts.preprocess_dataset import resample_spectrum, label_file
15
- from models.registry import choices as model_choices, build as build_model
 
 
16
 
17
18
 
19
- # =============================================
20
- # Raman-Only Inference Script
21
- # This script supports prediction on a single Raman spectrum (.txt file).
22
- # FTIR inference has been deprecated and removed for scientific integrity.
23
- # See: @raman-pipeline-focus-milestone
24
- # =============================================
25
26
 
27
- warnings.filterwarnings(
28
- "ignore",
29
- message=".*weights_only=False.*",
30
- category=FutureWarning
31
- )
32
 
33
-
34
- def load_raman_spectrum(filepath):
35
- """Load a 2-column Raman spectrum from a .txt file"""
36
- x_vals, y_vals = [], []
37
- with open(filepath, 'r', encoding='utf-8') as f:
38
- for line in f:
39
- parts = line.strip().split()
40
- if len(parts) == 2:
41
- try:
42
- x, y = float(parts[0]), float(parts[1])
43
- x_vals.append(x)
44
- y_vals.append(y)
45
- except ValueError:
46
- continue
47
- return np.array(x_vals), np.array(y_vals)
48
-
49
-
50
- if __name__ == "__main__":
51
- parser = argparse.ArgumentParser(
52
- description="Run inference on a single Raman spectrum (.txt file)."
53
- )
54
- parser.add_argument("--arch", type=str, default="figure2", choices=model_choices(),
55
- help="Model architecture (must match the provided weights).") # NEW
56
- parser.add_argument(
57
- "--target-len", type=int, required=True,
58
- help="Target length to match model input"
59
- )
60
- parser.add_argument(
61
- "--input", required=True,
62
- help="Path to Raman .txt file."
63
- )
64
- parser.add_argument(
65
- "--model", default="random",
66
- help="Path to .pth model file, or specify 'random' to use untrained weights."
67
- )
68
- parser.add_argument(
69
- "--output", default=None,
70
- help="Where to write prediction result. If omitted, prints to stdout."
71
- )
72
- verbosity = parser.add_mutually_exclusive_group()
73
- verbosity.add_argument(
74
- "--quiet", action="store_true",
75
- help="Show only warnings and errors"
76
- )
77
- verbosity.add_argument(
78
- "--verbose", action="store_true",
79
- help="Show debug-level logging"
 
 
80
  )
81
 
82
- args = parser.parse_args()
83
 
84
- # configure logging
85
- level = logging.INFO
86
- if args.verbose:
87
- level = logging.DEBUG
88
- elif args.quiet:
89
- level = logging.WARNING
90
- logging.basicConfig(level=level, format="%(levelname)s: %(message)s")
91
 
92
- try:
93
- # Load & preprocess Raman spectrum
94
- if os.path.isdir(args.input):
95
- parser.error(f"Input must be a single Raman .txt file, got a directory: {args.input}")
96
-
97
- x_raw, y_raw = load_raman_spectrum(args.input)
98
- if len(x_raw) < 10:
99
- parser.error("Spectrum too short for inference.")
100
-
101
- data = resample_spectrum(x_raw, y_raw, target_len=args.target_len)
102
- # Shape = (1, 1, target_len) — valid input for Raman inference
103
- input_tensor = torch.tensor(data, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(DEVICE)
104
-
105
-
106
- # 2. Load Model (via shared model registry)
107
- model = build_model(args.arch, args.target_len).to(DEVICE)
108
- if args.model != "random":
109
- state = torch.load(args.model, map_location="cpu") # broad compatibility
110
- model.load_state_dict(state)
111
- model.eval()
112
-
113
-
114
-
115
- # 3. Inference
116
- with torch.no_grad():
117
- logits = model(input_tensor)
118
- pred = torch.argmax(logits, dim=1).item()
119
-
120
- # 4. True Label
121
- try:
122
- true_label = label_file(args.input)
123
- label_str = f"True Label: {true_label}"
124
- except FileNotFoundError:
125
- label_str = "True Label: Unknown"
126
-
127
- result = f"Predicted Label: {pred} {label_str}\nRaw Logits: {logits.tolist()}"
128
- logging.info(result)
129
-
130
- # 5. Save or stdout
131
- if args.output:
132
- # ensure parent dir exists (e.g., outputs/inference/)
133
- Path(args.output).parent.mkdir(parents=True, exist_ok=True)
134
- with open(args.output, "w", encoding="utf-8") as fout:
135
- fout.write(result)
136
- logging.info("Result saved to %s", args.output)
137
-
138
- sys.exit(0)
139
-
140
- except Exception as e:
141
- logging.error(e)
142
- sys.exit(1)
 
1
+ # scripts/run_inference.py
2
+ """
3
+ CLI inference with preprocessing parity.
4
+ Applies: resample → baseline (deg=2) → smooth (w=11,o=2) → normalize
5
+ unless explicitly disabled via flags.
6
+
7
+ Usage (examples):
8
+ python scripts/run_inference.py \
9
+ --input datasets/rdwp/sta-1.txt \
10
+ --arch figure2 \
11
+ --weights outputs/figure2_model.pth \
12
+ --target-len 500
13
+
14
+ # Disable smoothing only:
15
+ python scripts/run_inference.py --input ... --arch resnet --weights ... --disable-smooth
16
+ """
17
+
18
  import os
19
+ import sys
20
  sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 
21
 
22
  import argparse
23
+ import json
24
  import logging
25
+ from pathlib import Path
26
+ from typing import cast
27
+ from torch import nn
28
 
29
  import numpy as np
30
  import torch
31
+ import torch.nn.functional as F
32
 
33
+ from models.registry import build, choices
34
+ from utils.preprocessing import preprocess_spectrum, TARGET_LENGTH
35
+ from scripts.plot_spectrum import load_spectrum
36
+ from scripts.discover_raman_files import label_file
37
 
38
 
39
+ def parse_args():
40
+ p = argparse.ArgumentParser(description="Raman spectrum inference (parity with CLI preprocessing).")
41
+ p.add_argument("--input", required=True, help="Path to a single Raman .txt file (2 columns: x, y).")
42
+ p.add_argument("--arch", required=True, choices=choices(), help="Model architecture key.")
43
+ p.add_argument("--weights", required=True, help="Path to model weights (.pth).")
44
+ p.add_argument("--target-len", type=int, default=TARGET_LENGTH, help="Resample length (default: 500).")
45
 
46
+ # Default = ON; use disable- flags to turn steps off explicitly.
47
+ p.add_argument("--disable-baseline", action="store_true", help="Disable baseline correction.")
48
+ p.add_argument("--disable-smooth", action="store_true", help="Disable Savitzky–Golay smoothing.")
49
+ p.add_argument("--disable-normalize", action="store_true", help="Disable min-max normalization.")
 
 
50
 
51
+ p.add_argument("--output", default=None, help="Optional output JSON path (defaults to outputs/inference/<name>.json).")
52
+ p.add_argument("--device", default="cpu", choices=["cpu", "cuda"], help="Compute device (default: cpu).")
53
+ return p.parse_args()
54
55
 
56
+ def _load_state_dict_safe(path: str):
57
+ """Load a state dict safely across torch versions & checkpoint formats."""
58
+ try:
59
+ obj = torch.load(path, map_location="cpu", weights_only=True) # newer torch
60
+ except TypeError:
61
+ obj = torch.load(path, map_location="cpu") # fallback for older torch
62
+
63
+ # Accept either a plain state_dict or a checkpoint dict that contains one
64
+ if isinstance(obj, dict):
65
+ for k in ("state_dict", "model_state_dict", "model"):
66
+ if k in obj and isinstance(obj[k], dict):
67
+ obj = obj[k]
68
+ break
69
+
70
+ if not isinstance(obj, dict):
71
+ raise ValueError(
72
+ "Loaded object is not a state_dict or checkpoint with a state_dict. "
73
+ f"Type={type(obj)} from file={path}"
74
+ )
75
+
76
+ # Strip DataParallel 'module.' prefixes if present
77
+ if any(key.startswith("module.") for key in obj.keys()):
78
+ obj = {key.replace("module.", "", 1): val for key, val in obj.items()}
79
+
80
+ return obj
81
+
82
+
83
+ def main():
84
+ logging.basicConfig(level=logging.INFO, format="INFO: %(message)s")
85
+ args = parse_args()
86
+
87
+ in_path = Path(args.input)
88
+ if not in_path.exists():
89
+ raise FileNotFoundError(f"Input file not found: {in_path}")
90
+
91
+ # --- Load raw spectrum
92
+ x_raw, y_raw = load_spectrum(str(in_path))
93
+ if len(x_raw) < 10:
94
+ raise ValueError("Input spectrum has too few points (<10).")
95
+
96
+ # --- Preprocess (single source of truth)
97
+ _, y_proc = preprocess_spectrum(
98
+ np.array(x_raw),
99
+ np.array(y_raw),
100
+ target_len=args.target_len,
101
+ do_baseline=not args.disable_baseline,
102
+ do_smooth=not args.disable_smooth,
103
+ do_normalize=not args.disable_normalize,
104
+ out_dtype="float32",
105
  )
106
 
107
+ # --- Build model & load weights (safe)
108
+ device = torch.device(args.device if (args.device == "cuda" and torch.cuda.is_available()) else "cpu")
109
+ model = cast(nn.Module, build(args.arch, args.target_len)).to(device)
110
+ state = _load_state_dict_safe(args.weights)
111
+ missing, unexpected = model.load_state_dict(state, strict=False)
112
+ if missing or unexpected:
113
+ logging.info("Loaded with non-strict keys. missing=%d unexpected=%d", len(missing), len(unexpected))
114
+
115
+ model.eval()
116
+
117
+ # Shape: (B, C, L) = (1, 1, target_len)
118
+ x_tensor = torch.from_numpy(y_proc[None, None, :]).to(device)
119
+
120
+ with torch.no_grad():
121
+ logits = model(x_tensor).float().cpu() # shape (1, num_classes)
122
+ probs = F.softmax(logits, dim=1)
123
+
124
+ probs_np = probs.numpy().ravel().tolist()
125
+ logits_np = logits.numpy().ravel().tolist()
126
+ pred_label = int(np.argmax(probs_np))
127
+
128
+ # Optional ground-truth from filename (if encoded)
129
+ true_label = label_file(str(in_path))
130
+
131
+ # --- Prepare output
132
+ out_dir = Path("outputs") / "inference"
133
+ out_dir.mkdir(parents=True, exist_ok=True)
134
+ out_path = Path(args.output) if args.output else (out_dir / f"{in_path.stem}_{args.arch}.json")
135
+
136
+ result = {
137
+ "input_file": str(in_path),
138
+ "arch": args.arch,
139
+ "weights": str(args.weights),
140
+ "target_len": args.target_len,
141
+ "preprocessing": {
142
+ "baseline": not args.disable_baseline,
143
+ "smooth": not args.disable_smooth,
144
+ "normalize": not args.disable_normalize,
145
+ },
146
+ "predicted_label": pred_label,
147
+ "true_label": true_label,
148
+ "probs": probs_np,
149
+ "logits": logits_np,
150
+ }
151
+
152
+ with open(out_path, "w", encoding="utf-8") as f:
153
+ json.dump(result, f, indent=2)
154
+
155
+ logging.info("Predicted Label: %d True Label: %s", pred_label, true_label)
156
+ logging.info("Raw Logits: %s", logits_np)
157
+ logging.info("Result saved to %s", out_path)
158
159
 
160
+ if __name__ == "__main__":
161
+ main()
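A short sketch of consuming the JSON report that run_inference.py now writes; the key names mirror the result dict above, and the example file name assumes the default <input stem>_<arch>.json pattern:

import json
from pathlib import Path

# Hypothetical output of the CLI usage shown in the module docstring
out_path = Path("outputs") / "inference" / "sta-1_figure2.json"
if out_path.exists():
    result = json.loads(out_path.read_text(encoding="utf-8"))
    print(result["predicted_label"], result["true_label"])
    print(result["probs"])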
tests/conftest.py ADDED
@@ -0,0 +1,8 @@
1
+ # tests/conftest.py
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ # Add repo root to sys.path so "utils", "models", "scripts" are importable in tests
6
+ ROOT = Path(__file__).resolve().parents[1]
7
+ if str(ROOT) not in sys.path:
8
+ sys.path.insert(0, str(ROOT))
tests/test_preprocessing.py ADDED
@@ -0,0 +1,17 @@
1
+ import numpy as np
2
+ from utils.preprocessing import preprocess_spectrum, TARGET_LENGTH
3
+
4
+ def test_shapes_and_monotonicity():
5
+ x = np.linspace(100, 200, 300)
6
+ y = np.sin(x/10.0) + 0.01*(x - 100)
7
+ x2, y2 = preprocess_spectrum(x, y, target_len=TARGET_LENGTH)
8
+ assert x2.shape == (TARGET_LENGTH,)
9
+ assert y2.shape == (TARGET_LENGTH,)
10
+ assert np.all(np.diff(x2) > 0)
11
+
12
+ def test_idempotency():
13
+ x = np.linspace(0, 100, 400)
14
+ y = np.cos(x/7.0) + 0.002*x
15
+ _, y1 = preprocess_spectrum(x, y, target_len=TARGET_LENGTH)
16
+ _, y2 = preprocess_spectrum(np.linspace(x.min(), x.max(), TARGET_LENGTH), y1, target_len=TARGET_LENGTH)
17
+ np.testing.assert_allclose(y1, y2, rtol=1e-6, atol=1e-7)
utils/__init__.py CHANGED
@@ -1,4 +0,0 @@
1
- """Utility functions for the polymer classification app"""
2
- from .preprocessing import resample_spectrum
3
-
4
- __all__ = ['resample_spectrum']
utils/audit.py ADDED
@@ -0,0 +1,56 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ audit.py - quick audit tool for preprocessing baseline
4
+
5
+ Searches for relevant keywords in the ml-polymer-recycling repo
6
+ to confirm what preprocessing steps (resample, baseline, smooth,
7
+ normalize, etc.) are actually implemented in code/docs.
8
+ """
9
+
10
+ import re
11
+ from pathlib import Path
12
+
13
+ # ||== KEYWORDS TO TRACE ==||
14
+ KEYWORDS = [
15
+ "resample", "baseline", "smooth", "Savitz",
16
+ "normalize", "minmax" "TARGET_LENGTH", "WINDOW_LENGTH",
17
+ "POLYORDER", "DEGREE", "input_length", "target_len", "Figure2CNN", "ResNet"
18
+ ]
19
+
20
+ # ||==== DIRECTORIES/FILES TO SCAN ====||
21
+ TARGETS = [
22
+ "scripts/preprocess_dataset.py",
23
+ "scripts/run_inferece.py",
24
+ "models/",
25
+ "utils/",
26
+ "README.md",
27
+ "GROUND_TRUTH_PIPELINE.md",
28
+ "docs/"
29
+ ]
30
+
31
+ # ||==== COMPILE REGEX FOR KEYWORDS ====||
32
+ pattern = re.compile("|".join(KEYWORDS), re.IGNORECASE)
33
+
34
+ def scan_file(path: Path):
35
+ try:
36
+ with path.open(encoding="utf-8", errors="ignore") as f:
37
+ for i, line in enumerate(f, 1):
38
+ if pattern.search(line):
39
+ print(f"{path}:{i}: {line.strip()}")
40
+ except Exception as e:
41
+ print(f"[ERR] Could not read {path}: {e}")
42
+
43
+ def main():
44
+ root = Path(".").resolve()
45
+ for target in TARGETS:
46
+ p = root / target
47
+ if p.is_file():
48
+ scan_file(p)
49
+ elif p.is_dir():
50
+ for sub in p.rglob("*.py"):
51
+ scan_file(sub)
52
+ for sub in p.rglob("*.md"):
53
+ scan_file(sub)
54
+
55
+ if __name__ == "__main__":
56
+ main()
utils/preprocessing.py CHANGED
@@ -3,107 +3,84 @@ Preprocessing utilities for polymer classification app.
3
  Adapted from the original scripts/preprocess_dataset.py for Hugging Face Spaces deployment.
4
  """
5
 
 
6
  import numpy as np
 
7
  from scipy.interpolate import interp1d
8
  from scipy.signal import savgol_filter
9
- from sklearn.preprocessing import minmax_scale
10
-
11
- # Default resample target
12
- TARGET_LENGTH = 500
13
 
14
- def remove_baseline(y):
15
- """Simple baseline correction using polynomial fitting (order 2)"""
16
- x = np.arange(len(y))
17
- coeffs = np.polyfit(x, y, deg=2)
18
- baseline = np.polyval(coeffs, x)
19
  return y - baseline
20
 
21
- def normalize_spectrum(y):
22
- """Min-max normalization to [0, 1]"""
23
- return minmax_scale(y)
24
-
25
- def smooth_spectrum(y, window_length=11, polyorder=2):
26
- """Apply Savitzky-Golay smoothing."""
27
- if len(y) < window_length:
28
- window_length = len(y) if len(y) % 2 == 1 else len(y) - 1
29
- if window_length < 3:
30
- return y
31
- return savgol_filter(y, window_length, polyorder)
32
-
33
- def resample_spectrum(x, y, target_len=TARGET_LENGTH):
34
- """
35
- Resample a spectrum to a fixed number of points using linear interpolation.
36
-
37
- Args:
38
- x (array-like): Wavenumber values
39
- y (array-like): Intensity values
40
- target_len (int): Target number of points
41
-
42
- Returns:
43
- np.ndarray: Resampled intensity values
44
- """
45
- # Ensure inputs are numpy arrays
46
- x = np.asarray(x)
47
- y = np.asarray(y)
48
-
49
- # Check for valid input
50
- if len(x) != len(y):
51
- raise ValueError(f"x and y must have same length: {len(x)} vs {len(y)}")
52
-
53
- if len(x) < 2:
54
- raise ValueError("Need at least 2 points for interpolation")
55
-
56
- # Sort by x values to ensure monotonic order
57
- sort_idx = np.argsort(x)
58
- x_sorted = x[sort_idx]
59
- y_sorted = y[sort_idx]
60
-
61
- # Check for duplicate x values
62
- if len(np.unique(x_sorted)) != len(x_sorted):
63
- # Remove duplicates by averaging y values for same x
64
- x_unique, inverse_indices = np.unique(x_sorted, return_inverse=True)
65
- y_unique = np.zeros_like(x_unique, dtype=float)
66
- for i in range(len(x_unique)):
67
- mask = inverse_indices == i
68
- y_unique[i] = np.mean(y_sorted[mask])
69
- x_sorted, y_sorted = x_unique, y_unique
70
-
71
- # Create interpolation function
72
- f_interp = interp1d(x_sorted, y_sorted, kind='linear', bounds_error=False, fill_value=np.nan)
73
-
74
- # Generate uniform grid
75
- x_uniform = np.linspace(min(x_sorted), max(x_sorted), target_len)
76
- y_uniform = f_interp(x_uniform)
77
-
78
- return y_uniform
79
-
80
- def preprocess_spectrum(x, y, target_len=500, baseline_correction=False,
81
- apply_smoothing=False, normalize=False):
82
- """
83
- Complete preprocessing pipeline for a single spectrum.
84
-
85
- Args:
86
- x (array-like): Wavenumber values
87
- y (array-like): Intensity values
88
- target_len (int): Number of points to resample to
89
- baseline_correction (bool): Whether to apply baseline removal
90
- apply_smoothing (bool): Whether to apply Savitzky-Golay smoothing
91
- normalize (bool): Whether to apply min-max normalization
92
-
93
- Returns:
94
- np.ndarray: Preprocessed spectrum
95
- """
96
- # Resample first
97
- y_processed = resample_spectrum(x, y, target_len=target_len)
98
-
99
- # Optional preprocessing steps
100
- if baseline_correction:
101
- y_processed = remove_baseline(y_processed)
102
-
103
- if apply_smoothing:
104
- y_processed = smooth_spectrum(y_processed)
105
-
106
- if normalize:
107
- y_processed = normalize_spectrum(y_processed)
108
-
109
- return y_processed
 
3
  Adapted from the original scripts/preprocess_dataset.py for Hugging Face Spaces deployment.
4
  """
5
 
6
+ from __future__ import annotations
7
  import numpy as np
8
+ from numpy.typing import DTypeLike
9
  from scipy.interpolate import interp1d
10
  from scipy.signal import savgol_filter
11
12
 
13
+ TARGET_LENGTH = 500 # Frozen default per PREPROCESSING_BASELINE
14
+
15
+ def __ensure_1d_equal(x: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
16
+ x = np.asarray(x, dtype=float)
17
+ y = np.asarray(y, dtype=float)
18
+ if x.ndim != 1 or y.ndim != 1 or x.size != y.size or x.size < 2:
19
+ raise ValueError("x and y must be 1D arrays of equal length >= 2")
20
+ return x, y
21
+
22
+ def resample_spectrum(x: np.ndarray, y: np.ndarray, target_len: int = TARGET_LENGTH) -> tuple[np.ndarray, np.ndarray]:
23
+ """Linear re-sampling onto a uniform grid of length target_len."""
24
+ x, y = __ensure_1d_equal(x, y)
25
+ order = np.argsort(x)
26
+ x_sorted, y_sorted = x[order], y[order]
27
+ x_new = np.linspace(x_sorted[0], x_sorted[-1], int(target_len))
28
+ f = interp1d(x_sorted, y_sorted, kind="linear", assume_sorted=True)
29
+ y_new = f(x_new)
30
+ return x_new, y_new
31
+
32
+ def remove_baseline(y: np.ndarray, degree: int = 2) -> np.ndarray:
33
+ """Polynomial baseline subtraction (degree=2 default)"""
34
+ y = np.asarray(y, dtype=float)
35
+ x_idx = np.arange(y.size, dtype=float)
36
+ coeffs = np.polyfit(x_idx, y, deg=int(degree))
37
+ baseline = np.polyval(coeffs, x_idx)
38
  return y - baseline
39
 
40
+ def smooth_spectrum(y: np.ndarray, window_length: int = 11, polyorder: int = 2) -> np.ndarray:
41
+ """Savitzky-Golay smoothing with safe/odd window enforcement"""
42
+ y = np.asarray(y, dtype=float)
43
+ window_length = int(window_length)
44
+ polyorder = int(polyorder)
45
+ # === window must be odd and >= polyorder+1 ===
46
+ if window_length % 2 == 0:
47
+ window_length += 1
48
+ min_win = polyorder + 1
49
+ if min_win % 2 == 0:
50
+ min_win += 1
51
+ window_length = max(window_length, min_win)
52
+ return savgol_filter(y, window_length=window_length, polyorder=polyorder, mode="interp")
53
+
54
+ def normalize_spectrum(y: np.ndarray) -> np.ndarray:
55
+ """Min-max normalization to [0, 1] with constant-signal guard."""
56
+ y = np.asarray(y, dtype=float)
57
+ y_min = float(np.min(y))
58
+ y_max = float(np.max(y))
59
+ if np.isclose(y_max - y_min, 0.0):
60
+ return np.zeros_like(y)
61
+ return (y - y_min) / (y_max - y_min)
62
+
63
+ def preprocess_spectrum(
64
+ x: np.ndarray,
65
+ y: np.ndarray,
66
+ *,
67
+ target_len: int = TARGET_LENGTH,
68
+ do_baseline: bool = True,
69
+ degree: int = 2,
70
+ do_smooth: bool = True,
71
+ window_length: int = 11,
72
+ polyorder: int = 2,
73
+ do_normalize: bool = True,
74
+ out_dtype: DTypeLike = np.float32,
75
+ ) -> tuple[np.ndarray, np.ndarray]:
76
+ """Exact CLI baseline: resample -> baseline -> smooth -> normalize"""
77
+ x_rs, y_rs = resample_spectrum(x, y, target_len=target_len)
78
+ if do_baseline:
79
+ y_rs = remove_baseline(y_rs, degree=degree)
80
+ if do_smooth:
81
+ y_rs = smooth_spectrum(y_rs, window_length=window_length, polyorder=polyorder)
82
+ if do_normalize:
83
+ y_rs = normalize_spectrum(y_rs)
84
+ # === Coerce to a real dtype to satisfy static checkers & runtime ===
85
+ out_dt = np.dtype(out_dtype)
86
+ return x_rs.astype(out_dt, copy=False), y_rs.astype(out_dt, copy=False)
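A minimal end-to-end sketch of the frozen pipeline applied to a synthetic spectrum; the peak and drift values are invented, and only preprocess_spectrum and TARGET_LENGTH come from this module:

import numpy as np
from utils.preprocessing import preprocess_spectrum, TARGET_LENGTH

x = np.linspace(200.0, 3500.0, 1200)                  # 1200 raw points, resampled to TARGET_LENGTH
y = np.exp(-((x - 1450.0) / 40.0) ** 2) + 0.0005 * x  # synthetic peak plus linear drift
x_rs, y_rs = preprocess_spectrum(x, y, target_len=TARGET_LENGTH)
assert x_rs.shape == (TARGET_LENGTH,) and y_rs.shape == (TARGET_LENGTH,)
assert 0.0 <= float(y_rs.min()) and float(y_rs.max()) <= 1.0  # min-max normalized output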