File size: 7,586 Bytes
65f2520
 
 
f5cad9a
65f2520
 
6373c5a
65f2520
6373c5a
65f2520
 
6373c5a
f5cad9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65f2520
6373c5a
b1b7e3c
6373c5a
b1b7e3c
6373c5a
 
 
 
f5cad9a
 
 
 
6373c5a
b1b7e3c
6373c5a
 
 
 
 
 
 
f5cad9a
6373c5a
 
 
 
 
 
65f2520
 
f5cad9a
 
 
 
6373c5a
 
 
 
 
 
f5cad9a
6373c5a
 
 
 
f5cad9a
 
 
 
6373c5a
 
 
 
 
 
 
 
 
 
f5cad9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6373c5a
 
 
 
 
f5cad9a
6373c5a
f5cad9a
6373c5a
f5cad9a
 
6373c5a
 
f5cad9a
6373c5a
f5cad9a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6373c5a
f5cad9a
6373c5a
 
f5cad9a
6373c5a
 
f5cad9a
 
 
 
 
 
 
 
 
 
6373c5a
 
f5cad9a
6373c5a
 
f5cad9a
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
"""

Preprocessing utilities for polymer classification app.

Adapted from the original scripts/preprocess_dataset.py for Hugging Face Spaces deployment.

Supports both Raman and FTIR spectroscopy modalities.

"""

from __future__ import annotations
import numpy as np
from numpy.typing import DTypeLike
from scipy.interpolate import interp1d
from scipy.signal import savgol_filter
from scipy.interpolate import interp1d
from typing import Tuple, Literal

TARGET_LENGTH = 500  # Frozen default per PREPROCESSING_BASELINE

# Modality-specific validation ranges (cm⁻¹)
MODALITY_RANGES = {
    "raman": (200, 4000),  # Typical Raman range
    "ftir": (400, 4000),  # FTIR wavenumber range
}

# Modality-specific preprocessing parameters
MODALITY_PARAMS = {
    "raman": {
        "baseline_degree": 2,
        "smooth_window": 11,
        "smooth_polyorder": 2,
        "cosmic_ray_removal": False,
    },
    "ftir": {
        "baseline_degree": 2,
        "smooth_window": 13,  # Slightly larger window for FTIR
        "smooth_polyorder": 2,
        "cosmic_ray_removal": False,  # Could add atmospheric correction
        "atmospheric_correction": False,  # Placeholder for future implementation
    },
}


def _ensure_1d_equal(x: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.ndim != 1 or y.ndim != 1 or x.size != y.size or x.size < 2:
        raise ValueError("x and y must be 1D arrays of equal length >= 2")
    return x, y


def resample_spectrum(

    x: np.ndarray, y: np.ndarray, target_len: int = TARGET_LENGTH

) -> tuple[np.ndarray, np.ndarray]:
    """Linear re-sampling onto a uniform grid of length target_len."""
    x, y = _ensure_1d_equal(x, y)
    order = np.argsort(x)
    x_sorted, y_sorted = x[order], y[order]
    x_new = np.linspace(x_sorted[0], x_sorted[-1], int(target_len))
    f = interp1d(x_sorted, y_sorted, kind="linear", assume_sorted=True)
    y_new = f(x_new)
    return x_new, y_new


def remove_baseline(y: np.ndarray, degree: int = 2) -> np.ndarray:
    """Polynomial baseline subtraction (degree=2 default)"""
    y = np.asarray(y, dtype=float)
    x_idx = np.arange(y.size, dtype=float)
    coeffs = np.polyfit(x_idx, y, deg=int(degree))
    baseline = np.polyval(coeffs, x_idx)
    return y - baseline


def smooth_spectrum(

    y: np.ndarray, window_length: int = 11, polyorder: int = 2

) -> np.ndarray:
    """Savitzky-Golay smoothing with safe/odd window enforcement"""
    y = np.asarray(y, dtype=float)
    window_length = int(window_length)
    polyorder = int(polyorder)
    # === window must be odd and >= polyorder+1 ===
    if window_length % 2 == 0:
        window_length += 1
    min_win = polyorder + 1
    if min_win % 2 == 0:
        min_win += 1
    window_length = max(window_length, min_win)
    return savgol_filter(
        y, window_length=window_length, polyorder=polyorder, mode="interp"
    )


def normalize_spectrum(y: np.ndarray) -> np.ndarray:
    """Min-max normalization to [0, 1] with constant-signal guard."""
    y = np.asarray(y, dtype=float)
    y_min = float(np.min(y))
    y_max = float(np.max(y))
    if np.isclose(y_max - y_min, 0.0):
        return np.zeros_like(y)
    return (y - y_min) / (y_max - y_min)


def validate_spectrum_range(x: np.ndarray, modality: str = "raman") -> bool:
    """Validate that spectrum wavenumbers are within expected range for modality."""
    if modality not in MODALITY_RANGES:
        raise ValueError(
            f"Unknown modality '{modality}'. Supported: {list(MODALITY_RANGES.keys())}"
        )

    min_range, max_range = MODALITY_RANGES[modality]
    x_min, x_max = np.min(x), np.max(x)

    # Check if majority of data points are within range
    in_range = np.sum((x >= min_range) & (x <= max_range))
    total_points = len(x)

    return (in_range / total_points) >= 0.7  # At least 70% should be in range


def preprocess_spectrum(

    x: np.ndarray,

    y: np.ndarray,

    *,

    target_len: int = TARGET_LENGTH,

    modality: str = "raman",  # New parameter for modality-specific processing

    do_baseline: bool = True,

    degree: int | None = None,  # Will use modality default if None

    do_smooth: bool = True,

    window_length: int | None = None,  # Will use modality default if None

    polyorder: int | None = None,  # Will use modality default if None

    do_normalize: bool = True,

    out_dtype: DTypeLike = np.float32,

    validate_range: bool = True,

) -> tuple[np.ndarray, np.ndarray]:
    """

    Modality-aware preprocessing: resample -> baseline -> smooth -> normalize



    Args:

        x, y: Input spectrum data

        target_len: Target length for resampling

        modality: 'raman' or 'ftir' for modality-specific processing

        do_baseline: Enable baseline correction

        degree: Polynomial degree for baseline (uses modality default if None)

        do_smooth: Enable smoothing

        window_length: Smoothing window length (uses modality default if None)

        polyorder: Polynomial order for smoothing (uses modality default if None)

        do_normalize: Enable normalization

        out_dtype: Output data type

        validate_range: Check if wavenumbers are in expected range for modality



    Returns:

        Tuple of (resampled_x, processed_y)

    """
    # Validate modality
    if modality not in MODALITY_PARAMS:
        raise ValueError(
            f"Unsupported modality '{modality}'. Supported: {list(MODALITY_PARAMS.keys())}"
        )

    # Get modality-specific parameters
    modality_config = MODALITY_PARAMS[modality]

    # Use modality defaults if parameters not specified
    if degree is None:
        degree = modality_config["baseline_degree"]
    if window_length is None:
        window_length = modality_config["smooth_window"]
    if polyorder is None:
        polyorder = modality_config["smooth_polyorder"]

    # Validate spectrum range if requested
    if validate_range:
        if not validate_spectrum_range(x, modality):
            print(
                f"Warning: Spectrum wavenumbers may not be optimal for {modality.upper()} analysis"
            )

    # Standard preprocessing pipeline
    x_rs, y_rs = resample_spectrum(x, y, target_len=target_len)

    if do_baseline:
        y_rs = remove_baseline(y_rs, degree=degree)

    if do_smooth:
        y_rs = smooth_spectrum(y_rs, window_length=window_length, polyorder=polyorder)

    # FTIR-specific processing (placeholder for future enhancements)
    if modality == "ftir":
        if modality_config.get("atmospheric_correction", False):
            # Placeholder for atmospheric correction
            pass
        if modality_config.get("cosmic_ray_removal", False):
            # Placeholder for cosmic ray removal
            pass

    if do_normalize:
        y_rs = normalize_spectrum(y_rs)

    # === Coerce to a real dtype to satisfy static checkers & runtime ===
    out_dt = np.dtype(out_dtype)
    return x_rs.astype(out_dt, copy=False), y_rs.astype(out_dt, copy=False)


def get_modality_info(modality: str) -> dict:
    """Get processing parameters and validation ranges for a modality."""
    if modality not in MODALITY_PARAMS:
        raise ValueError(f"Unknown modality '{modality}'")

    return {
        "range": MODALITY_RANGES[modality],
        "params": MODALITY_PARAMS[modality].copy(),
    }