devjas1 committed on
Commit
f5cad9a
·
1 Parent(s): 05d496e

(FEAT+REFAC)[Data Preprocessing]: Expand preprocessing for multi-modality & file formats

Browse files

- Updated preprocessing logic to support both Raman and FTIR modalities.
- Added parameterization for baseline correction, Savitzky-Golay smoothing, and min-max normalization.
- Enhanced 'preprocess_spectrum()' to accept modality and target length for resampling.
- Improved error handling and data validation for input spectra.
- Updated docstrings and ensured compatibility with TXT, CSV, and JSON file formats for seamless integration with new input pipeline.

Files changed (1) hide show
  1. utils/preprocessing.py +133 -10
utils/preprocessing.py CHANGED
@@ -1,6 +1,7 @@
1
  """
2
  Preprocessing utilities for polymer classification app.
3
  Adapted from the original scripts/preprocess_dataset.py for Hugging Face Spaces deployment.
 
4
  """
5
 
6
  from __future__ import annotations
@@ -9,8 +10,33 @@ from numpy.typing import DTypeLike
9
  from scipy.interpolate import interp1d
10
  from scipy.signal import savgol_filter
11
  from scipy.interpolate import interp1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- TARGET_LENGTH = 500 # Frozen default per PREPROCESSING_BASELINE
14
 
15
  def _ensure_1d_equal(x: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
16
  x = np.asarray(x, dtype=float)
@@ -19,7 +45,10 @@ def _ensure_1d_equal(x: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, np.ndarr
19
  raise ValueError("x and y must be 1D arrays of equal length >= 2")
20
  return x, y
21
 
22
- def resample_spectrum(x: np.ndarray, y: np.ndarray, target_len: int = TARGET_LENGTH) -> tuple[np.ndarray, np.ndarray]:
 
 
 
23
  """Linear re-sampling onto a uniform grid of length target_len."""
24
  x, y = _ensure_1d_equal(x, y)
25
  order = np.argsort(x)
@@ -29,6 +58,7 @@ def resample_spectrum(x: np.ndarray, y: np.ndarray, target_len: int = TARGET_LEN
29
  y_new = f(x_new)
30
  return x_new, y_new
31
 
 
32
  def remove_baseline(y: np.ndarray, degree: int = 2) -> np.ndarray:
33
  """Polynomial baseline subtraction (degree=2 default)"""
34
  y = np.asarray(y, dtype=float)
@@ -37,19 +67,25 @@ def remove_baseline(y: np.ndarray, degree: int = 2) -> np.ndarray:
37
  baseline = np.polyval(coeffs, x_idx)
38
  return y - baseline
39
 
40
- def smooth_spectrum(y: np.ndarray, window_length: int = 11, polyorder: int = 2) -> np.ndarray:
 
 
 
41
  """Savitzky-Golay smoothing with safe/odd window enforcement"""
42
  y = np.asarray(y, dtype=float)
43
  window_length = int(window_length)
44
  polyorder = int(polyorder)
45
  # === window must be odd and >= polyorder+1 ===
46
  if window_length % 2 == 0:
47
- window_length += 1
48
  min_win = polyorder + 1
49
  if min_win % 2 == 0:
50
  min_win += 1
51
  window_length = max(window_length, min_win)
52
- return savgol_filter(y, window_length=window_length, polyorder=polyorder, mode="interp")
 
 
 
53
 
54
  def normalize_spectrum(y: np.ndarray) -> np.ndarray:
55
  """Min-max normalization to [0, 1] with constant-signal guard."""
@@ -60,27 +96,114 @@ def normalize_spectrum(y: np.ndarray) -> np.ndarray:
60
  return np.zeros_like(y)
61
  return (y - y_min) / (y_max - y_min)
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def preprocess_spectrum(
64
  x: np.ndarray,
65
  y: np.ndarray,
66
  *,
67
  target_len: int = TARGET_LENGTH,
 
68
  do_baseline: bool = True,
69
- degree: int = 2,
70
  do_smooth: bool = True,
71
- window_length: int = 11,
72
- polyorder: int = 2,
73
  do_normalize: bool = True,
74
  out_dtype: DTypeLike = np.float32,
 
75
  ) -> tuple[np.ndarray, np.ndarray]:
76
- """Exact CLI baseline: resample -> baseline -> smooth -> normalize"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  x_rs, y_rs = resample_spectrum(x, y, target_len=target_len)
 
78
  if do_baseline:
79
  y_rs = remove_baseline(y_rs, degree=degree)
 
80
  if do_smooth:
81
  y_rs = smooth_spectrum(y_rs, window_length=window_length, polyorder=polyorder)
 
 
 
 
 
 
 
 
 
 
82
  if do_normalize:
83
  y_rs = normalize_spectrum(y_rs)
 
84
  # === Coerce to a real dtype to satisfy static checkers & runtime ===
85
  out_dt = np.dtype(out_dtype)
86
- return x_rs.astype(out_dt, copy=False), y_rs.astype(out_dt, copy=False)
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  Preprocessing utilities for polymer classification app.
3
  Adapted from the original scripts/preprocess_dataset.py for Hugging Face Spaces deployment.
4
+ Supports both Raman and FTIR spectroscopy modalities.
5
  """
6
 
7
  from __future__ import annotations
 
10
  from scipy.interpolate import interp1d
11
  from scipy.signal import savgol_filter
12
  from scipy.interpolate import interp1d
13
+ from typing import Tuple, Literal
14
+
15
TARGET_LENGTH = 500  # Frozen default per PREPROCESSING_BASELINE

# Modality-specific validation ranges (cm⁻¹), stored as (low, high) bounds.
MODALITY_RANGES: dict[str, tuple[int, int]] = {
    "raman": (200, 4000),  # Typical Raman range
    "ftir": (400, 4000),  # FTIR wavenumber range
}

# Modality-specific preprocessing parameters.
# "baseline_degree", "smooth_window" and "smooth_polyorder" are the defaults
# that preprocess_spectrum() falls back to when the caller passes None.
# The boolean entries are placeholder flags: no corresponding processing step
# is implemented in this module yet.
MODALITY_PARAMS: dict[str, dict[str, int | bool]] = {
    "raman": {
        "baseline_degree": 2,
        "smooth_window": 11,
        "smooth_polyorder": 2,
        "cosmic_ray_removal": False,
    },
    "ftir": {
        "baseline_degree": 2,
        "smooth_window": 13,  # Slightly larger window for FTIR
        "smooth_polyorder": 2,
        "cosmic_ray_removal": False,  # Placeholder flag; no removal step exists
        "atmospheric_correction": False,  # Placeholder for future implementation
    },
}
39
 
 
40
 
41
  def _ensure_1d_equal(x: np.ndarray, y: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
42
  x = np.asarray(x, dtype=float)
 
45
  raise ValueError("x and y must be 1D arrays of equal length >= 2")
46
  return x, y
47
 
48
+
49
+ def resample_spectrum(
50
+ x: np.ndarray, y: np.ndarray, target_len: int = TARGET_LENGTH
51
+ ) -> tuple[np.ndarray, np.ndarray]:
52
  """Linear re-sampling onto a uniform grid of length target_len."""
53
  x, y = _ensure_1d_equal(x, y)
54
  order = np.argsort(x)
 
58
  y_new = f(x_new)
59
  return x_new, y_new
60
 
61
+
62
  def remove_baseline(y: np.ndarray, degree: int = 2) -> np.ndarray:
63
  """Polynomial baseline subtraction (degree=2 default)"""
64
  y = np.asarray(y, dtype=float)
 
67
  baseline = np.polyval(coeffs, x_idx)
68
  return y - baseline
69
 
70
+
71
def smooth_spectrum(
    y: np.ndarray, window_length: int = 11, polyorder: int = 2
) -> np.ndarray:
    """Savitzky-Golay smoothing with safe/odd window enforcement.

    The requested window is first made odd and raised to at least
    ``polyorder + 1`` (rounded up to odd). It is then clamped to the largest
    odd length that fits inside the signal, because ``savgol_filter`` with
    ``mode="interp"`` rejects windows longer than the data.

    Args:
        y: Intensity values; converted to a float array.
        window_length: Requested filter window (coerced with ``int``).
        polyorder: Polynomial order of the local fit (coerced with ``int``).

    Returns:
        The smoothed signal with the same shape as ``y``. When the signal is
        too short for any window wider than ``polyorder``, an unsmoothed
        float copy of ``y`` is returned instead of raising.
    """
    y = np.asarray(y, dtype=float)
    window_length = int(window_length)
    polyorder = int(polyorder)

    # === window must be odd and >= polyorder+1 ===
    if window_length % 2 == 0:
        window_length += 1
    min_win = polyorder + 1
    if min_win % 2 == 0:
        min_win += 1
    window_length = max(window_length, min_win)

    # === window must also fit inside the signal (largest odd <= n) ===
    n_points = y.shape[-1] if y.ndim else 0
    max_win = n_points if n_points % 2 else n_points - 1
    window_length = min(window_length, max_win)
    if window_length <= polyorder:
        # No valid window exists for this signal length; skip smoothing.
        return y.copy()

    return savgol_filter(
        y, window_length=window_length, polyorder=polyorder, mode="interp"
    )
88
+
89
 
90
  def normalize_spectrum(y: np.ndarray) -> np.ndarray:
91
  """Min-max normalization to [0, 1] with constant-signal guard."""
 
96
  return np.zeros_like(y)
97
  return (y - y_min) / (y_max - y_min)
98
 
99
+
100
def validate_spectrum_range(x: np.ndarray, modality: str = "raman") -> bool:
    """Check whether most wavenumbers fall inside the modality's expected range.

    Args:
        x: Wavenumber axis; any array-like convertible to a float array.
        modality: Key into ``MODALITY_RANGES``.

    Returns:
        True when at least 70% of the points lie within the inclusive
        ``(low, high)`` range registered for ``modality``. Empty input
        returns False. NaN entries compare as out of range.

    Raises:
        ValueError: If ``modality`` is not a key of ``MODALITY_RANGES``.
    """
    if modality not in MODALITY_RANGES:
        raise ValueError(
            f"Unknown modality '{modality}'. Supported: {list(MODALITY_RANGES.keys())}"
        )

    x = np.asarray(x, dtype=float)
    if x.size == 0:
        # Nothing to validate; also avoids dividing by zero below.
        return False

    min_range, max_range = MODALITY_RANGES[modality]

    # Check if majority of data points are within range
    in_range = np.count_nonzero((x >= min_range) & (x <= max_range))

    return bool(in_range / x.size >= 0.7)  # At least 70% should be in range
115
+
116
+
117
def preprocess_spectrum(
    x: np.ndarray,
    y: np.ndarray,
    *,
    target_len: int = TARGET_LENGTH,
    modality: str = "raman",  # Selects the per-modality defaults
    do_baseline: bool = True,
    degree: int | None = None,  # Will use modality default if None
    do_smooth: bool = True,
    window_length: int | None = None,  # Will use modality default if None
    polyorder: int | None = None,  # Will use modality default if None
    do_normalize: bool = True,
    out_dtype: DTypeLike = np.float32,
    validate_range: bool = True,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Modality-aware preprocessing: resample -> baseline -> smooth -> normalize

    Args:
        x, y: Input spectrum data
        target_len: Target length for resampling
        modality: Key of MODALITY_PARAMS ('raman' or 'ftir')
        do_baseline: Enable baseline correction
        degree: Polynomial degree for baseline (uses modality default if None)
        do_smooth: Enable smoothing
        window_length: Smoothing window length (uses modality default if None)
        polyorder: Polynomial order for smoothing (uses modality default if None)
        do_normalize: Enable normalization
        out_dtype: Output data type applied to both returned arrays
        validate_range: Check if wavenumbers are in expected range for modality;
            a failed check only emits a UserWarning, processing continues

    Returns:
        Tuple of (resampled_x, processed_y)

    Raises:
        ValueError: If ``modality`` is not a key of MODALITY_PARAMS. The
            resampling helper also raises ValueError for inputs that are not
            1D arrays of equal length >= 2.
    """
    import warnings

    # Validate modality
    if modality not in MODALITY_PARAMS:
        raise ValueError(
            f"Unsupported modality '{modality}'. Supported: {list(MODALITY_PARAMS.keys())}"
        )

    # Get modality-specific parameters
    modality_config = MODALITY_PARAMS[modality]

    # Use modality defaults if parameters not specified
    if degree is None:
        degree = modality_config["baseline_degree"]
    if window_length is None:
        window_length = modality_config["smooth_window"]
    if polyorder is None:
        polyorder = modality_config["smooth_polyorder"]

    # Validate spectrum range if requested. This is advisory only, so it is
    # reported through the warnings machinery (filterable, goes to stderr)
    # rather than printed to stdout.
    if validate_range and not validate_spectrum_range(x, modality):
        warnings.warn(
            f"Spectrum wavenumbers may not be optimal for {modality.upper()} analysis",
            UserWarning,
            stacklevel=2,
        )

    # Standard preprocessing pipeline
    x_rs, y_rs = resample_spectrum(x, y, target_len=target_len)

    if do_baseline:
        y_rs = remove_baseline(y_rs, degree=degree)

    if do_smooth:
        y_rs = smooth_spectrum(y_rs, window_length=window_length, polyorder=polyorder)

    # NOTE: modality-specific extras flagged in MODALITY_PARAMS
    # ("atmospheric_correction", "cosmic_ray_removal") are not implemented;
    # when they are, they belong here, between smoothing and normalization.

    if do_normalize:
        y_rs = normalize_spectrum(y_rs)

    # === Coerce to a real dtype to satisfy static checkers & runtime ===
    out_dt = np.dtype(out_dtype)
    return x_rs.astype(out_dt, copy=False), y_rs.astype(out_dt, copy=False)
199
+
200
+
201
def get_modality_info(modality: str) -> dict:
    """Get processing parameters and validation ranges for a modality.

    Args:
        modality: Key present in both MODALITY_PARAMS and MODALITY_RANGES.

    Returns:
        A dict with two entries: ``"range"`` holds the value stored in
        MODALITY_RANGES, and ``"params"`` holds a shallow copy of the
        MODALITY_PARAMS entry, so callers cannot mutate the module-level
        table through it.

    Raises:
        ValueError: If ``modality`` is missing from either table.
    """
    # Check both tables so a mismatch between them surfaces as the same
    # ValueError as an unknown name, not as a KeyError from the lookup below.
    if modality not in MODALITY_PARAMS or modality not in MODALITY_RANGES:
        supported = [m for m in MODALITY_PARAMS if m in MODALITY_RANGES]
        raise ValueError(f"Unknown modality '{modality}'. Supported: {supported}")

    return {
        "range": MODALITY_RANGES[modality],
        "params": MODALITY_PARAMS[modality].copy(),
    }