devjas1 committed on
Commit
65f2520
·
1 Parent(s): 4b30bc8

(FEAT): add utility functions for polymer classification app preprocessing

Browse files
deploy/hf-space/utils/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """Utility functions for the polymer classification app"""
2
+ from .preprocessing import resample_spectrum
3
+
4
+ __all__ = ['resample_spectrum']
deploy/hf-space/utils/preprocessing.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Preprocessing utilities for polymer classification app.
3
+ Adapted from the original scripts/preprocess_dataset.py for Hugging Face Spaces deployment.
4
+ """
5
+
6
+ import numpy as np
7
+ from scipy.interpolate import interp1d
8
+ from scipy.signal import savgol_filter
9
+ from sklearn.preprocessing import minmax_scale
10
+
11
+ # Default resample target
12
+ TARGET_LENGTH = 500
13
+
14
def remove_baseline(y, deg=2):
    """Simple baseline correction using polynomial fitting (order 2 by default).

    A polynomial is least-squares fitted to the intensities against their
    sample index and then subtracted from them.

    Args:
        y (array-like): Intensity values
        deg (int): Degree of the baseline polynomial. It is lowered
            automatically when there are too few points to determine
            ``deg + 1`` coefficients.

    Returns:
        np.ndarray: Intensities with the fitted baseline subtracted
    """
    y = np.asarray(y, dtype=float)
    n_points = len(y)
    if n_points == 0:
        # np.polyfit rejects empty vectors; there is nothing to correct.
        return y

    # A degree-d polynomial needs d + 1 points; clamping keeps the fit
    # well-posed (and free of rank warnings) for very short inputs.
    fit_deg = min(deg, n_points - 1)

    x = np.arange(n_points)
    coeffs = np.polyfit(x, y, deg=fit_deg)
    return y - np.polyval(coeffs, x)
20
+
21
def normalize_spectrum(y):
    """Rescale the intensities with min-max normalization to [0, 1]."""
    scaled = minmax_scale(y)
    return scaled
24
+
25
def smooth_spectrum(y, window_length=11, polyorder=2):
    """Apply Savitzky-Golay smoothing.

    Args:
        y (array-like): Intensity values
        window_length (int): Requested filter window. When the spectrum is
            shorter than this, the largest odd window that fits is used.
        polyorder (int): Order of the polynomial fitted inside each window.
            Lowered to ``window_length - 1`` when it would not fit.

    Returns:
        The smoothed intensities, or ``y`` unchanged when the usable window
        is shorter than 3 points.
    """
    n_points = len(y)
    if n_points < window_length:
        # Shrink to the largest odd window that still fits the data.
        window_length = n_points if n_points % 2 == 1 else n_points - 1
    if window_length < 3:
        return y

    # savgol_filter requires polyorder < window_length; without this clamp a
    # shrunken window (or a large requested order) raises ValueError.
    polyorder = min(polyorder, window_length - 1)
    return savgol_filter(y, window_length, polyorder)
32
+
33
def resample_spectrum(x, y, target_len=TARGET_LENGTH):
    """
    Resample a spectrum to a fixed number of points using linear interpolation.

    The samples are ordered by x, samples sharing the same x are merged by
    averaging their y values, and the result is evaluated on ``target_len``
    evenly spaced positions spanning the smallest to the largest x.

    Args:
        x (array-like): Wavenumber values
        y (array-like): Intensity values
        target_len (int): Target number of points

    Returns:
        np.ndarray: Resampled intensity values

    Raises:
        ValueError: If x and y differ in length, fewer than 2 points are
            given, or fewer than 2 distinct x values remain once duplicates
            are merged.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    if len(x) != len(y):
        raise ValueError(f"x and y must have same length: {len(x)} vs {len(y)}")

    if len(x) < 2:
        raise ValueError("Need at least 2 points for interpolation")

    # np.unique returns the distinct x values already sorted; `groups` maps
    # every input sample to the slot of its x value.
    x_unique, groups = np.unique(x, return_inverse=True)
    if len(x_unique) < 2:
        # All samples share one x: there is no interval to interpolate over.
        raise ValueError("Need at least 2 distinct x values for interpolation")

    # Average y per distinct x (sum per slot / samples per slot). Slots with a
    # single sample are divided by 1 and therefore pass through unchanged.
    n_slots = len(x_unique)
    counts = np.bincount(groups, minlength=n_slots)
    y_unique = np.bincount(groups, weights=y, minlength=n_slots) / counts

    # The grid lies inside [x_unique[0], x_unique[-1]], so no extrapolation
    # is ever requested from the interpolation.
    x_uniform = np.linspace(x_unique[0], x_unique[-1], target_len)
    return np.interp(x_uniform, x_unique, y_unique)
79
+
80
def preprocess_spectrum(x, y, target_len=TARGET_LENGTH, baseline_correction=False,
                        apply_smoothing=False, normalize=False):
    """
    Complete preprocessing pipeline for a single spectrum.

    The spectrum is always resampled first; the remaining steps are optional
    and, when enabled, run in this order: baseline removal, smoothing,
    normalization.

    Args:
        x (array-like): Wavenumber values
        y (array-like): Intensity values
        target_len (int): Number of points to resample to (defaults to the
            module-wide TARGET_LENGTH so it stays in sync with
            resample_spectrum)
        baseline_correction (bool): Whether to apply baseline removal
        apply_smoothing (bool): Whether to apply Savitzky-Golay smoothing
        normalize (bool): Whether to apply min-max normalization

    Returns:
        np.ndarray: Preprocessed spectrum
    """
    # Resample first so every later step works on a fixed-length signal.
    y_processed = resample_spectrum(x, y, target_len=target_len)

    # Optional preprocessing steps
    if baseline_correction:
        y_processed = remove_baseline(y_processed)

    if apply_smoothing:
        y_processed = smooth_spectrum(y_processed)

    if normalize:
        y_processed = normalize_spectrum(y_processed)

    return y_processed