devjas1 committed on
Commit
0a4f1a6
·
1 Parent(s): 6ea9614

(FEAT)[Data Parsing]: Support multi-format spectrum parsing and robust validation

Browse files

- Added 'detect_file_format' to auto-detect file type based on extension and content.
- Implemented 'parse_json_spectrum', 'parse_csv_spectrum', and 'parse_txt_spectrum' for flexible parsing of spectroscopy data.
- Unified entry point 'parse_spectrum_data' uses format detection and delegates to appropriate parser.
- Added 'validate_spectrum_data' to check for NaNs, monotonic x-axis, and reasonable value ranges, with sorting and warnings as needed.
- Updated error handling and logging for parsing failures or unusual data.
- Docstrings and comments improved for clarity.

Files changed (1) hide show
  1. utils/multifile.py +297 -56
utils/multifile.py CHANGED
@@ -1,11 +1,16 @@
1
- """Multi-file processing utiltities for batch inference.
2
- Handles multiple file uploads and iterative processing."""
 
3
 
4
- from typing import List, Dict, Any, Tuple, Optional
5
  import time
6
  import streamlit as st
7
  import numpy as np
8
  import pandas as pd
 
 
 
 
9
 
10
  from .preprocessing import resample_spectrum
11
  from .errors import ErrorHandler, safe_execute
@@ -13,83 +18,230 @@ from .results_manager import ResultsManager
13
  from .confidence import calculate_softmax_confidence
14
 
15
 
16
- def parse_spectrum_data(
17
- text_content: str, filename: str = "unknown"
18
- ) -> Tuple[np.ndarray, np.ndarray]:
19
- """
20
- Parse spectrum data from text content
21
 
22
  Args:
23
- text_content: Raw text content of the spectrum file
24
- filename: Name of the file for error reporting
25
 
26
  Returns:
27
- Tuple of (x_values, y_values) as numpy arrays
28
-
29
- Raises:
30
- ValueError: If the data cannot be parsed
31
  """
32
- try:
33
- lines = text_content.strip().split("\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- # ==Remove empty lines and comments==
36
- data_lines = []
37
- for line in lines:
38
- line = line.strip()
39
- if line and not line.startswith("#") and not line.startswith("%"):
40
- data_lines.append(line)
41
 
42
- if not data_lines:
43
- raise ValueError("No data lines found in file")
44
 
45
- # ==Try to parse==
46
- x_vals, y_vals = [], []
47
 
48
- for i, line in enumerate(data_lines):
49
- try:
50
- # Handle different separators
51
- parts = line.replace(",", " ").split()
52
- numbers = [
53
- p
54
- for p in parts
55
- if p.replace(".", "", 1)
56
- .replace("-", "", 1)
57
- .replace("+", "", 1)
58
- .isdigit()
59
- ]
60
- if len(numbers) >= 2:
61
- x_val = float(numbers[0])
62
- y_val = float(numbers[1])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  x_vals.append(x_val)
64
  y_vals.append(y_val)
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  except ValueError:
67
  ErrorHandler.log_warning(
68
- f"Could not parse line {i+1}: {line}", f"Parsing {filename}"
69
  )
70
  continue
71
 
72
- if len(x_vals) < 10: # ==Need minimum points for interpolation==
73
  raise ValueError(
74
  f"Insufficient data points ({len(x_vals)}). Need at least 10 points."
75
  )
76
 
77
- x = np.array(x_vals)
78
- y = np.array(y_vals)
79
 
80
- # Check for NaNs
81
- if np.any(np.isnan(x)) or np.any(np.isnan(y)):
82
- raise ValueError("Input data contains NaN values")
83
 
84
- # Check monotonic increasing x
85
- if not np.all(np.diff(x) > 0):
86
- raise ValueError("Wavenumbers must be strictly increasing")
87
 
88
- # Check reasonable range for Raman spectroscopy
89
- if min(x) < 0 or max(x) > 10000 or (max(x) - min(x)) < 100:
90
- raise ValueError(
91
- f"Invalid wavenumber range: {min(x)} - {max(x)}. Expected ~400-4000 cm⁻¹ with span >100"
92
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  return x, y
95
 
@@ -97,6 +249,95 @@ def parse_spectrum_data(
97
  raise ValueError(f"Failed to parse spectrum data: {str(e)}")
98
 
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  def process_single_file(
101
  filename: str,
102
  text_content: str,
 
1
+ """Multi-file processing utilities for batch inference.
2
+ Handles multiple file uploads and iterative processing.
3
+ Supports TXT, CSV, and JSON file formats with automatic detection."""
4
 
5
+ from typing import List, Dict, Any, Tuple, Optional, Union
6
  import time
7
  import streamlit as st
8
  import numpy as np
9
  import pandas as pd
10
+ import json
11
+ import csv
12
+ import io
13
+ from pathlib import Path
14
 
15
  from .preprocessing import resample_spectrum
16
  from .errors import ErrorHandler, safe_execute
 
18
  from .confidence import calculate_softmax_confidence
19
 
20
 
21
def detect_file_format(filename: str, content: str) -> str:
    """Detect the spectrum file format from the extension and the content.

    A ``.csv`` or ``.txt`` extension is trusted as-is. A ``.json`` extension
    is only trusted when the content actually parses as JSON. In every other
    case the content is sniffed: text starting with ``{`` or ``[`` that
    parses as JSON is "json"; text whose first five lines (after stripping)
    hold more commas than lines is "csv"; anything else is "txt".

    Args:
        filename: Name of the file; only its extension is inspected.
        content: Text content of the file.

    Returns:
        One of "txt", "csv" or "json".
    """
    # First try by extension
    suffix = Path(filename).suffix.lower()
    if suffix == ".csv":
        return "csv"
    if suffix == ".txt":
        return "txt"

    # Extension is .json, missing, or unknown: look at the content
    content_stripped = content.strip()

    # A .json extension and a JSON-looking prefix lead to the same check, so
    # the content is parsed at most once.
    if suffix == ".json" or content_stripped.startswith(("{", "[")):
        try:
            json.loads(content)
        except (ValueError, RecursionError):
            # json.JSONDecodeError is a ValueError; RecursionError covers
            # pathologically nested input. Not JSON: keep sniffing instead
            # of swallowing unrelated errors with a bare except.
            pass
        else:
            return "json"

    # Try CSV (look for commas in first few lines)
    lines = content_stripped.split("\n")[:5]
    comma_count = sum(line.count(",") for line in lines)
    if comma_count > len(lines):  # More commas than lines suggests CSV
        return "csv"

    # Default to TXT
    return "txt"
63
 
 
 
64
 
65
+ # /////////////////////////////////////////////////////
66
+
67
+
68
def parse_json_spectrum(
    content: str, filename: str = "unknown"
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Parse spectrum data from JSON format.

    Accepted layouts:
    - {"wavenumbers": [...], "intensities": [...]}
    - {"x": [...], "y": [...]}
    - [{"wavenumber": val, "intensity": val}, ...]

    Args:
        content: JSON document as text.
        filename: Name of the source file (currently unused by this parser).

    Returns:
        Tuple of (x_values, y_values) as 1-D float numpy arrays of equal length.

    Raises:
        ValueError: If the text is not valid JSON, no known x/y keys are
            found, the x and y data differ in length, or fewer than 10
            points are present.
    """
    # Key aliases, in lookup priority order, for each supported layout.
    dict_x_keys = ("wavenumbers", "wavenumber", "x", "freq", "frequency")
    dict_y_keys = ("intensities", "intensity", "y", "counts", "absorbance")
    item_x_keys = ("wavenumber", "wavenumbers", "x", "freq", "frequency")
    item_y_keys = ("intensity", "intensities", "y", "counts", "absorbance")

    try:
        # json.loads parses a string; json.load expects a file object and
        # would fail on every text input.
        data = json.loads(content)

        x_vals = None
        y_vals = None

        # Format 1: Object with arrays
        if isinstance(data, dict):
            x_key = next((key for key in dict_x_keys if key in data), None)
            y_key = next((key for key in dict_y_keys if key in data), None)
            if x_key is not None and y_key is not None:
                x_vals = np.array(data[x_key], dtype=float)
                y_vals = np.array(data[y_key], dtype=float)

        # Format 2: Array of objects
        elif isinstance(data, list):
            xs = []
            ys = []
            for item in data:
                if not isinstance(item, dict):
                    continue  # tolerate stray non-object entries
                x_key = next((key for key in item_x_keys if key in item), None)
                y_key = next((key for key in item_y_keys if key in item), None)
                # Entries lacking either coordinate are skipped
                if x_key is None or y_key is None:
                    continue
                x_val = float(item[x_key])
                y_val = float(item[y_key])
                xs.append(x_val)
                ys.append(y_val)
            if xs:
                x_vals = np.array(xs)
                y_vals = np.array(ys)

        if x_vals is None or y_vals is None:
            raise ValueError(
                "JSON format not recognized. Expected wavenumber/intensity pairs."
            )

        if x_vals.ndim != 1 or x_vals.shape != y_vals.shape:
            raise ValueError(
                "Wavenumber and intensity data must be flat lists of equal length."
            )

        # Same minimum the TXT and CSV parsers enforce
        if len(x_vals) < 10:
            raise ValueError(
                f"Insufficient data points ({len(x_vals)}). Need at least 10 points."
            )

        return x_vals, y_vals

    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON format: {str(e)}") from e
    except Exception as e:
        # Parser boundary: every failure surfaces as ValueError, as before.
        raise ValueError(f"Failed to parse JSON spectrum: {str(e)}") from e
140
+
141
+
142
+ # /////////////////////////////////////////////////////
143
+
144
+
145
def parse_csv_spectrum(
    content: str, filename: str = "unknown"
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Parse spectrum data from CSV format.

    The delimiter is guessed from the first 1024 characters: comma by
    default, semicolon or tab when either outnumbers the commas. The first
    row is treated as a header when its first two cells are not both
    numeric. The first column is read as x and the second as y; rows with
    fewer than two cells are skipped, rows with non-numeric cells are
    skipped with a warning.

    Args:
        content: CSV document as text.
        filename: Name of the source file, used in warning context.

    Returns:
        Tuple of (x_values, y_values) as numpy arrays.

    Raises:
        ValueError: If the file is empty, cannot be read as CSV, or yields
            fewer than 10 numeric rows.
    """
    try:
        # A byte-order mark would make the first cell non-numeric, so the
        # first data row would be mistaken for a header and dropped.
        text = content.lstrip("\ufeff")

        # Try to detect delimiter
        sample = text[:1024]
        comma_count = sample.count(",")
        delimiter = ","
        if sample.count(";") > comma_count:
            delimiter = ";"
        elif sample.count("\t") > comma_count:
            delimiter = "\t"

        # newline="" lets the csv module see \n, \r\n and \r line endings
        csv_file = io.StringIO(text, newline="")
        rows = list(csv.reader(csv_file, delimiter=delimiter))

        if not rows:
            raise ValueError("Empty CSV file")

        # If the first row's leading cells are not numeric, it's likely a header
        try:
            float(rows[0][0])
            float(rows[0][1])
            has_header = False
        except (ValueError, IndexError):
            has_header = True

        first_data_index = 1 if has_header else 0

        # Extract x and y values
        x_vals = []
        y_vals = []

        # Number rows as they appear in the file (1-based, header included)
        # so warnings point at the right row.
        for row_number, row in enumerate(
            rows[first_data_index:], start=first_data_index + 1
        ):
            if len(row) < 2:
                continue

            try:
                x_val = float(row[0])
                y_val = float(row[1])
            except ValueError:
                ErrorHandler.log_warning(
                    f"Could not parse CSV row {row_number}: {row}",
                    f"Parsing {filename}",
                )
                continue

            x_vals.append(x_val)
            y_vals.append(y_val)

        if len(x_vals) < 10:
            raise ValueError(
                f"Insufficient data points ({len(x_vals)}). Need at least 10 points."
            )

        return np.array(x_vals), np.array(y_vals)

    except Exception as e:
        # Parser boundary: every failure surfaces as ValueError, as before.
        raise ValueError(f"Failed to parse CSV spectrum: {str(e)}") from e
 
211
 
 
 
 
212
 
213
+ # /////////////////////////////////////////////////////
214
+
215
+
216
+ def parse_spectrum_data(
217
+ text_content: str, filename: str = "unknown", file_format: Optional[str] = None
218
+ ) -> Tuple[np.ndarray, np.ndarray]:
219
+ """
220
+ Parse spectrum data from text content with automatic format detection.
221
+ Args:
222
+ text_content: Raw text content of the spectrum file
223
+ filename: Name of the file for error reporting
224
+ file_format: Force specific format ('txt', 'csv', 'json') or None for auto-detection
225
+ Returns:
226
+ Tuple of (x_values, y_values) as numpy arrays
227
+ Raises:
228
+ ValueError: If the data cannot be parsed
229
+ """
230
+ try:
231
+ # Detect format if not specified
232
+ if file_format is None:
233
+ file_format = detect_file_format(filename, text_content)
234
+
235
+ # Parse based on detected/specified format
236
+ if file_format == "json":
237
+ x, y = parse_json_spectrum(text_content, filename)
238
+ elif file_format == "csv":
239
+ x, y = parse_csv_spectrum(text_content, filename)
240
+ else: # Default to TXT format
241
+ x, y = parse_txt_spectrum(text_content, filename)
242
+
243
+ # Common validation for all formats
244
+ validate_spectrum_data(x, y, filename)
245
 
246
  return x, y
247
 
 
249
  raise ValueError(f"Failed to parse spectrum data: {str(e)}")
250
 
251
 
252
+ # /////////////////////////////////////////////////////
253
+
254
+
255
def _looks_numeric(token: str) -> bool:
    """Return True if *token* is built only from number characters.

    Accepts plain and scientific notation candidates (digits, sign, dot,
    exponent marker) that contain at least one digit and do not start with
    the exponent marker. The final decision is left to ``float()``.
    """
    return (
        token[0] not in "eE"
        and all(ch in "0123456789+-.eE" for ch in token)
        and any(ch in "0123456789" for ch in token)
    )


def parse_txt_spectrum(
    content: str, filename: str = "unknown"
) -> Tuple[np.ndarray, np.ndarray]:
    """
    Parse spectrum data from TXT format.

    Blank lines and lines starting with ``#`` or ``%`` are ignored. On each
    remaining line commas are treated as whitespace, and the first two
    number-like tokens are read as x and y. Scientific notation such as
    ``1.5e-3`` is accepted. Lines with fewer than two number-like tokens
    (e.g. column headers) are skipped silently; lines whose number-like
    tokens fail to convert are skipped with a warning.

    Args:
        content: Text of the spectrum file.
        filename: Name of the source file, used in warning context.

    Returns:
        Tuple of (x_values, y_values) as numpy arrays.

    Raises:
        ValueError: If the file has no data lines or fewer than 10 points.
    """
    x_vals, y_vals = [], []
    found_data_line = False

    # Enumerate the real file lines so warnings report true line numbers.
    # A leading byte-order mark would otherwise hide the first number.
    for line_number, raw_line in enumerate(
        content.lstrip("\ufeff").splitlines(), start=1
    ):
        line = raw_line.strip()
        # ==Skip empty lines and comments==
        if not line or line.startswith(("#", "%")):
            continue
        found_data_line = True

        # Handle different separators
        parts = line.replace(",", " ").split()
        numbers = [p for p in parts if _looks_numeric(p)]
        if len(numbers) < 2:
            continue

        try:
            x_val = float(numbers[0])
            y_val = float(numbers[1])
        except ValueError:
            ErrorHandler.log_warning(
                f"Could not parse line {line_number}: {line}", f"Parsing {filename}"
            )
            continue

        x_vals.append(x_val)
        y_vals.append(y_val)

    if not found_data_line:
        raise ValueError("No data lines found in file")

    if len(x_vals) < 10:  # ==Need minimum points for interpolation==
        raise ValueError(
            f"Insufficient data points ({len(x_vals)}). Need at least 10 points."
        )

    return np.array(x_vals), np.array(y_vals)
306
+
307
+
308
+ # /////////////////////////////////////////////////////
309
+
310
+
311
def validate_spectrum_data(x: np.ndarray, y: np.ndarray, filename: str) -> None:
    """
    Validate parsed spectrum data for common issues.

    Checks, in order: the arrays are non-empty, neither contains NaN, and x
    is non-decreasing. If x is out of order, ``x`` and ``y`` are sorted
    IN PLACE by x so the caller's arrays really are sorted after the call,
    and a warning is logged. An unusual x range (below 0, above 10000, or a
    span under 100) only logs a warning.

    Args:
        x: X-axis values; reordered in place when not monotonic.
        y: Y-axis values; reordered in place together with ``x``.
        filename: Name of the source file, used in warning context.

    Raises:
        ValueError: If the data is empty or contains NaN values.
    """
    if len(x) == 0:
        raise ValueError("Input data is empty")

    # Check for NaNs
    if np.any(np.isnan(x)) or np.any(np.isnan(y)):
        raise ValueError("Input data contains NaN values")

    # Check monotonic increasing x (sort if needed)
    if not np.all(np.diff(x) >= 0):
        # Rebinding local names would leave the caller's arrays untouched,
        # so write the sorted values back through slice assignment. The
        # fancy-indexed right-hand sides are copies, which makes this safe.
        sort_idx = np.argsort(x, kind="stable")
        x[:] = x[sort_idx]
        y[:] = y[sort_idx]
        ErrorHandler.log_warning(
            "Wavenumbers were not monotonic - data has been sorted",
            f"Parsing {filename}",
        )

    # Check reasonable range for spectroscopy
    x_min = x.min()
    x_max = x.max()
    if x_min < 0 or x_max > 10000 or (x_max - x_min) < 100:
        ErrorHandler.log_warning(
            f"Unusual wavenumber range: {x_min:.1f} - {x_max:.1f} cm⁻¹",
            f"Parsing {filename}",
        )
336
+
337
+
338
+ # /////////////////////////////////////////////////////
339
+
340
+
341
  def process_single_file(
342
  filename: str,
343
  text_content: str,