devjas1 committed on
Commit
07fb119
·
1 Parent(s): 8475e7b

FEAT(data_pipeline): implement enhanced data pipeline for polymer ML aging with spectroscopy integration and synthetic data augmentation


Adds enhanced data pipeline for polymer ML aging

Implements an advanced data pipeline that integrates spectroscopy databases and synthetic data augmentation techniques for polymer machine learning aging studies.

The module supports multiple spectroscopy formats, provides quality-control tooling for incoming spectra, and adds methods for generating synthetic aging series that model progressive polymer degradation over time. A brief usage sketch follows below.
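A minimal usage sketch (hypothetical: it assumes the repository root is on sys.path so the module imports as modules.enhanced_data_pipeline, and the polymer type string and sample counts are illustrative; class and method names are taken from the diff below):

    from modules.enhanced_data_pipeline import EnhancedDataPipeline

    # Hypothetical end-to-end run; directories under data/ are created by the pipeline itself
    pipeline = EnhancedDataPipeline()

    # Connect to the configured local/API spectroscopy databases
    status = pipeline.connect_to_databases()
    print("Connections:", status)

    # Import matching spectra into the local SQLite store, then synthesize an aging dataset
    pipeline.search_and_import_spectra("polyethylene", max_per_database=25)
    generated = pipeline.generate_synthetic_aging_dataset("polyethylene", num_samples=50)

    # Inspect what is now in the local database
    print(generated, pipeline.get_database_statistics())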

Files changed (1)
  1. modules/enhanced_data_pipeline.py +1189 -0
modules/enhanced_data_pipeline.py ADDED
@@ -0,0 +1,1189 @@
1
+ """
2
+ Enhanced Data Pipeline for Polymer ML Aging
3
+ Integrates with spectroscopy databases, synthetic data augmentation, and quality control
4
+ """
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ from typing import Dict, List, Tuple, Optional, Union, Any
9
+ from dataclasses import dataclass, field
10
+ from pathlib import Path
11
+ import requests
12
+ import json
13
+ import sqlite3
14
+ from datetime import datetime
15
+ import hashlib
16
+ import warnings
17
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
18
+ from sklearn.decomposition import PCA
19
+ from sklearn.cluster import DBSCAN
20
+ import pickle
21
+ import io
22
+ import base64
23
+
24
+
25
+ @dataclass
26
+ class SpectralDatabase:
27
+ """Configuration for spectroscopy databases"""
28
+
29
+ name: str
30
+ base_url: Optional[str] = None
31
+ api_key: Optional[str] = None
32
+ description: str = ""
33
+ supported_formats: List[str] = field(default_factory=list)
34
+ access_method: str = "api" # "api", "download", "local"
35
+ local_path: Optional[Path] = None
36
+
37
+
38
+ # -///////////////////////////////////////////////////
39
+ @dataclass
40
+ class PolymerSample:
41
+ """Enhanced polymer sample information"""
42
+
43
+ sample_id: str
44
+ polymer_type: str
45
+ molecular_weight: Optional[float] = None
46
+ additives: List[str] = field(default_factory=list)
47
+ processing_conditions: Dict[str, Any] = field(default_factory=dict)
48
+ aging_condition: Dict[str, Any] = field(default_factory=dict)
49
+ aging_time: Optional[float] = None # Hours
50
+ degradation_level: Optional[float] = None # 0-1 Scale
51
+ spectral_data: Dict[str, np.ndarray] = field(default_factory=dict)
52
+ metadata: Dict[str, Any] = field(default_factory=dict)
53
+ quality_score: Optional[float] = None
54
+ validation_status: str = "pending" # pending, validated, rejected
55
+
56
+
57
+ # -///////////////////////////////////////////////////
58
+
59
+ # Database configurations
60
+ SPECTROSCOPY_DATABASES = {
61
+ "FTIR_PLASTICS": SpectralDatabase(
62
+ name="FTIR Plastics Database",
63
+ description="Comprehensive FTIR spectra of plastic materials",
64
+ supported_formats=["FTIR", "ATR-FTIR"],
65
+ access_method="local",
66
+ local_path=Path("data/databases/ftir_plastics"),
67
+ ),
68
+ "NIST_WEBBOOK": SpectralDatabase(
69
+ name="NIST Chemistry WebBook",
70
+ base_url="https://webbook.nist.gov/chemistry",
71
+ description="NIST spectroscopic database",
72
+ supported_formats=["FTIR", "Raman"],
73
+ access_method="api",
74
+ ),
75
+ "POLYMER_DATABASE": SpectralDatabase(
76
+ name="Polymer Spectroscopy Database",
77
+ description="Curated polymer degradation spectra",
78
+ supported_formats=["FTIR", "ATR-FTIR", "Raman"],
79
+ access_method="local",
80
+ local_path=Path("data/databases/polymer_degradation"),
81
+ ),
82
+ }
83
+
84
+ # -///////////////////////////////////////////////////
85
+
86
+
87
+ class DatabaseConnector:
88
+ """Connector for spectroscopy databases"""
89
+
90
+ def __init__(self, database_config: SpectralDatabase):
91
+ self.config = database_config
92
+ self.connection = None
93
+ self.cache_dir = Path("data/cache") / database_config.name.lower().replace(
94
+ " ", "_"
95
+ )
96
+ self.cache_dir.mkdir(parents=True, exist_ok=True)
97
+
98
+ def connect(self) -> bool:
99
+ """Establish connection to database"""
100
+ try:
101
+ if self.config.access_method == "local":
102
+ if self.config.local_path and self.config.local_path.exists():
103
+ return True
104
+ else:
105
+ print(f"Local database path not found: {self.config.local_path}")
106
+ return False
107
+
108
+ elif self.config.access_method == "api":
109
+ # Test API connection
110
+ if self.config.base_url:
111
+ response = requests.get(self.config.base_url, timeout=10)
112
+ return response.status_code == 200
113
+ return False
114
+
115
+ return True
116
+
117
+ except Exception as e:
118
+ print(f"Failed to connect to {self.config.name}: {e}")
119
+ return False
120
+
121
+ # -///////////////////////////////////////////////////
122
+ def search_by_polymer_type(self, polymer_type: str, limit: int = 100) -> List[Dict]:
123
+ """Search database for spectra by polymer type"""
124
+ cache_key = f"search{hashlib.md5(polymer_type.encode()).hexdigest()}"
125
+ cache_file = self.cache_dir / f"{cache_key}.json"
126
+
127
+ # Check cache first
128
+ if cache_file.exists():
129
+ with open(cache_file, "r") as f:
130
+ return json.load(f)
131
+
132
+ results = []
133
+
134
+ if self.config.access_method == "local":
135
+ results = self._search_local_database(polymer_type, limit)
136
+ elif self.config.access_method == "api":
137
+ results = self._search_api_database(polymer_type, limit)
138
+
139
+ # Cache results
140
+ if results:
141
+ with open(cache_file, "w") as f:
142
+ json.dump(results, f)
143
+
144
+ return results
145
+
146
+ # -///////////////////////////////////////////////////
147
+ def _search_local_database(self, polymer_type: str, limit: int) -> List[Dict]:
148
+ """Search local database files"""
149
+ results = []
150
+
151
+ if not self.config.local_path or not self.config.local_path.exists():
152
+ return results
153
+
154
+ # Look for CSV files with polymer data
155
+ for csv_file in self.config.local_path.glob("*.csv"):
156
+ try:
157
+ df = pd.read_csv(csv_file)
158
+
159
+ # Search for polymer type in columns
160
+ polymer_matches = df[
161
+ df.astype(str)
162
+ .apply(lambda x: x.str.contains(polymer_type, case=False))
163
+ .any(axis=1)
164
+ ]
165
+
166
+ for _, row in polymer_matches.head(limit).iterrows():
167
+ result = {
168
+ "source_file": str(csv_file),
169
+ "polymer_type": polymer_type,
170
+ "data": row.to_dict(),
171
+ }
172
+ results.append(result)
173
+
174
+ except Exception as e:
175
+ print(f"Error reading {csv_file}: {e}")
176
+ continue
177
+
178
+ return results
179
+
180
+ # -///////////////////////////////////////////////////
181
+ def _search_api_database(self, polymer_type: str, limit: int) -> List[Dict]:
182
+ """Search API-based database"""
183
+ results = []
184
+
185
+ try:
186
+ # TODO: Example API search (would need actual API endpoints)
187
+ search_params = {"query": polymer_type, "limit": limit, "format": "json"}
188
+
189
+ if self.config.api_key:
190
+ search_params["api_key"] = self.config.api_key
191
+
192
+ response = requests.get(
193
+ f"{self.config.base_url}/search", params=search_params, timeout=30
194
+ )
195
+
196
+ if response.status_code == 200:
197
+ results = response.json().get("results", [])
198
+
199
+ except Exception as e:
200
+ print(f"API search failed: {e}")
201
+
202
+ return results
203
+
204
+ # -///////////////////////////////////////////////////
205
+ def download_spectrum(self, spectrum_id: str) -> Optional[Dict]:
206
+ """Download specific spectrum data"""
207
+ cache_file = self.cache_dir / f"spectrum_{spectrum_id}.json"
208
+
209
+ # Check cache
210
+ if cache_file.exists():
211
+ with open(cache_file, "r") as f:
212
+ return json.load(f)
213
+
214
+ spectrum_data = None
215
+
216
+ if self.config.access_method == "api":
217
+ try:
218
+ url = f"{self.config.base_url}/spectrum/{spectrum_id}"
219
+ response = requests.get(url, timeout=30)
220
+
221
+ if response.status_code == 200:
222
+ spectrum_data = response.json()
223
+
224
+ except Exception as e:
225
+ print(f"Failed to download spectrum {spectrum_id}: {e}")
226
+
227
+ # Cache results if successful
228
+ if spectrum_data:
229
+ with open(cache_file, "w") as f:
230
+ json.dump(spectrum_data, f)
231
+
232
+ return spectrum_data
233
+
234
+
235
+ # -///////////////////////////////////////////////////
236
+ class SyntheticDataAugmentation:
237
+ """Advanced synthetic data augmentation for spectroscopy"""
238
+
239
+ def __init__(self):
240
+ self.augmentation_methods = [
241
+ "noise_addition",
242
+ "baseline_drift",
243
+ "intensity_scaling",
244
+ "wavenumber_shift",
245
+ "peak_broadening",
246
+ "atmospheric_effects",
247
+ "instrumental_response",
248
+ "sample_variations",
249
+ ]
250
+
251
+ def augment_spectrum(
252
+ self,
253
+ wavenumbers: np.ndarray,
254
+ intensities: np.ndarray,
255
+ method: str = "random",
256
+ num_variations: int = 5,
257
+ intensity_factor: float = 0.1,
258
+ ) -> List[Tuple[np.ndarray, np.ndarray]]:
259
+ """
260
+ Generate augmented versions of a spectrum
261
+
262
+ Args:
263
+ wavenumbers: Original wavenumber array
264
+ intensities: Original intensity array
265
+ method: Augmentation method, or 'random' to select one at random
+ num_variations: Number of variations to generate
+ intensity_factor: Factor controlling augmentation intensity
268
+
269
+ Returns:
270
+ List of (wavenumbers, intensities) tuples
271
+ """
272
+ augmented_spectra = []
273
+
274
+ for _ in range(num_variations):
275
+ if method == "random":
276
+ chosen_method = np.random.choice(self.augmentation_methods)
277
+ else:
278
+ chosen_method = method
279
+
280
+ aug_wavenumbers, aug_intensities = self._apply_augmentation(
281
+ wavenumbers, intensities, chosen_method, intensity_factor
282
+ )
283
+
284
+ augmented_spectra.append((aug_wavenumbers, aug_intensities))
285
+
286
+ return augmented_spectra
287
+
288
+ # -///////////////////////////////////////////////////
289
+ def _apply_augmentation(
290
+ self,
291
+ wavenumbers: np.ndarray,
292
+ intensities: np.ndarray,
293
+ method: str,
294
+ intensity: float,
295
+ ) -> Tuple[np.ndarray, np.ndarray]:
296
+ """Apply specific augmentation method"""
297
+
298
+ aug_wavenumbers = wavenumbers.copy()
299
+ aug_intensities = intensities.copy()
300
+
301
+ if method == "noise_addition":
302
+ # Add random noise
303
+ noise_level = intensity * np.std(intensities)
304
+ noise = np.random.normal(0, noise_level, len(intensities))
305
+ aug_intensities += noise
306
+
307
+ elif method == "baseline_drift":
308
+ # Add baseline drift
309
+ drift_amplitude = intensity * np.mean(np.abs(intensities))
310
+ drift = drift_amplitude * np.sin(
311
+ 2 * np.pi * np.linspace(0, 2, len(intensities))
312
+ )
313
+ aug_intensities += drift
314
+
315
+ elif method == "intensity_scaling":
316
+ # Scale intensity uniformly
317
+ scale_factor = 1.0 + intensity * (2 * np.random.random() - 1)
318
+ aug_intensities *= scale_factor
319
+
320
+ elif method == "wavenumber_shift":
321
+ # Shift wavenumber axis
322
+ shift_range = intensity * 10 # cm-1
323
+ shift = shift_range * (2 * np.random.random() - 1)
324
+ aug_wavenumbers += shift
325
+
326
+ elif method == "peak_broadening":
327
+ # Broaden peaks using convolution
328
+ from scipy import signal
329
+
330
+ sigma = intensity * 2 # Broadening factor
331
+ kernel_size = int(sigma * 6) + 1
332
+ if kernel_size % 2 == 0:
333
+ kernel_size += 1
334
+
335
+ if kernel_size >= 3:
336
+ from scipy.signal.windows import gaussian
337
+
338
+ kernel = gaussian(kernel_size, sigma)
339
+ kernel = kernel / np.sum(kernel)
340
+ aug_intensities = signal.convolve(
341
+ aug_intensities, kernel, mode="same"
342
+ )
343
+
344
+ elif method == "atmospheric_effects":
345
+ # Simulate atmospheric absorption
346
+ co2_region = (wavenumbers >= 2320) & (wavenumbers <= 2380)
347
+ h2o_region = (wavenumbers >= 3200) & (wavenumbers <= 3800)
348
+
349
+ if np.any(co2_region):
350
+ aug_intensities[co2_region] *= 1 - intensity * 0.1
351
+ if np.any(h2o_region):
352
+ aug_intensities[h2o_region] *= 1 - intensity * 0.05
353
+
354
+ elif method == "instrumental_response":
355
+ # Simulate instrumental response variations
356
+ # Add slight frequency-dependent response
357
+ response_curve = 1.0 + intensity * 0.1 * np.sin(
358
+ 2
359
+ * np.pi
360
+ * (wavenumbers - wavenumbers.min())
361
+ / (wavenumbers.max() - wavenumbers.min())
362
+ )
363
+ aug_intensities *= response_curve
364
+
365
+ elif method == "sample_variations":
366
+ # Simulate sample-to-sample variations
367
+ # Random peak intensity variations
368
+ num_peaks = min(5, len(intensities) // 100)
369
+ for _ in range(num_peaks):
370
+ peak_center = np.random.randint(0, len(intensities))
371
+ peak_width = np.random.randint(5, 20)
372
+ peak_variation = intensity * (2 * np.random.random() - 1)
373
+
374
+ start_idx = max(0, peak_center - peak_width)
375
+ end_idx = min(len(intensities), peak_center + peak_width)
376
+
377
+ aug_intensities[start_idx:end_idx] *= 1 + peak_variation
378
+
379
+ return aug_wavenumbers, aug_intensities
380
+
381
+ # -///////////////////////////////////////////////////
382
+ def generate_synthetic_aging_series(
383
+ self,
384
+ base_spectrum: Tuple[np.ndarray, np.ndarray],
385
+ num_time_points: int = 10,
386
+ max_degradation: float = 0.8,
387
+ ) -> List[Dict]:
388
+ """
389
+ Generate synthetic aging series showing progressive degradation
390
+
391
+ Args:
392
+ base_spectrum: (wavenumbers, intensities) for fresh sample
393
+ num_time_points: Number of time points in series
394
+ max_degradation: Maximum degradation level (0-1)
395
+
396
+ Returns:
397
+ List of aging data points
398
+ """
399
+ wavenumbers, intensities = base_spectrum
400
+ aging_series = []
401
+
402
+ # Define degradation-related spectral changes
403
+ degradation_features = {
404
+ "carbonyl_growth": {
405
+ "region": (1700, 1750), # C=0 stretch
406
+ "intensity_change": 2.0, # Factor increase
407
+ },
408
+ "oh_growth": {
409
+ "region": (3200, 3600), # OH stretch
410
+ "intensity_change": 1.5,
411
+ },
412
+ "ch_decrease": {
413
+ "region": (2800, 3000), # CH stretch
414
+ "intensity_change": 0.7, # Factor decrease
415
+ },
416
+ "crystrallinity_change": {
417
+ "region": (1000, 1200), # Various polymer backbone changes
418
+ "intensity_change": 0.9,
419
+ },
420
+ }
421
+
422
+ for i in range(num_time_points):
423
+ degradation_level = (i / (num_time_points - 1)) * max_degradation
424
+ aging_time = i * 100 # hours (arbitrary scale)
425
+
426
+ # Start with base spectrum
427
+ aged_intensities = intensities.copy()
428
+
429
+ # Apply degradation-related changes
430
+ for feature, params in degradation_features.items():
431
+ region_mask = (wavenumbers >= params["region"][0]) & (
432
+ wavenumbers <= params["region"][1]
433
+ )
434
+ if np.any(region_mask):
435
+ change_factor = 1.0 + degradation_level * (
436
+ params["intensity_change"] - 1.0
437
+ )
438
+ aged_intensities[region_mask] *= change_factor
439
+
440
+ # Add some random variations
441
+ aug_wavenumbers, aug_intensities = self._apply_augmentation(
442
+ wavenumbers, aged_intensities, "noise_addition", 0.02
443
+ )
444
+
445
+ aging_point = {
446
+ "aging_time": aging_time,
447
+ "degradation_level": degradation_level,
448
+ "wavenumbers": aug_wavenumbers,
449
+ "intensities": aug_intensities,
450
+ "spectral_changes": {
451
+ feature: degradation_level * (params["intensity_change"] - 1.0)
452
+ for feature, params in degradation_features.items()
453
+ },
454
+ }
455
+
456
+ aging_series.append(aging_point)
457
+
458
+ return aging_series
459
+
460
+
461
+ # -///////////////////////////////////////////////////
462
+ class DataQualityController:
463
+ """Advanced data quality assessment and validation"""
464
+
465
+ def __init__(self):
466
+ self.quality_metrics = [
467
+ "signal_to_noise_ratio",
468
+ "baseline_stability",
469
+ "peak_resolution",
470
+ "spectral_range_coverage",
471
+ "instrumental_artifacts",
472
+ "data_completeness",
473
+ "metadata_completeness",
474
+ ]
475
+
476
+ self.validation_rules = {
477
+ "min_str": 10.0,
478
+ "max_baseline_variation": 0.1,
479
+ "min_peak_count": 3,
480
+ "min_spectral_range": 1000.0, # cm-1
481
+ "max_missing_points": 0.05, # 5% max missing data
482
+ }
483
+
484
+ def assess_spectrum_quality(
485
+ self,
486
+ wavenumbers: np.ndarray,
487
+ intensities: np.ndarray,
488
+ metadata: Optional[Dict] = None,
489
+ ) -> Dict[str, Any]:
490
+ """
491
+ Comprehensive quality assessment of spectral data
492
+
493
+ Args:
494
+ wavenumbers: Array of wavenumbers
495
+ intensities: Array of intensities
496
+ metadata: Optional metadata dictionary
497
+
498
+ Returns:
499
+ Quality assessment results
500
+ """
501
+ assessment = {
502
+ "overall_score": 0.0,
503
+ "individual_scores": {},
504
+ "issues_found": [],
505
+ "recommendations": [], # Ensure this is initialized as a list
506
+ "validation_status": "pending",
507
+ }
508
+
509
+ # Signal-to-noise
510
+ snr_score, snr_value = self._assess_snr(intensities)
511
+ assessment["individual_scores"]["snr"] = snr_score
512
+ assessment["recommendations"] = snr_value
513
+
514
+ if snr_value < self.validation_rules["min_snr"]:
515
+ assessment["issues_found"].append(
516
+ f"Low SNR: {snr_value:.1f} (min: {self.validation_rules['min_snr']})"
517
+ )
518
+ assessment["recommendations"].append(
519
+ "Consider noise reduction preprocessing"
520
+ )
521
+
522
+ # Baseline stability
523
+ baseline_score, baseline_variation = self._assess_baseline_stability(
524
+ intensities
525
+ )
526
+ assessment["individual_scores"]["baseline"] = baseline_score
527
+ assessment["baseline_variation"] = baseline_variation
528
+
529
+ if baseline_variation > self.validation_rules["max_baseline_variation"]:
530
+ assessment["issues_found"].append(
531
+ f"Unstable baseline: {baseline_variation:.3f}"
532
+ )
533
+ assessment["recommendations"].append("Apply baseline correction")
534
+
535
+ # Peak resolution and count
536
+ peak_score, peak_count = self._assess_peak_resolution(wavenumbers, intensities)
537
+ assessment["individual_scores"]["peaks"] = peak_score
538
+ assessment["peak_count"] = peak_count
539
+
540
+ if peak_count < self.validation_rules["min_peak_count"]:
541
+ assessment["issues_found"].append(f"Few peaks detected: {peak_count}")
542
+ assessment["recommendations"].append(
543
+ "Check sample quality or measurement conditions"
544
+ )
545
+
546
+ # Spectral range coverage
547
+ range_score, spectral_range = self._assess_spectral_range(wavenumbers)
548
+ assessment["individual_scores"]["range"] = range_score
549
+ assessment["spectral_range"] = spectral_range
550
+
551
+ if spectral_range < self.validation_rules["min_spectral_range"]:
552
+ assessment["issues_found"].append(
553
+ f"Limited spectral range: {spectral_range:.0f} cm-1"
554
+ )
555
+
556
+ # Data completeness
557
+ completeness_score, missing_fraction = self._assess_data_completeness(
558
+ intensities
559
+ )
560
+ assessment["individual_scores"]["completeness"] = completeness_score
561
+ assessment["missing_fraction"] = missing_fraction
562
+
563
+ if missing_fraction > self.validation_rules["max_missing_points"]:
564
+ assessment["issues_found"].append(
565
+ f"Missing data points: {missing_fraction:.1f}%"
566
+ )
567
+ assessment["recommendations"].append(
568
+ "Interpolate missing points or re-measure"
569
+ )
570
+
571
+ # Instrumental artifacts
572
+ artifact_score, artifacts = self._detect_instrumental_artifacts(
573
+ wavenumbers, intensities
574
+ )
575
+ assessment["individual_scores"]["artifacts"] = artifact_score
576
+ assessment["artifacts_detected"] = artifacts
577
+
578
+ if artifacts:
579
+ assessment["issues_found"].extend(
580
+ [f"Artifact detected {artifact}" for artifact in artifacts]
581
+ )
582
+ assessment["recommendations"].append("Apply artifact correction")
583
+
584
+ # Metadata completeness
585
+ metadata_score = self._assess_metadata_completeness(metadata)
586
+ assessment["individual_scores"]["metadata"] = metadata_score
587
+
588
+ # Calculate overall score
589
+ scores = list(assessment["individual_scores"].values())
590
+ assessment["overall_score"] = np.mean(scores) if scores else 0.0
591
+
592
+ # Determine validation status
593
+ if assessment["overall_score"] >= 0.8 and len(assessment["issues_found"]) == 0:
594
+ assessment["validation_status"] = "validated"
595
+ elif assessment["overall_score"] >= 0.6:
596
+ assessment["validation_status"] = "conditional"
597
+ else:
598
+ assessment["validation_status"] = "rejected"
599
+
600
+ return assessment
601
+
602
+ # -///////////////////////////////////////////////////
603
+ def _assess_snr(self, intensities: np.ndarray) -> Tuple[float, float]:
604
+ """Assess signal-to-noise ratio"""
605
+ try:
606
+ # Estimate noise from high-frequency components
607
+ diff_signal = np.diff(intensities)
608
+ noise_std = np.std(diff_signal)
609
+ signal_power = np.var(intensities)
610
+
611
+ snr = np.sqrt(signal_power) / noise_std if noise_std > 0 else float("inf")
612
+
613
+ # Score based on SNR values
614
+ score = min(
615
+ 1.0, max(0.0, (np.log10(snr) - 1) / 2)
616
+ ) # Log scale, 10-1000 range
617
+
618
+ return score, snr
619
+ except Exception:
620
+ return 0.5, 1.0
621
+
622
+ # -///////////////////////////////////////////////////
623
+ def _assess_baseline_stability(
624
+ self, intensities: np.ndarray
625
+ ) -> Tuple[float, float]:
626
+ """Assess baseline stability"""
627
+ try:
628
+ # Estimate baseline from endpoints and low-frequency components
629
+ baseline_points = np.concatenate([intensities[:10], intensities[-10:]])
630
+ baseline_variation = np.std(baseline_points) / np.mean(abs(intensities))
631
+
632
+ score = max(0.0, 1.0 - baseline_variation * 10) # Penalty for variation
633
+
634
+ return score, baseline_variation
635
+
636
+ except Exception:
637
+ return 0.5, 1.0
638
+
639
+ # -///////////////////////////////////////////////////
640
+ def _assess_peak_resolution(
641
+ self, wavenumbers: np.ndarray, intensities: np.ndarray
642
+ ) -> Tuple[float, int]:
643
+ """Assess peak resolution and count"""
644
+ try:
645
+ from scipy.signal import find_peaks
646
+
647
+ # Find peaks with minimum prominence
648
+ prominence_threshold = 0.1 * np.std(intensities)
649
+ peaks, properties = find_peaks(
650
+ intensities, prominence=prominence_threshold, distance=5
651
+ )
652
+
653
+ peak_count = len(peaks)
654
+
655
+ # Score based on peak count and prominence
656
+ if peak_count > 0:
657
+ avg_prominence = np.mean(properties["prominences"])
658
+ prominence_score = min(
659
+ 1.0, avg_prominence / (0.2 * np.std(intensities))
660
+ )
661
+ count_score = min(1.0, peak_count / 10) # Normalize to ~10 peaks
662
+ score = 0.5 * prominence_score + 0.5 * count_score
663
+ else:
664
+ score = 0.0
665
+
666
+ return score, peak_count
667
+
668
+ except Exception:
669
+ return 0.5, 0
670
+
671
+ # -///////////////////////////////////////////////////
672
+ def _assess_spectral_range(self, wavenumbers: np.ndarray) -> Tuple[float, float]:
673
+ """Assess spectral range coverage"""
674
+ try:
675
+ spectral_range = wavenumbers.max() - wavenumbers.min()
676
+
677
+ # Score based on typical FTIR range (4000 cm-1)
678
+ score = min(1.0, spectral_range / 4000)
679
+
680
+ return score, spectral_range
681
+
682
+ except Exception:
683
+ return 0.5, 1000
684
+
685
+ # -///////////////////////////////////////////////////
686
+ def _assess_data_completeness(self, intensities: np.ndarray) -> Tuple[float, float]:
687
+ """Assess data completion"""
688
+ try:
689
+ # Check for NaN, or zero values
690
+ invalid_mask = (
691
+ np.isnan(intensities) | np.isinf(intensities) | (intensities == 0)
692
+ )
693
+ missing_fraction = np.sum(invalid_mask) / len(intensities)
694
+
695
+ score = max(
696
+ 0.0, 1.0 - missing_fraction * 10
697
+ ) # Heavy penalty for missing data
698
+
699
+ return score, missing_fraction
700
+ except Exception:
701
+ return 0.5, 0.0
702
+
703
+ # -///////////////////////////////////////////////////
704
+ def _detect_instrumental_artifacts(
705
+ self, wavenumbers: np.ndarray, intensities: np.ndarray
706
+ ) -> Tuple[float, List[str]]:
707
+ """Detect common instrumental artifacts"""
708
+ artifacts = []
709
+
710
+ try:
711
+ # Check for spike artifacts (cosmic rays, electrical interference)
712
+ diff_threshold = 5 * np.std(np.diff(intensities))
713
+ spikes = np.where(np.abs(np.diff(intensities)) > diff_threshold)[0]
714
+
715
+ if len(spikes) > len(intensities) * 0.01: # More than 1% spikes
716
+ artifacts.append("excessive_spikes")
717
+
718
+ # Check for saturation (flat regions at max/min)
719
+ if np.std(intensities) > 0:
720
+ max_val = np.max(intensities)
721
+ min_val = np.min(intensities)
722
+
723
+ saturation_high = np.sum(intensities >= 0.99 * max_val) / len(
724
+ intensities
725
+ )
726
+ saturation_low = np.sum(intensities <= 1.01 * min_val) / len(
727
+ intensities
728
+ )
729
+
730
+ if saturation_high > 0.05:
731
+ artifacts.append("high_saturation")
732
+ if saturation_low > 0.05:
733
+ artifacts.append("low_saturation")
734
+
735
+ # Check for periodic noise (electrical interference)
736
+ fft = np.fft.fft(intensities - np.mean(intensities))
737
+ freq_domain = np.abs(fft[: len(fft) // 2])
738
+
739
+ # Look for strong periodic components
740
+ if len(freq_domain) > 10:
741
+ mean_amplitude = np.mean(freq_domain)
742
+ strong_frequencies = np.sum(freq_domain > 3 * mean_amplitude)
743
+
744
+ if strong_frequencies > len(freq_domain) * 0.1:
745
+ artifacts.append("periodic_noise")
746
+
747
+ # Score inversely related to number of artifacts
748
+ score = max(0.0, 1.0 - len(artifacts) * 0.3)
749
+
750
+ return score, artifacts
751
+
752
+ except Exception:
753
+ return 0.5, []
754
+
755
+ # -///////////////////////////////////////////////////
756
+ def _assess_metadata_completeness(self, metadata: Optional[Dict]) -> float:
757
+ """Assess completeness of metadata"""
758
+ if metadata is None:
759
+ return 0.0
760
+
761
+ required_fields = [
762
+ "sample_id",
763
+ "measurement_date",
764
+ "instrument_type",
765
+ "resolution",
766
+ "number_of_scans",
767
+ "sample_type",
768
+ ]
769
+
770
+ present_fields = sum(
771
+ 1
772
+ for field in required_fields
773
+ if field in metadata and metadata[field] is not None
774
+ )
775
+ score = present_fields / len(required_fields)
776
+
777
+ return score
778
+
779
+
780
+ # -///////////////////////////////////////////////////
781
+ class EnhancedDataPipeline:
782
+ """Complete enhanced data pipeline integrating all components"""
783
+
784
+ def __init__(self):
785
+ self.database_connector = {}
786
+ self.augmentation_engine = SyntheticDataAugmentation()
787
+ self.quality_controller = DataQualityController()
788
+ self.local_database_path = Path("data/enhanced_data")
789
+ self.local_database_path.mkdir(parents=True, exist_ok=True)
790
+ self._init_local_database()
791
+
792
+ def _init_local_database(self):
793
+ """Initialize local SQLite database"""
794
+ db_path = self.local_database_path / "polymer_spectra.db"
795
+
796
+ with sqlite3.connect(db_path) as conn:
797
+ cursor = conn.cursor()
798
+
799
+ # Create main spectra table
800
+ cursor.execute(
801
+ """
802
+ CREATE TABLE IF NOT EXISTS spectra (
803
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
804
+ sample_id TEXT UNIQUE NOT NULL,
805
+ polymer_type TEXT NOT NULL,
806
+ technique TEXT NOT NULL,
807
+ wavenumbers BLOB,
808
+ intensities BLOB,
809
+ metadata TEXT,
810
+ quality_score REAL,
811
+ validation_status TEXT,
812
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
813
+ source_database TEXT
814
+ )
815
+ """
816
+ )
817
+
818
+ # Create aging data table
819
+ cursor.execute(
820
+ """
821
+ CREATE TABLE IF NOT EXISTS aging_data (
822
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
823
+ sample_id TEXT,
824
+ aging_time REAL,
825
+ degradation_level REAL,
+ aging_conditions TEXT,
+ spectral_changes TEXT,
827
+ created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
828
+ FOREIGN KEY (sample_id) REFERENCES spectra (sample_id)
829
+ )
830
+ """
831
+ )
832
+
833
+ conn.commit()
834
+
835
+ # -///////////////////////////////////////////////////
836
+ def connect_to_databases(self) -> Dict[str, bool]:
837
+ """Connect to all configured databases"""
838
+ connection_status = {}
839
+
840
+ for db_name, db_config in SPECTROSCOPY_DATABASES.items():
841
+ connector = DatabaseConnector(db_config)
+ connected = connector.connect()
+ connection_status[db_name] = connected
+ if connected:
+ self.database_connector[db_name] = connector
843
+
844
+ return connection_status
845
+
846
+ # -///////////////////////////////////////////////////
847
+ def search_and_import_spectra(
848
+ self, polymer_type: str, max_per_database: int = 50
849
+ ) -> Dict[str, int]:
850
+ """Search and import spectra from all connected databases"""
851
+ import_counts = {}
852
+
853
+ for db_name, connector in self.database_connector.items():
854
+ try:
855
+ search_results = connector.search_by_polymer_type(
856
+ polymer_type, max_per_database
857
+ )
858
+ imported_count = 0
859
+
860
+ for result in search_results:
861
+ if self._import_spectrum_to_local(result, db_name):
862
+ imported_count += 1
863
+
864
+ import_counts[db_name] = imported_count
865
+
866
+ except Exception as e:
867
+ print(f"Error importing from {db_name}: {e}")
868
+ import_counts[db_name] = 0
869
+
870
+ return import_counts
871
+
872
+ # -///////////////////////////////////////////////////
873
+ def _import_spectrum_to_local(self, spectrum_data: Dict, source_db: str) -> bool:
874
+ """Import spectrum data to local database"""
875
+ try:
876
+ # Extract or generate sample ID
877
+ sample_id = spectrum_data.get(
878
+ "sample_id", f"{source_db}_{hash(str(spectrum_data))}"
879
+ )
880
+
881
+ # Convert spectrum data format
882
+ if "wavenumbers" in spectrum_data and "intensities" in spectrum_data:
883
+ wavenumbers = np.array(spectrum_data["wavenumbers"])
884
+ intensities = np.array(spectrum_data["intensities"])
885
+ else:
886
+ # Try to extract from other formats
887
+ return False
888
+
889
+ # Quality assessment
890
+ metadata = spectrum_data.get("metadata", {})
891
+ quality_assessment = self.quality_controller.assess_spectrum_quality(
892
+ wavenumbers, intensities, metadata
893
+ )
894
+
895
+ # Only import if quality is acceptable
896
+ if quality_assessment["validation_status"] == "rejected":
897
+ return False
898
+
899
+ # Serialize arrays
900
+ wavenumbers_blob = pickle.dumps(wavenumbers)
901
+ intensities_blob = pickle.dumps(intensities)
902
+ metadata_json = json.dumps(metadata)
903
+
904
+ # Insert into database
905
+ db_path = self.local_database_path / "polymer_spectra.db"
906
+
907
+ with sqlite3.connect(db_path) as conn:
908
+ cursor = conn.cursor()
909
+
910
+ cursor.execute(
911
+ """
912
+ INSERT OR REPLACE INTO spectra(
913
+ sample_id, polymer_type, technique,
914
+ wavenumbers, intensities, metadata,
915
+ quality_score, validation_status,
916
+ source_database)
917
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
918
+ """,
919
+ (
920
+ sample_id,
921
+ spectrum_data.get("polymer_type", "unknown"),
922
+ spectrum_data.get("technique", "FTIR"),
923
+ wavenumbers_blob,
924
+ intensities_blob,
925
+ metadata_json,
926
+ quality_assessment["overall_score"],
927
+ quality_assessment["validation_status"],
928
+ source_db,
929
+ ),
930
+ )
931
+
932
+ conn.commit()
933
+
934
+ return True
935
+
936
+ except Exception as e:
937
+ print(f"Error importing spectrum: {e}")
938
+ return False
939
+
940
+ # -///////////////////////////////////////////////////
941
+ def generate_synthetic_aging_dataset(
942
+ self,
943
+ base_polymer_type: str,
944
+ num_samples: int = 50,
945
+ aging_conditions: Optional[List[Dict]] = None,
946
+ ) -> int:
947
+ """
948
+ Generate synthetic aging dataset for training
949
+
950
+ Args:
951
+ base_polymer_type: Base polymer type to use
952
+ num_samples: Number of synthetic samples to generate
953
+ aging_conditions: List of aging condition dictionaries
954
+
955
+ Returns:
956
+ Number of samples generated
957
+ """
958
+ if aging_conditions is None:
959
+ aging_conditions = [
960
+ {"temperature": 60, "humidity": 75, "uv_exposure": True},
961
+ {"temperature": 80, "humidity": 85, "uv_exposure": True},
962
+ {"temperature": 40, "humidity": 95, "uv_exposure": False},
963
+ {"temperature": 100, "humidity": 50, "uv_exposure": True},
964
+ ]
965
+
966
+ # Get base spectra from database
967
+ base_spectra = self.spectra_by_type(base_polymer_type, limit=10)
968
+
969
+ if not base_spectra:
970
+ print(f"No base spectra found for {base_polymer_type}")
971
+ return 0
972
+
973
+ generated_count = 0
974
+
975
+ for base_spectrum in base_spectra:
+ wavenumbers = pickle.loads(base_spectrum["wavenumbers"])
+ intensities = pickle.loads(base_spectrum["intensities"])
+
+ # Generate an aging series for each aging condition
+ for condition in aging_conditions:
+ aging_series = self.augmentation_engine.generate_synthetic_aging_series(
+ (wavenumbers, intensities),
+ num_time_points=max(
+ 2, min(10, num_samples // len(aging_conditions) // len(base_spectra))
+ ),
+ )
+
+ for aging_point in aging_series:
+ synthetic_id = f"synthetic_{base_polymer_type}_{generated_count}"
+
+ metadata = {
+ "synthetic": True,
+ "aging_conditions": condition,
1000
+ "aging_time": aging_point["aging_time"],
1001
+ "degradation_level": aging_point["degradation_level"],
1002
+ }
1003
+
1004
+ # Store synthetic spectrum
1005
+ if self._store_synthetic_spectrum(
1006
+ synthetic_id, base_polymer_type, aging_point, metadata
1007
+ ):
1008
+ generated_count += 1
1009
+
1010
+ return generated_count
1011
+
1012
+ def _store_synthetic_spectrum(
1013
+ self, sample_id: str, polymer_type: str, aging_point: Dict, metadata: Dict
1014
+ ) -> bool:
1015
+ """Store synthetic spectrum in local database"""
1016
+ try:
1017
+ quality_assessment = self.quality_controller.assess_spectrum_quality(
1018
+ aging_point["wavenumbers"], aging_point["intensities"], metadata
1019
+ )
1020
+
1021
+ # Serialize data
1022
+ wavenumbers_blob = pickle.dumps(aging_point["wavenumbers"])
1023
+ intensities_blob = pickle.dumps(aging_point["intensities"])
1024
+ metadata_json = json.dumps(metadata)
1025
+
1026
+ # Insert spectrum
1027
+ db_path = self.local_database_path / "polymer_spectra.db"
1028
+
1029
+ with sqlite3.connect(db_path) as conn:
1030
+ cursor = conn.cursor()
1031
+
1032
+ cursor.execute(
1033
+ """
1034
+ INSERT INTO spectra
1035
+ (sample_id, polymer_type, technique, wavenumbers, intensities,
1036
+ metadata, quality_score, validation_status, source_database)
1037
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
1038
+ """,
1039
+ (
1040
+ sample_id,
1041
+ polymer_type,
1042
+ "FTIR_synthetic",
1043
+ wavenumbers_blob,
1044
+ intensities_blob,
1045
+ metadata_json,
1046
+ quality_assessment["overall_score"],
1047
+ "validated", # Synthetic data is pre-validated
1048
+ "synthetic",
1049
+ ),
1050
+ )
1051
+
1052
+ # Insert aging data
1053
+ cursor.execute(
1054
+ """
1055
+ INSERT INTO aging_data
1056
+ (sample_id, aging_time, degradation_level, aging_conditions, spectral_changes)
1057
+ VALUES (?, ?, ?, ?, ?)
1058
+ """,
1059
+ (
1060
+ sample_id,
1061
+ aging_point["aging_time"],
1062
+ aging_point["degradation_level"],
1063
+ json.dumps(metadata["aging_conditions"]),
1064
+ json.dumps(aging_point.get("spectral_changes", {})),
1065
+ ),
1066
+ )
1067
+
1068
+ conn.commit()
1069
+
1070
+ return True
1071
+
1072
+ except Exception as e:
1073
+ print(f"Error storing synthetic spectrum: {e}")
1074
+ return False
1075
+
1076
+ # -///////////////////////////////////////////////////
1077
+ def spectra_by_type(self, polymer_type: str, limit: int = 100) -> List[Dict]:
1078
+ """Retrieve spectra by polymer type from local database"""
1079
+ db_path = self.local_database_path / "polymer_spectra.db"
1080
+
1081
+ with sqlite3.connect(db_path) as conn:
1082
+ cursor = conn.cursor()
1083
+
1084
+ cursor.execute(
1085
+ """
1086
+ SELECT * FROM spectra
1087
+ WHERE polymer_type LIKE ? AND validation_status != 'rejected'
1088
+ ORDER BY quality_score DESC
1089
+ LIMIT ?
1090
+ """,
1091
+ (f"%{polymer_type}%", limit),
1092
+ )
1093
+
1094
+ columns = [description[0] for description in cursor.description]
1095
+ results = [dict(zip(columns, row)) for row in cursor.fetchall()]
1096
+
1097
+ return results
1098
+
1099
+ # -///////////////////////////////////////////////////
1100
+ def get_weathered_samples(self, polymer_type: Optional[str] = None) -> List[Dict]:
1101
+ """Get samples with aging/weathering data"""
1102
+ db_path = self.local_database_path / "polymer_spectra.db"
1103
+
1104
+ with sqlite3.connect(db_path) as conn:
1105
+ cursor = conn.cursor()
1106
+
1107
+ query = """
1108
+ SELECT s.*, a.aging_time, a.degradation_level, a.aging_conditions
1109
+ FROM spectra s
1110
+ JOIN aging_data a ON s.sample_id = a.sample_id
1111
+ WHERE s.validation_status != 'rejected'
1112
+ """
1113
+ params = []
1114
+
1115
+ if polymer_type:
1116
+ query += " AND s.polymer_type LIKE ?"
1117
+ params.append(f"%{polymer_type}%")
1118
+
1119
+ query += " ORDER BY a.degradation_level"
1120
+
1121
+ cursor.execute(query, params)
1122
+
1123
+ columns = [description[0] for description in cursor.description]
1124
+ results = [dict(zip(columns, row)) for row in cursor.fetchall()]
1125
+
1126
+ return results
1127
+
1128
+ # -////////////////////////////////
1129
+ def get_database_statistics(self) -> Dict[str, Any]:
1130
+ """Get statistics about the local database"""
1131
+ db_path = self.local_database_path / "polymer_spectra.db"
1132
+
1133
+ with sqlite3.connect(db_path) as conn:
1134
+ cursor = conn.cursor()
1135
+
1136
+ # Total spectra count
1137
+ cursor.execute("SELECT COUNT(*) FROM spectra")
1138
+ total_spectra = cursor.fetchone()[0]
1139
+
1140
+ # By polymer type
1141
+ cursor.execute(
1142
+ """
1143
+ SELECT polymer_type, COUNT(*) as count
1144
+ FROM spectra
1145
+ GROUP BY polymer_type
1146
+ ORDER BY count DESC
1147
+ """
1148
+ )
1149
+ by_polymer_type = dict(cursor.fetchall())
1150
+
1151
+ # By technique
1152
+ cursor.execute(
1153
+ """
1154
+ SELECT technique, COUNT(*) as count
1155
+ FROM spectra
1156
+ GROUP BY technique
1157
+ ORDER BY count DESC
1158
+ """
1159
+ )
1160
+ by_technique = dict(cursor.fetchall())
1161
+
1162
+ # By validation status
1163
+ cursor.execute(
1164
+ """
1165
+ SELECT validation_status, COUNT(*) as count
1166
+ FROM spectra
1167
+ GROUP BY validation_status
1168
+ """
1169
+ )
1170
+ by_validation = dict(cursor.fetchall())
1171
+
1172
+ # Average quality score
1173
+ cursor.execute(
1174
+ "SELECT AVG(quality_score) FROM spectra WHERE quality_score IS NOT NULL"
1175
+ )
1176
+ avg_quality = cursor.fetchone()[0] or 0.0
1177
+
1178
+ # Aging data count
1179
+ cursor.execute("SELECT COUNT(*) FROM aging_data")
1180
+ aging_samples = cursor.fetchone()[0]
1181
+
1182
+ return {
1183
+ "total_spectra": total_spectra,
1184
+ "by_polymer_type": by_polymer_type,
1185
+ "by_technique": by_technique,
1186
+ "by_validation_status": by_validation,
1187
+ "average_quality_score": avg_quality,
1188
+ "aging_samples": aging_samples,
1189
+ }
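A quick sanity-check sketch for the SyntheticDataAugmentation and DataQualityController classes above (hypothetical usage; the Gaussian-peak toy spectrum and all parameter values are illustrative, not from the commit):

    import numpy as np
    from modules.enhanced_data_pipeline import (
        DataQualityController,
        SyntheticDataAugmentation,
    )

    # Toy FTIR-like spectrum: three Gaussian peaks plus mild noise (illustrative values)
    rng = np.random.default_rng(0)
    wavenumbers = np.linspace(400, 4000, 1800)
    intensities = 0.02 * rng.normal(size=wavenumbers.size)
    for center, height, width in [(1730, 1.0, 15.0), (2920, 0.8, 30.0), (3400, 0.4, 80.0)]:
        intensities += height * np.exp(-((wavenumbers - center) ** 2) / (2 * width**2))

    # Augmented variants and a synthetic aging series from the base spectrum
    aug = SyntheticDataAugmentation()
    variants = aug.augment_spectrum(wavenumbers, intensities, method="random", num_variations=3)
    series = aug.generate_synthetic_aging_series((wavenumbers, intensities), num_time_points=5)

    # Quality assessment of the base spectrum
    qc = DataQualityController()
    report = qc.assess_spectrum_quality(wavenumbers, intensities, metadata={"sample_id": "demo"})
    print(len(variants), len(series), report["overall_score"], report["validation_status"])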