devjas1 committed
Commit b2793c5 · 1 Parent(s): c024e8f

Adds enhanced data management system for spectral analysis


Implements a comprehensive framework for managing spectral data, including metadata preservation, provenance tracking, and contextual knowledge networks.

Introduces classes for spectral metadata, provenance records, and contextual spectra, enabling structured data handling and automated quality assessment. Streamlines the user workflow with heuristic preprocessing recommendations and session save/load.

This system aims to improve reproducibility and data quality in scientific research.
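
For orientation, a minimal usage sketch of the new API (not part of the commit; the synthetic spectrum is illustrative, and the recommendation keys are assumed to match the keyword arguments of utils.preprocessing.preprocess_spectrum):

import numpy as np
from modules.enhanced_data import (
    ContextualSpectrum,
    EnhancedDataManager,
    SpectralMetadata,
)

manager = EnhancedDataManager(cache_dir="data_cache")

# Synthetic Raman-like spectrum, for illustration only
x = np.linspace(200.0, 3500.0, 1024)
y = np.exp(-((x - 1450.0) ** 2) / 500.0) + 0.01 * np.random.rand(x.size)
meta = SpectralMetadata(filename="synthetic.txt", laser_wavelength=785.0)
spectrum = ContextualSpectrum(x, y, meta)

# Heuristic recommendations, applied with provenance tracking
params = manager.get_preprocessing_recommendations(spectrum)
manager.preprocess_with_tracking(spectrum, **params)

# Register in the knowledge graph and persist the session
manager.knowledge_graph.add_spectrum(spectrum)
manager.save_session("demo")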

Files changed (1)
  1. modules/enhanced_data.py +448 -0
modules/enhanced_data.py ADDED
@@ -0,0 +1,448 @@
+"""
+Enhanced Data Management System for POLYMEROS
+Implements contextual knowledge networks and metadata preservation
+"""
+
+import os
+import json
+import hashlib
+from dataclasses import dataclass, asdict
+from datetime import datetime
+from typing import Dict, List, Optional, Any, Tuple
+from pathlib import Path
+import numpy as np
+
+from utils.preprocessing import preprocess_spectrum
+
+
+@dataclass
+class SpectralMetadata:
+    """Comprehensive metadata for spectral data"""
+
+    filename: str
+    acquisition_date: Optional[str] = None
+    instrument_type: str = "Raman"
+    laser_wavelength: Optional[float] = None
+    integration_time: Optional[float] = None
+    laser_power: Optional[float] = None
+    temperature: Optional[float] = None
+    humidity: Optional[float] = None
+    sample_preparation: Optional[str] = None
+    operator: Optional[str] = None
+    data_quality_score: Optional[float] = None
+    preprocessing_history: Optional[List[str]] = None
+
+    def __post_init__(self):
+        if self.preprocessing_history is None:
+            self.preprocessing_history = []
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "SpectralMetadata":
+        return cls(**data)
+
+
+@dataclass
+class ProvenanceRecord:
+    """Complete provenance tracking for scientific reproducibility"""
+
+    operation: str
+    timestamp: str
+    parameters: Dict[str, Any]
+    input_hash: str
+    output_hash: str
+    operator: str = "system"
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ProvenanceRecord":
+        return cls(**data)
+
+
+class ContextualSpectrum:
+    """Enhanced spectral data with context and provenance"""
+
+    def __init__(
+        self,
+        x_data: np.ndarray,
+        y_data: np.ndarray,
+        metadata: SpectralMetadata,
+        label: Optional[int] = None,
+    ):
+        self.x_data = x_data
+        self.y_data = y_data
+        self.metadata = metadata
+        self.label = label
+        self.provenance: List[ProvenanceRecord] = []
+        self.relationships: Dict[str, List[str]] = {
+            "similar_spectra": [],
+            "related_samples": [],
+        }
+
+        # Calculate initial hash
+        self._update_hash()
+
+    def _calculate_hash(self, data: np.ndarray) -> str:
+        """Calculate hash of numpy array for provenance tracking"""
+        return hashlib.sha256(data.tobytes()).hexdigest()[:16]
+
+    def _update_hash(self):
+        """Update data hash after modifications"""
+        self.data_hash = self._calculate_hash(self.y_data)
+
+    def add_provenance(
+        self, operation: str, parameters: Dict[str, Any], operator: str = "system"
+    ):
+        """Add provenance record for operation"""
+        input_hash = self.data_hash
+
+        record = ProvenanceRecord(
+            operation=operation,
+            timestamp=datetime.now().isoformat(),
+            parameters=parameters,
+            input_hash=input_hash,
+            output_hash="",  # Will be updated after operation
+            operator=operator,
+        )
+
+        self.provenance.append(record)
+        return record
+
+    def finalize_provenance(self, record: ProvenanceRecord):
+        """Finalize provenance record with output hash"""
+        self._update_hash()
+        record.output_hash = self.data_hash
+
+    def apply_preprocessing(self, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
+        """Apply preprocessing with full provenance tracking"""
+        record = self.add_provenance("preprocessing", kwargs)
+
+        # Apply preprocessing
+        x_processed, y_processed = preprocess_spectrum(
+            self.x_data, self.y_data, **kwargs
+        )
+
+        # Update data and finalize provenance
+        self.x_data = x_processed
+        self.y_data = y_processed
+        self.finalize_provenance(record)
+
+        # Update metadata
+        if self.metadata.preprocessing_history is None:
+            self.metadata.preprocessing_history = []
+        self.metadata.preprocessing_history.append(
+            f"preprocessing_{datetime.now().isoformat()[:19]}"
+        )
+
+        return x_processed, y_processed
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Serialize to dictionary"""
+        return {
+            "x_data": self.x_data.tolist(),
+            "y_data": self.y_data.tolist(),
+            "metadata": self.metadata.to_dict(),
+            "label": self.label,
+            "provenance": [p.to_dict() for p in self.provenance],
+            "relationships": self.relationships,
+            "data_hash": self.data_hash,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ContextualSpectrum":
+        """Deserialize from dictionary"""
+        spectrum = cls(
+            x_data=np.array(data["x_data"]),
+            y_data=np.array(data["y_data"]),
+            metadata=SpectralMetadata.from_dict(data["metadata"]),
+            label=data.get("label"),
+        )
+        spectrum.provenance = [
+            ProvenanceRecord.from_dict(p) for p in data["provenance"]
+        ]
+        spectrum.relationships = data["relationships"]
+        spectrum.data_hash = data["data_hash"]
+        return spectrum
+
+
+class KnowledgeGraph:
+    """Knowledge graph for managing relationships between spectra and samples"""
+
+    def __init__(self):
+        self.nodes: Dict[str, ContextualSpectrum] = {}
+        self.edges: Dict[str, List[Dict[str, Any]]] = {}
+
+    def add_spectrum(self, spectrum: ContextualSpectrum, node_id: Optional[str] = None):
+        """Add spectrum to knowledge graph"""
+        if node_id is None:
+            node_id = spectrum.data_hash
+
+        self.nodes[node_id] = spectrum
+        self.edges[node_id] = []
+
+        # Auto-detect relationships
+        self._detect_relationships(node_id)
+
+    def _detect_relationships(self, node_id: str):
+        """Automatically detect relationships between spectra"""
+        current_spectrum = self.nodes[node_id]
+
+        for other_id, other_spectrum in self.nodes.items():
+            if other_id == node_id:
+                continue
+
+            # Check for similar acquisition conditions
+            if self._are_similar_conditions(current_spectrum, other_spectrum):
+                self.add_relationship(node_id, other_id, "similar_conditions", 0.8)
+
+            # Check for spectral similarity (simplified)
+            similarity = self._calculate_spectral_similarity(
+                current_spectrum.y_data, other_spectrum.y_data
+            )
+            if similarity > 0.9:
+                self.add_relationship(
+                    node_id, other_id, "spectral_similarity", similarity
+                )
+
+    def _are_similar_conditions(
+        self, spec1: ContextualSpectrum, spec2: ContextualSpectrum
+    ) -> bool:
+        """Check if two spectra were acquired under similar conditions"""
+        meta1, meta2 = spec1.metadata, spec2.metadata
+
+        # Check instrument type
+        if meta1.instrument_type != meta2.instrument_type:
+            return False
+
+        # Check laser wavelength (if available)
+        if (
+            meta1.laser_wavelength
+            and meta2.laser_wavelength
+            and abs(meta1.laser_wavelength - meta2.laser_wavelength) > 1.0
+        ):
+            return False
+
+        return True
+
+    def _calculate_spectral_similarity(
+        self, spec1: np.ndarray, spec2: np.ndarray
+    ) -> float:
+        """Calculate similarity between two spectra"""
+        if len(spec1) != len(spec2):
+            return 0.0
+
+        # Normalize spectra
+        spec1_norm = (spec1 - np.min(spec1)) / (np.max(spec1) - np.min(spec1) + 1e-8)
+        spec2_norm = (spec2 - np.min(spec2)) / (np.max(spec2) - np.min(spec2) + 1e-8)
+
+        # Calculate correlation coefficient
+        correlation = np.corrcoef(spec1_norm, spec2_norm)[0, 1]
+        return max(0.0, correlation)
+
+    def add_relationship(
+        self, node1: str, node2: str, relationship_type: str, weight: float
+    ):
+        """Add relationship between two nodes"""
+        edge = {
+            "target": node2,
+            "type": relationship_type,
+            "weight": weight,
+            "timestamp": datetime.now().isoformat(),
+        }
+
+        self.edges[node1].append(edge)
+
+        # Add reverse edge
+        reverse_edge = {
+            "target": node1,
+            "type": relationship_type,
+            "weight": weight,
+            "timestamp": datetime.now().isoformat(),
+        }
+
+        if node2 in self.edges:
+            self.edges[node2].append(reverse_edge)
+
+    def get_related_spectra(
+        self, node_id: str, relationship_type: Optional[str] = None
+    ) -> List[str]:
+        """Get spectra related to given node"""
+        if node_id not in self.edges:
+            return []
+
+        related = []
+        for edge in self.edges[node_id]:
+            if relationship_type is None or edge["type"] == relationship_type:
+                related.append(edge["target"])
+
+        return related
+
+    def export_knowledge_graph(self, filepath: str):
+        """Export knowledge graph to JSON file"""
+        export_data = {
+            "nodes": {k: v.to_dict() for k, v in self.nodes.items()},
+            "edges": self.edges,
+            "metadata": {
+                "created": datetime.now().isoformat(),
+                "total_nodes": len(self.nodes),
+                "total_edges": sum(len(edges) for edges in self.edges.values()),
+            },
+        }
+
+        with open(filepath, "w", encoding="utf-8") as f:
+            json.dump(export_data, f, indent=2)
+
+
+class EnhancedDataManager:
+    """Main data management interface for POLYMEROS"""
+
+    def __init__(self, cache_dir: str = "data_cache"):
+        self.cache_dir = Path(cache_dir)
+        self.cache_dir.mkdir(exist_ok=True)
+        self.knowledge_graph = KnowledgeGraph()
+        self.quality_thresholds = {
+            "min_intensity": 10.0,
+            "min_signal_to_noise": 3.0,
+            "max_baseline_drift": 0.1,
+        }
+
+    def load_spectrum_with_context(
+        self, filepath: str, metadata: Optional[Dict[str, Any]] = None
+    ) -> ContextualSpectrum:
+        """Load spectrum with automatic metadata extraction and quality assessment"""
+        from scripts.plot_spectrum import load_spectrum
+
+        # Load raw data
+        x_data, y_data = load_spectrum(filepath)
+
+        # Extract metadata
+        if metadata is None:
+            metadata = self._extract_metadata_from_file(filepath)
+
+        spectral_metadata = SpectralMetadata(
+            filename=os.path.basename(filepath), **metadata
+        )
+
+        # Create contextual spectrum
+        spectrum = ContextualSpectrum(
+            np.array(x_data), np.array(y_data), spectral_metadata
+        )
+
+        # Assess data quality
+        quality_score = self._assess_data_quality(np.array(y_data))
+        spectrum.metadata.data_quality_score = quality_score
+
+        # Add to knowledge graph
+        self.knowledge_graph.add_spectrum(spectrum)
+
+        return spectrum
+
+    def _extract_metadata_from_file(self, filepath: str) -> Dict[str, Any]:
+        """Extract metadata from filename and file properties"""
+        filename = os.path.basename(filepath)
+
+        metadata = {
+            "acquisition_date": datetime.fromtimestamp(
+                os.path.getmtime(filepath)
+            ).isoformat(),
+            "instrument_type": "Raman",  # Default
+        }
+
+        # Extract information from filename patterns
+        if "785nm" in filename.lower():
+            metadata["laser_wavelength"] = 785.0
+        elif "532nm" in filename.lower():
+            metadata["laser_wavelength"] = 532.0
+
+        return metadata
+
+    def _assess_data_quality(self, y_data: np.ndarray) -> float:
+        """Assess spectral data quality using multiple metrics"""
+        scores = []
+
+        # Signal intensity check
+        max_intensity = np.max(y_data)
+        if max_intensity >= self.quality_thresholds["min_intensity"]:
+            scores.append(min(1.0, max_intensity / 1000.0))
+        else:
+            scores.append(0.0)
+
+        # Signal-to-noise ratio estimation
+        signal = np.mean(y_data)
+        noise = np.std(y_data[y_data < np.percentile(y_data, 10)])
+        snr = signal / (noise + 1e-8)
+
+        if snr >= self.quality_thresholds["min_signal_to_noise"]:
+            scores.append(min(1.0, snr / 10.0))
+        else:
+            scores.append(0.0)
+
+        # Baseline stability
+        baseline_variation = np.std(y_data) / (np.mean(y_data) + 1e-8)
+        baseline_score = max(
+            0.0,
+            1.0 - baseline_variation / self.quality_thresholds["max_baseline_drift"],
+        )
+        scores.append(baseline_score)
+
+        return float(np.mean(scores))
+
+    def preprocess_with_tracking(
+        self, spectrum: ContextualSpectrum, **preprocessing_params
+    ) -> ContextualSpectrum:
+        """Apply preprocessing with full tracking"""
+        spectrum.apply_preprocessing(**preprocessing_params)
+        return spectrum
+
+    def get_preprocessing_recommendations(
+        self, spectrum: ContextualSpectrum
+    ) -> Dict[str, Any]:
+        """Provide intelligent preprocessing recommendations based on data characteristics"""
+        recommendations = {}
+
+        y_data = spectrum.y_data
+
+        # Baseline correction recommendation
+        baseline_variation = np.std(np.diff(y_data))
+        if baseline_variation > 0.05:
+            recommendations["do_baseline"] = True
+            recommendations["degree"] = 3 if baseline_variation > 0.1 else 2
+        else:
+            recommendations["do_baseline"] = False
+
+        # Smoothing recommendation
+        noise_level = np.std(y_data[y_data < np.percentile(y_data, 20)])
+        if noise_level > 0.01:
+            recommendations["do_smooth"] = True
+            recommendations["window_length"] = 11 if noise_level > 0.05 else 7
+        else:
+            recommendations["do_smooth"] = False
+
+        # Normalization is generally recommended
+        recommendations["do_normalize"] = True
+
+        return recommendations
+
+    def save_session(self, session_name: str):
+        """Save current data management session"""
+        session_file = self.cache_dir / f"{session_name}_session.json"
+        self.knowledge_graph.export_knowledge_graph(str(session_file))
+
+    def load_session(self, session_name: str):
+        """Load saved data management session"""
+        session_file = self.cache_dir / f"{session_name}_session.json"
+
+        if session_file.exists():
+            with open(session_file, "r", encoding="utf-8") as f:
+                data = json.load(f)
+
+            # Reconstruct knowledge graph
+            for node_id, node_data in data["nodes"].items():
+                spectrum = ContextualSpectrum.from_dict(node_data)
+                self.knowledge_graph.nodes[node_id] = spectrum
+
+            self.knowledge_graph.edges = data["edges"]
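
A note on the provenance design: add_provenance captures the pre-operation data_hash as input_hash, and finalize_provenance stamps the post-operation hash as output_hash, so sequential operations form a hash chain. A hypothetical checker (not part of the commit) could verify that chain like this:

from modules.enhanced_data import ContextualSpectrum

def verify_provenance_chain(spectrum: ContextualSpectrum) -> bool:
    """Return True if provenance records form an unbroken hash chain."""
    records = spectrum.provenance
    # Each finalized record's output must feed the next record's input
    for prev, curr in zip(records, records[1:]):
        if prev.output_hash != curr.input_hash:
            return False
    # The final output_hash must describe the data as it exists now
    return not records or records[-1].output_hash == spectrum.data_hash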