devjas1 committed · Commit 07fb119 · 1 Parent(s): 8475e7b
FEAT(data_pipeline): implement enhanced data pipeline for polymer ML aging with spectroscopy integration and synthetic data augmentation
Adds enhanced data pipeline for polymer ML aging
Implements an advanced data pipeline that integrates spectroscopy databases and synthetic data augmentation for machine-learning studies of polymer aging.
The module supports several spectroscopy formats, provides tools for data quality control, and introduces methods for generating synthetic aging series to support analysis of polymer degradation over time.
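A minimal usage sketch of the new module (the polymer name and sample counts are illustrative; the SQLite store is created under data/enhanced_data/ as shown in the file below):

    from modules.enhanced_data_pipeline import EnhancedDataPipeline

    pipeline = EnhancedDataPipeline()          # initializes data/enhanced_data/polymer_spectra.db
    status = pipeline.connect_to_databases()   # e.g. {"FTIR_PLASTICS": True, ...}
    pipeline.search_and_import_spectra("polyethylene", max_per_database=50)
    pipeline.generate_synthetic_aging_dataset("polyethylene", num_samples=50)
    print(pipeline.get_database_statistics())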
- modules/enhanced_data_pipeline.py +1189 -0
modules/enhanced_data_pipeline.py
ADDED
@@ -0,0 +1,1189 @@
"""
Enhanced Data Pipeline for Polymer ML Aging
Integrates with spectroscopy databases, synthetic data augmentation, and quality control
"""

import numpy as np
import pandas as pd
from typing import Dict, List, Tuple, Optional, Union, Any
from dataclasses import dataclass, field
from pathlib import Path
import requests
import json
import sqlite3
from datetime import datetime
import hashlib
import warnings
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
import pickle
import io
import base64


@dataclass
class SpectralDatabase:
    """Configuration for spectroscopy databases"""

    name: str
    base_url: Optional[str] = None
    api_key: Optional[str] = None
    description: str = ""
    supported_formats: List[str] = field(default_factory=list)
    access_method: str = "api"  # "api", "download", "local"
    local_path: Optional[Path] = None


# -///////////////////////////////////////////////////
@dataclass
class PolymerSample:
    """Enhanced polymer sample information"""

    sample_id: str
    polymer_type: str
    molecular_weight: Optional[float] = None
    additives: List[str] = field(default_factory=list)
    processing_conditions: Dict[str, Any] = field(default_factory=dict)
    aging_condition: Dict[str, Any] = field(default_factory=dict)
    aging_time: Optional[float] = None  # Hours
    degradation_level: Optional[float] = None  # 0-1 scale
    spectral_data: Dict[str, np.ndarray] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)
    quality_score: Optional[float] = None
    validation_status: str = "pending"  # pending, validated, rejected


# -///////////////////////////////////////////////////

# Database configurations
SPECTROSCOPY_DATABASES = {
    "FTIR_PLASTICS": SpectralDatabase(
        name="FTIR Plastics Database",
        description="Comprehensive FTIR spectra of plastic materials",
        supported_formats=["FTIR", "ATR-FTIR"],
        access_method="local",
        local_path=Path("data/databases/ftir_plastics"),
    ),
    "NIST_WEBBOOK": SpectralDatabase(
        name="NIST Chemistry WebBook",
        base_url="https://webbook.nist.gov/chemistry",
        description="NIST spectroscopic database",
        supported_formats=["FTIR", "Raman"],
        access_method="api",
    ),
    "POLYMER_DATABASE": SpectralDatabase(
        name="Polymer Spectroscopy Database",
        description="Curated polymer degradation spectra",
        supported_formats=["FTIR", "ATR-FTIR", "Raman"],
        access_method="local",
        local_path=Path("data/databases/polymer_degradation"),
    ),
}

# -///////////////////////////////////////////////////
class DatabaseConnector:
    """Connector for spectroscopy databases"""

    def __init__(self, database_config: SpectralDatabase):
        self.config = database_config
        self.connection = None
        self.cache_dir = Path("data/cache") / database_config.name.lower().replace(
            " ", "_"
        )
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def connect(self) -> bool:
        """Establish connection to database"""
        try:
            if self.config.access_method == "local":
                if self.config.local_path and self.config.local_path.exists():
                    return True
                else:
                    print(f"Local database path not found: {self.config.local_path}")
                    return False

            elif self.config.access_method == "api":
                # Test API connection
                if self.config.base_url:
                    response = requests.get(self.config.base_url, timeout=10)
                    return response.status_code == 200
                return False

            return True

        except Exception as e:
            print(f"Failed to connect to {self.config.name}: {e}")
            return False

    # -///////////////////////////////////////////////////
    def search_by_polymer_type(self, polymer_type: str, limit: int = 100) -> List[Dict]:
        """Search database for spectra by polymer type"""
        cache_key = f"search_{hashlib.md5(polymer_type.encode()).hexdigest()}"
        cache_file = self.cache_dir / f"{cache_key}.json"

        # Check cache first
        if cache_file.exists():
            with open(cache_file, "r") as f:
                return json.load(f)

        results = []

        if self.config.access_method == "local":
            results = self._search_local_database(polymer_type, limit)
        elif self.config.access_method == "api":
            results = self._search_api_database(polymer_type, limit)

        # Cache results
        if results:
            with open(cache_file, "w") as f:
                json.dump(results, f)

        return results

    # -///////////////////////////////////////////////////
    def _search_local_database(self, polymer_type: str, limit: int) -> List[Dict]:
        """Search local database files"""
        results = []

        if not self.config.local_path or not self.config.local_path.exists():
            return results

        # Look for CSV files with polymer data
        for csv_file in self.config.local_path.glob("*.csv"):
            try:
                df = pd.read_csv(csv_file)

                # Search for polymer type in columns
                polymer_matches = df[
                    df.astype(str)
                    .apply(lambda x: x.str.contains(polymer_type, case=False))
                    .any(axis=1)
                ]

                for _, row in polymer_matches.head(limit).iterrows():
                    result = {
                        "source_file": str(csv_file),
                        "polymer_type": polymer_type,
                        "data": row.to_dict(),
                    }
                    results.append(result)

            except Exception as e:
                print(f"Error reading {csv_file}: {e}")
                continue

        return results

    # -///////////////////////////////////////////////////
    def _search_api_database(self, polymer_type: str, limit: int) -> List[Dict]:
        """Search API-based database"""
        results = []

        try:
            # TODO: Example API search (would need actual API endpoints)
            search_params = {"query": polymer_type, "limit": limit, "format": "json"}

            if self.config.api_key:
                search_params["api_key"] = self.config.api_key

            response = requests.get(
                f"{self.config.base_url}/search", params=search_params, timeout=30
            )

            if response.status_code == 200:
                results = response.json().get("results", [])

        except Exception as e:
            print(f"API search failed: {e}")

        return results

    # -///////////////////////////////////////////////////
    def download_spectrum(self, spectrum_id: str) -> Optional[Dict]:
        """Download specific spectrum data"""
        cache_file = self.cache_dir / f"spectrum_{spectrum_id}.json"

        # Check cache
        if cache_file.exists():
            with open(cache_file, "r") as f:
                return json.load(f)

        spectrum_data = None

        if self.config.access_method == "api":
            try:
                url = f"{self.config.base_url}/spectrum/{spectrum_id}"
                response = requests.get(url, timeout=30)

                if response.status_code == 200:
                    spectrum_data = response.json()

            except Exception as e:
                print(f"Failed to download spectrum {spectrum_id}: {e}")

        # Cache results if successful
        if spectrum_data:
            with open(cache_file, "w") as f:
                json.dump(spectrum_data, f)

        return spectrum_data

# -///////////////////////////////////////////////////
class SyntheticDataAugmentation:
    """Advanced synthetic data augmentation for spectroscopy"""

    def __init__(self):
        self.augmentation_methods = [
            "noise_addition",
            "baseline_drift",
            "intensity_scaling",
            "wavenumber_shift",
            "peak_broadening",
            "atmospheric_effects",
            "instrumental_response",
            "sample_variations",
        ]

    def augment_spectrum(
        self,
        wavenumbers: np.ndarray,
        intensities: np.ndarray,
        method: str = "random",
        num_variations: int = 5,
        intensity_factor: float = 0.1,
    ) -> List[Tuple[np.ndarray, np.ndarray]]:
        """
        Generate augmented versions of a spectrum

        Args:
            wavenumbers: Original wavenumber array
            intensities: Original intensity array
            method: Augmentation method or 'random' for random selection
            num_variations: Number of variations to generate
            intensity_factor: Factor controlling augmentation intensity

        Returns:
            List of (wavenumbers, intensities) tuples
        """
        augmented_spectra = []

        for _ in range(num_variations):
            if method == "random":
                chosen_method = np.random.choice(self.augmentation_methods)
            else:
                chosen_method = method

            aug_wavenumbers, aug_intensities = self._apply_augmentation(
                wavenumbers, intensities, chosen_method, intensity_factor
            )

            augmented_spectra.append((aug_wavenumbers, aug_intensities))

        return augmented_spectra

    # -///////////////////////////////////////////////////
    def _apply_augmentation(
        self,
        wavenumbers: np.ndarray,
        intensities: np.ndarray,
        method: str,
        intensity: float,
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Apply specific augmentation method"""

        aug_wavenumbers = wavenumbers.copy()
        aug_intensities = intensities.copy()

        if method == "noise_addition":
            # Add random noise
            noise_level = intensity * np.std(intensities)
            noise = np.random.normal(0, noise_level, len(intensities))
            aug_intensities += noise

        elif method == "baseline_drift":
            # Add baseline drift
            drift_amplitude = intensity * np.mean(np.abs(intensities))
            drift = drift_amplitude * np.sin(
                2 * np.pi * np.linspace(0, 2, len(intensities))
            )
            aug_intensities += drift

        elif method == "intensity_scaling":
            # Scale intensity uniformly
            scale_factor = 1.0 + intensity * (2 * np.random.random() - 1)
            aug_intensities *= scale_factor

        elif method == "wavenumber_shift":
            # Shift wavenumber axis
            shift_range = intensity * 10  # cm-1
            shift = shift_range * (2 * np.random.random() - 1)
            aug_wavenumbers += shift

        elif method == "peak_broadening":
            # Broaden peaks using convolution
            from scipy import signal

            sigma = intensity * 2  # Broadening factor
            kernel_size = int(sigma * 6) + 1
            if kernel_size % 2 == 0:
                kernel_size += 1

            if kernel_size >= 3:
                from scipy.signal.windows import gaussian

                kernel = gaussian(kernel_size, sigma)
                kernel = kernel / np.sum(kernel)
                aug_intensities = signal.convolve(
                    aug_intensities, kernel, mode="same"
                )

        elif method == "atmospheric_effects":
            # Simulate atmospheric absorption
            co2_region = (wavenumbers >= 2320) & (wavenumbers <= 2380)
            h2o_region = (wavenumbers >= 3200) & (wavenumbers <= 3800)

            if np.any(co2_region):
                aug_intensities[co2_region] *= 1 - intensity * 0.1
            if np.any(h2o_region):
                aug_intensities[h2o_region] *= 1 - intensity * 0.05

        elif method == "instrumental_response":
            # Simulate instrumental response variations
            # Add slight frequency-dependent response
            response_curve = 1.0 + intensity * 0.1 * np.sin(
                2
                * np.pi
                * (wavenumbers - wavenumbers.min())
                / (wavenumbers.max() - wavenumbers.min())
            )
            aug_intensities *= response_curve

        elif method == "sample_variations":
            # Simulate sample-to-sample variations
            # Random peak intensity variations
            num_peaks = min(5, len(intensities) // 100)
            for _ in range(num_peaks):
                peak_center = np.random.randint(0, len(intensities))
                peak_width = np.random.randint(5, 20)
                peak_variation = intensity * (2 * np.random.random() - 1)

                start_idx = max(0, peak_center - peak_width)
                end_idx = min(len(intensities), peak_center + peak_width)

                aug_intensities[start_idx:end_idx] *= 1 + peak_variation

        return aug_wavenumbers, aug_intensities

    # -///////////////////////////////////////////////////
    def generate_synthetic_aging_series(
        self,
        base_spectrum: Tuple[np.ndarray, np.ndarray],
        num_time_points: int = 10,
        max_degradation: float = 0.8,
    ) -> List[Dict]:
        """
        Generate synthetic aging series showing progressive degradation

        Args:
            base_spectrum: (wavenumbers, intensities) for fresh sample
            num_time_points: Number of time points in series
            max_degradation: Maximum degradation level (0-1)

        Returns:
            List of aging data points
        """
        wavenumbers, intensities = base_spectrum
        aging_series = []

        # Define degradation-related spectral changes
        degradation_features = {
            "carbonyl_growth": {
                "region": (1700, 1750),  # C=O stretch
                "intensity_change": 2.0,  # Factor increase
            },
            "oh_growth": {
                "region": (3200, 3600),  # OH stretch
                "intensity_change": 1.5,
            },
            "ch_decrease": {
                "region": (2800, 3000),  # CH stretch
                "intensity_change": 0.7,  # Factor decrease
            },
            "crystallinity_change": {
                "region": (1000, 1200),  # Various polymer backbone changes
                "intensity_change": 0.9,
            },
        }

        for i in range(num_time_points):
            # Guard the denominator so a single time point does not divide by zero
            degradation_level = (i / max(1, num_time_points - 1)) * max_degradation
            aging_time = i * 100  # hours (arbitrary scale)

            # Start with base spectrum
            aged_intensities = intensities.copy()

            # Apply degradation-related changes
            for feature, params in degradation_features.items():
                region_mask = (wavenumbers >= params["region"][0]) & (
                    wavenumbers <= params["region"][1]
                )
                if np.any(region_mask):
                    change_factor = 1.0 + degradation_level * (
                        params["intensity_change"] - 1.0
                    )
                    aged_intensities[region_mask] *= change_factor

            # Add some random variations
            aug_wavenumbers, aug_intensities = self._apply_augmentation(
                wavenumbers, aged_intensities, "noise_addition", 0.02
            )

            aging_point = {
                "aging_time": aging_time,
                "degradation_level": degradation_level,
                "wavenumbers": aug_wavenumbers,
                "intensities": aug_intensities,
                "spectral_changes": {
                    # Relative change applied to each region, matching change_factor above
                    feature: degradation_level * (params["intensity_change"] - 1.0)
                    for feature, params in degradation_features.items()
                },
            }

            aging_series.append(aging_point)

        return aging_series

# -///////////////////////////////////////////////////
class DataQualityController:
    """Advanced data quality assessment and validation"""

    def __init__(self):
        self.quality_metrics = [
            "signal_to_noise_ratio",
            "baseline_stability",
            "peak_resolution",
            "spectral_range_coverage",
            "instrumental_artifacts",
            "data_completeness",
            "metadata_completeness",
        ]

        self.validation_rules = {
            "min_snr": 10.0,
            "max_baseline_variation": 0.1,
            "min_peak_count": 3,
            "min_spectral_range": 1000.0,  # cm-1
            "max_missing_points": 0.05,  # 5% max missing data
        }

    def assess_spectrum_quality(
        self,
        wavenumbers: np.ndarray,
        intensities: np.ndarray,
        metadata: Optional[Dict] = None,
    ) -> Dict[str, Any]:
        """
        Comprehensive quality assessment of spectral data

        Args:
            wavenumbers: Array of wavenumbers
            intensities: Array of intensities
            metadata: Optional metadata dictionary

        Returns:
            Quality assessment results
        """
        assessment = {
            "overall_score": 0.0,
            "individual_scores": {},
            "issues_found": [],
            "recommendations": [],  # Ensure this is initialized as a list
            "validation_status": "pending",
        }

        # Signal-to-noise
        snr_score, snr_value = self._assess_snr(intensities)
        assessment["individual_scores"]["snr"] = snr_score
        assessment["snr_value"] = snr_value

        if snr_value < self.validation_rules["min_snr"]:
            assessment["issues_found"].append(
                f"Low SNR: {snr_value:.1f} (min: {self.validation_rules['min_snr']})"
            )
            assessment["recommendations"].append(
                "Consider noise reduction preprocessing"
            )

        # Baseline stability
        baseline_score, baseline_variation = self._assess_baseline_stability(
            intensities
        )
        assessment["individual_scores"]["baseline"] = baseline_score
        assessment["baseline_variation"] = baseline_variation

        if baseline_variation > self.validation_rules["max_baseline_variation"]:
            assessment["issues_found"].append(
                f"Unstable baseline: {baseline_variation:.3f}"
            )
            assessment["recommendations"].append("Apply baseline correction")

        # Peak resolution and count
        peak_score, peak_count = self._assess_peak_resolution(wavenumbers, intensities)
        assessment["individual_scores"]["peaks"] = peak_score
        assessment["peak_count"] = peak_count

        if peak_count < self.validation_rules["min_peak_count"]:
            assessment["issues_found"].append(f"Few peaks detected: {peak_count}")
            assessment["recommendations"].append(
                "Check sample quality or measurement conditions"
            )

        # Spectral range coverage
        range_score, spectral_range = self._assess_spectral_range(wavenumbers)
        assessment["individual_scores"]["range"] = range_score
        assessment["spectral_range"] = spectral_range

        if spectral_range < self.validation_rules["min_spectral_range"]:
            assessment["issues_found"].append(
                f"Limited spectral range: {spectral_range:.0f} cm-1"
            )

        # Data completeness
        completeness_score, missing_fraction = self._assess_data_completeness(
            intensities
        )
        assessment["individual_scores"]["completeness"] = completeness_score
        assessment["missing_fraction"] = missing_fraction

        if missing_fraction > self.validation_rules["max_missing_points"]:
            assessment["issues_found"].append(
                f"Missing data points: {missing_fraction:.1%}"
            )
            assessment["recommendations"].append(
                "Interpolate missing points or re-measure"
            )

        # Instrumental artifacts
        artifact_score, artifacts = self._detect_instrumental_artifacts(
            wavenumbers, intensities
        )
        assessment["individual_scores"]["artifacts"] = artifact_score
        assessment["artifacts_detected"] = artifacts

        if artifacts:
            assessment["issues_found"].extend(
                [f"Artifact detected: {artifact}" for artifact in artifacts]
            )
            assessment["recommendations"].append("Apply artifact correction")

        # Metadata completeness
        metadata_score = self._assess_metadata_completeness(metadata)
        assessment["individual_scores"]["metadata"] = metadata_score

        # Calculate overall score
        scores = list(assessment["individual_scores"].values())
        assessment["overall_score"] = np.mean(scores) if scores else 0.0

        # Determine validation status
        if assessment["overall_score"] >= 0.8 and len(assessment["issues_found"]) == 0:
            assessment["validation_status"] = "validated"
        elif assessment["overall_score"] >= 0.6:
            assessment["validation_status"] = "conditional"
        else:
            assessment["validation_status"] = "rejected"

        return assessment

    # -///////////////////////////////////////////////////
    def _assess_snr(self, intensities: np.ndarray) -> Tuple[float, float]:
        """Assess signal-to-noise ratio"""
        try:
            # Estimate noise from high-frequency components
            diff_signal = np.diff(intensities)
            noise_std = np.std(diff_signal)
            signal_power = np.var(intensities)

            snr = np.sqrt(signal_power) / noise_std if noise_std > 0 else float("inf")

            # Score based on SNR values
            score = min(
                1.0, max(0.0, (np.log10(snr) - 1) / 2)
            )  # Log scale, 10-1000 range

            return score, snr
        except Exception:
            return 0.5, 1.0

    # -///////////////////////////////////////////////////
    def _assess_baseline_stability(
        self, intensities: np.ndarray
    ) -> Tuple[float, float]:
        """Assess baseline stability"""
        try:
            # Estimate baseline from endpoints and low-frequency components
            baseline_points = np.concatenate([intensities[:10], intensities[-10:]])
            baseline_variation = np.std(baseline_points) / np.mean(np.abs(intensities))

            score = max(0.0, 1.0 - baseline_variation * 10)  # Penalty for variation

            return score, baseline_variation

        except Exception:
            return 0.5, 1.0

    # -///////////////////////////////////////////////////
    def _assess_peak_resolution(
        self, wavenumbers: np.ndarray, intensities: np.ndarray
    ) -> Tuple[float, int]:
        """Assess peak resolution and count"""
        try:
            from scipy.signal import find_peaks

            # Find peaks with minimum prominence
            prominence_threshold = 0.1 * np.std(intensities)
            peaks, properties = find_peaks(
                intensities, prominence=prominence_threshold, distance=5
            )

            peak_count = len(peaks)

            # Score based on peak count and prominence
            if peak_count > 0:
                avg_prominence = np.mean(properties["prominences"])
                prominence_score = min(
                    1.0, avg_prominence / (0.2 * np.std(intensities))
                )
                count_score = min(1.0, peak_count / 10)  # Normalize to ~10 peaks
                score = 0.5 * prominence_score + 0.5 * count_score
            else:
                score = 0.0

            return score, peak_count

        except Exception:
            return 0.5, 0

    # -///////////////////////////////////////////////////
    def _assess_spectral_range(self, wavenumbers: np.ndarray) -> Tuple[float, float]:
        """Assess spectral range coverage"""
        try:
            spectral_range = wavenumbers.max() - wavenumbers.min()

            # Score based on typical FTIR range (4000 cm-1)
            score = min(1.0, spectral_range / 4000)

            return score, spectral_range

        except Exception:
            return 0.5, 1000

    # -///////////////////////////////////////////////////
    def _assess_data_completeness(self, intensities: np.ndarray) -> Tuple[float, float]:
        """Assess data completeness"""
        try:
            # Check for NaN, infinite, or zero values
            invalid_mask = (
                np.isnan(intensities) | np.isinf(intensities) | (intensities == 0)
            )
            missing_fraction = np.sum(invalid_mask) / len(intensities)

            score = max(
                0.0, 1.0 - missing_fraction * 10
            )  # Heavy penalty for missing data

            return score, missing_fraction
        except Exception:
            return 0.5, 0.0

    # -///////////////////////////////////////////////////
    def _detect_instrumental_artifacts(
        self, wavenumbers: np.ndarray, intensities: np.ndarray
    ) -> Tuple[float, List[str]]:
        """Detect common instrumental artifacts"""
        artifacts = []

        try:
            # Check for spike artifacts (cosmic rays, electrical interference)
            diff_threshold = 5 * np.std(np.diff(intensities))
            spikes = np.where(np.abs(np.diff(intensities)) > diff_threshold)[0]

            if len(spikes) > len(intensities) * 0.01:  # More than 1% spikes
                artifacts.append("excessive_spikes")

            # Check for saturation (flat regions at max/min)
            if np.std(intensities) > 0:
                max_val = np.max(intensities)
                min_val = np.min(intensities)

                saturation_high = np.sum(intensities >= 0.99 * max_val) / len(
                    intensities
                )
                saturation_low = np.sum(intensities <= 1.01 * min_val) / len(
                    intensities
                )

                if saturation_high > 0.05:
                    artifacts.append("high_saturation")
                if saturation_low > 0.05:
                    artifacts.append("low_saturation")

            # Check for periodic noise (electrical interference)
            fft = np.fft.fft(intensities - np.mean(intensities))
            freq_domain = np.abs(fft[: len(fft) // 2])

            # Look for strong periodic components
            if len(freq_domain) > 10:
                mean_amplitude = np.mean(freq_domain)
                strong_frequencies = np.sum(freq_domain > 3 * mean_amplitude)

                if strong_frequencies > len(freq_domain) * 0.1:
                    artifacts.append("periodic_noise")

            # Score inversely related to number of artifacts
            score = max(0.0, 1.0 - len(artifacts) * 0.3)

            return score, artifacts

        except Exception:
            return 0.5, []

    # -///////////////////////////////////////////////////
    def _assess_metadata_completeness(self, metadata: Optional[Dict]) -> float:
        """Assess completeness of metadata"""
        if metadata is None:
            return 0.0

        required_fields = [
            "sample_id",
            "measurement_date",
            "instrument_type",
            "resolution",
            "number_of_scans",
            "sample_type",
        ]

        present_fields = sum(
            1
            for field in required_fields
            if field in metadata and metadata[field] is not None
        )
        score = present_fields / len(required_fields)

        return score

# -///////////////////////////////////////////////////
class EnhancedDataPipeline:
    """Complete enhanced data pipeline integrating all components"""

    def __init__(self):
        self.database_connector = {}
        self.augmentation_engine = SyntheticDataAugmentation()
        self.quality_controller = DataQualityController()
        self.local_database_path = Path("data/enhanced_data")
        self.local_database_path.mkdir(parents=True, exist_ok=True)
        self._init_local_database()

    def _init_local_database(self):
        """Initialize local SQLite database"""
        db_path = self.local_database_path / "polymer_spectra.db"

        with sqlite3.connect(db_path) as conn:
            cursor = conn.cursor()

            # Create main spectra table
            cursor.execute(
                """
                CREATE TABLE IF NOT EXISTS spectra (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    sample_id TEXT UNIQUE NOT NULL,
                    polymer_type TEXT NOT NULL,
                    technique TEXT NOT NULL,
                    wavenumbers BLOB,
                    intensities BLOB,
                    metadata TEXT,
                    quality_score REAL,
                    validation_status TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    source_database TEXT
                )
                """
            )

            # Create aging data table (aging_conditions stored as JSON text,
            # matching the inserts and queries below)
            cursor.execute(
                """
                CREATE TABLE IF NOT EXISTS aging_data (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    sample_id TEXT,
                    aging_time REAL,
                    degradation_level REAL,
                    aging_conditions TEXT,
                    spectral_changes TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    FOREIGN KEY (sample_id) REFERENCES spectra (sample_id)
                )
                """
            )

            conn.commit()

    # -///////////////////////////////////////////////////
    def connect_to_databases(self) -> Dict[str, bool]:
        """Connect to all configured databases"""
        connection_status = {}

        for db_name, db_config in SPECTROSCOPY_DATABASES.items():
            connector = DatabaseConnector(db_config)
            connection_status[db_name] = connector.connect()
            # Keep the connector itself so later searches can use it
            self.database_connector[db_name] = connector

        return connection_status

    # -///////////////////////////////////////////////////
    def search_and_import_spectra(
        self, polymer_type: str, max_per_database: int = 50
    ) -> Dict[str, int]:
        """Search and import spectra from all connected databases"""
        import_counts = {}

        for db_name, connector in self.database_connector.items():
            try:
                search_results = connector.search_by_polymer_type(
                    polymer_type, max_per_database
                )
                imported_count = 0

                for result in search_results:
                    if self._import_spectrum_to_local(result, db_name):
                        imported_count += 1

                import_counts[db_name] = imported_count

            except Exception as e:
                print(f"Error importing from {db_name}: {e}")
                import_counts[db_name] = 0

        return import_counts

    # -///////////////////////////////////////////////////
    def _import_spectrum_to_local(self, spectrum_data: Dict, source_db: str) -> bool:
        """Import spectrum data to local database"""
        try:
            # Extract or generate sample ID
            sample_id = spectrum_data.get(
                "sample_id", f"{source_db}_{hash(str(spectrum_data))}"
            )

            # Convert spectrum data format
            if "wavenumbers" in spectrum_data and "intensities" in spectrum_data:
                wavenumbers = np.array(spectrum_data["wavenumbers"])
                intensities = np.array(spectrum_data["intensities"])
            else:
                # Try to extract from other formats
                return False

            # Quality assessment
            metadata = spectrum_data.get("metadata", {})
            quality_assessment = self.quality_controller.assess_spectrum_quality(
                wavenumbers, intensities, metadata
            )

            # Only import if quality is acceptable
            if quality_assessment["validation_status"] == "rejected":
                return False

            # Serialize arrays
            wavenumbers_blob = pickle.dumps(wavenumbers)
            intensities_blob = pickle.dumps(intensities)
            metadata_json = json.dumps(metadata)

            # Insert into database
            db_path = self.local_database_path / "polymer_spectra.db"

            with sqlite3.connect(db_path) as conn:
                cursor = conn.cursor()

                cursor.execute(
                    """
                    INSERT OR REPLACE INTO spectra (
                        sample_id, polymer_type, technique,
                        wavenumbers, intensities, metadata,
                        quality_score, validation_status,
                        source_database)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """,
                    (
                        sample_id,
                        spectrum_data.get("polymer_type", "unknown"),
                        spectrum_data.get("technique", "FTIR"),
                        wavenumbers_blob,
                        intensities_blob,
                        metadata_json,
                        quality_assessment["overall_score"],
                        quality_assessment["validation_status"],
                        source_db,
                    ),
                )

                conn.commit()

            return True

        except Exception as e:
            print(f"Error importing spectrum: {e}")
            return False

    # -///////////////////////////////////////////////////
    def generate_synthetic_aging_dataset(
        self,
        base_polymer_type: str,
        num_samples: int = 50,
        aging_conditions: Optional[List[Dict]] = None,
    ) -> int:
        """
        Generate synthetic aging dataset for training

        Args:
            base_polymer_type: Base polymer type to use
            num_samples: Number of synthetic samples to generate
            aging_conditions: List of aging condition dictionaries

        Returns:
            Number of samples generated
        """
        if aging_conditions is None:
            aging_conditions = [
                {"temperature": 60, "humidity": 75, "uv_exposure": True},
                {"temperature": 80, "humidity": 85, "uv_exposure": True},
                {"temperature": 40, "humidity": 95, "uv_exposure": False},
                {"temperature": 100, "humidity": 50, "uv_exposure": True},
            ]

        # Get base spectra from database
        base_spectra = self.spectra_by_type(base_polymer_type, limit=10)

        if not base_spectra:
            print(f"No base spectra found for {base_polymer_type}")
            return 0

        generated_count = 0

        for base_spectrum in base_spectra:
            wavenumbers = pickle.loads(base_spectrum["wavenumbers"])
            intensities = pickle.loads(base_spectrum["intensities"])

            # Generate aging series for each condition
            for condition in aging_conditions:
                aging_series = self.augmentation_engine.generate_synthetic_aging_series(
                    (wavenumbers, intensities),
                    num_time_points=min(
                        10, num_samples // len(aging_conditions) // len(base_spectra)
                    ),
                )

                for aging_point in aging_series:
                    synthetic_id = f"synthetic_{base_polymer_type}_{generated_count}"

                    # Record the condition actually used for this series
                    metadata = {
                        "synthetic": True,
                        "aging_conditions": condition,
                        "aging_time": aging_point["aging_time"],
                        "degradation_level": aging_point["degradation_level"],
                    }

                    # Store synthetic spectrum
                    if self._store_synthetic_spectrum(
                        synthetic_id, base_polymer_type, aging_point, metadata
                    ):
                        generated_count += 1

        return generated_count

    def _store_synthetic_spectrum(
        self, sample_id: str, polymer_type: str, aging_point: Dict, metadata: Dict
    ) -> bool:
        """Store synthetic spectrum in local database"""
        try:
            quality_assessment = self.quality_controller.assess_spectrum_quality(
                aging_point["wavenumbers"], aging_point["intensities"], metadata
            )

            # Serialize data
            wavenumbers_blob = pickle.dumps(aging_point["wavenumbers"])
            intensities_blob = pickle.dumps(aging_point["intensities"])
            metadata_json = json.dumps(metadata)

            # Insert spectrum
            db_path = self.local_database_path / "polymer_spectra.db"

            with sqlite3.connect(db_path) as conn:
                cursor = conn.cursor()

                cursor.execute(
                    """
                    INSERT INTO spectra
                    (sample_id, polymer_type, technique, wavenumbers, intensities,
                     metadata, quality_score, validation_status, source_database)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                    """,
                    (
                        sample_id,
                        polymer_type,
                        "FTIR_synthetic",
                        wavenumbers_blob,
                        intensities_blob,
                        metadata_json,
                        quality_assessment["overall_score"],
                        "validated",  # Synthetic data is pre-validated
                        "synthetic",
                    ),
                )

                # Insert aging data
                cursor.execute(
                    """
                    INSERT INTO aging_data
                    (sample_id, aging_time, degradation_level, aging_conditions, spectral_changes)
                    VALUES (?, ?, ?, ?, ?)
                    """,
                    (
                        sample_id,
                        aging_point["aging_time"],
                        aging_point["degradation_level"],
                        json.dumps(metadata["aging_conditions"]),
                        json.dumps(aging_point.get("spectral_changes", {})),
                    ),
                )

                conn.commit()

            return True

        except Exception as e:
            print(f"Error storing synthetic spectrum: {e}")
            return False

    # -///////////////////////////////////////////////////
    def spectra_by_type(self, polymer_type: str, limit: int = 100) -> List[Dict]:
        """Retrieve spectra by polymer type from local database"""
        db_path = self.local_database_path / "polymer_spectra.db"

        with sqlite3.connect(db_path) as conn:
            cursor = conn.cursor()

            cursor.execute(
                """
                SELECT * FROM spectra
                WHERE polymer_type LIKE ? AND validation_status != 'rejected'
                ORDER BY quality_score DESC
                LIMIT ?
                """,
                (f"%{polymer_type}%", limit),
            )

            columns = [description[0] for description in cursor.description]
            results = [dict(zip(columns, row)) for row in cursor.fetchall()]

            return results

    # -///////////////////////////////////////////////////
    def get_weathered_samples(self, polymer_type: Optional[str] = None) -> List[Dict]:
        """Get samples with aging/weathering data"""
        db_path = self.local_database_path / "polymer_spectra.db"

        with sqlite3.connect(db_path) as conn:
            cursor = conn.cursor()

            query = """
                SELECT s.*, a.aging_time, a.degradation_level, a.aging_conditions
                FROM spectra s
                JOIN aging_data a ON s.sample_id = a.sample_id
                WHERE s.validation_status != 'rejected'
            """
            params = []

            if polymer_type:
                query += " AND s.polymer_type LIKE ?"
                params.append(f"%{polymer_type}%")

            query += " ORDER BY a.degradation_level"

            cursor.execute(query, params)

            columns = [description[0] for description in cursor.description]
            results = [dict(zip(columns, row)) for row in cursor.fetchall()]

            return results

    # -///////////////////////////////////////////////////
    def get_database_statistics(self) -> Dict[str, Any]:
        """Get statistics about the local database"""
        db_path = self.local_database_path / "polymer_spectra.db"

        with sqlite3.connect(db_path) as conn:
            cursor = conn.cursor()

            # Total spectra count
            cursor.execute("SELECT COUNT(*) FROM spectra")
            total_spectra = cursor.fetchone()[0]

            # By polymer type
            cursor.execute(
                """
                SELECT polymer_type, COUNT(*) as count
                FROM spectra
                GROUP BY polymer_type
                ORDER BY count DESC
                """
            )
            by_polymer_type = dict(cursor.fetchall())

            # By technique
            cursor.execute(
                """
                SELECT technique, COUNT(*) as count
                FROM spectra
                GROUP BY technique
                ORDER BY count DESC
                """
            )
            by_technique = dict(cursor.fetchall())

            # By validation status
            cursor.execute(
                """
                SELECT validation_status, COUNT(*) as count
                FROM spectra
                GROUP BY validation_status
                """
            )
            by_validation = dict(cursor.fetchall())

            # Average quality score
            cursor.execute(
                "SELECT AVG(quality_score) FROM spectra WHERE quality_score IS NOT NULL"
            )
            avg_quality = cursor.fetchone()[0] or 0.0

            # Aging data count
            cursor.execute("SELECT COUNT(*) FROM aging_data")
            aging_samples = cursor.fetchone()[0]

            return {
                "total_spectra": total_spectra,
                "by_polymer_type": by_polymer_type,
                "by_technique": by_technique,
                "by_validation_status": by_validation,
                "average_quality_score": avg_quality,
                "aging_samples": aging_samples,
            }