devjas1 committed on
Commit
a602039
·
1 Parent(s): aecd727

(FEAT/DATASETS)[Demo Dataset Generation Script for ML Training]: Automate creation of synthetic polymer spectra for stable and weathered classes

Browse files

- Script generates realistic synthetic spectra for both stable and weathered polymers, simulating noise, peak broadening, and chemical changes.
- Saves output in organized directory structure compatible with training pipeline, enabling quick prototyping and testing.
- Provides clear progress feedback and summary of dataset location and sample counts for user convenience.

Files changed (1) hide show
  1. scripts/create_demo_dataset.py +141 -0
scripts/create_demo_dataset.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Generate demo datasets for testing the training functionality.
3
+ """
4
+
5
+ import numpy as np
6
+ from pathlib import Path
7
+ import sys
8
+ import os
9
+
10
+ # Add project root to path
11
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
12
+
13
+
14
def generate_synthetic_spectrum(
    wavenumbers, base_intensity=0.5, noise_level=0.05, peaks=None, rng=None
):
    """Generate a synthetic spectrum: flat baseline + Gaussian peaks + noise.

    Args:
        wavenumbers: Array-like of x-axis positions. Integer or list input is
            converted to float so the baseline and peaks are not truncated.
        base_intensity: Constant baseline value added at every point.
        noise_level: Standard deviation of the zero-mean Gaussian noise.
        peaks: Iterable of ``(center, height, width)`` tuples, each rendered as
            ``height * exp(-((x - center) / width) ** 2)``. ``None`` selects
            three default peaks at 1000, 1500 and 2000.
        rng: Optional ``numpy.random.Generator`` used for the noise. When
            ``None`` the global ``np.random`` state is used.

    Returns:
        Floating-point ``numpy.ndarray`` with the same shape as
        ``wavenumbers``; every value is at least 0.01.
    """
    wavenumbers = np.asarray(wavenumbers)
    if not np.issubdtype(wavenumbers.dtype, np.floating):
        # np.full_like copies the input dtype; on an integer axis that would
        # truncate the baseline and make the in-place float additions below
        # fail, so promote to float first.
        wavenumbers = wavenumbers.astype(float)

    spectrum = np.full_like(wavenumbers, base_intensity)

    if peaks is None:
        # (center, height, width)
        peaks = [
            (1000, 0.3, 50),
            (1500, 0.5, 80),
            (2000, 0.2, 40),
        ]

    for center, height, width in peaks:
        spectrum += height * np.exp(-(((wavenumbers - center) / width) ** 2))

    # Additive Gaussian noise, one independent draw per point.
    draw_normal = np.random.normal if rng is None else rng.normal
    spectrum += draw_normal(0, noise_level, wavenumbers.shape)

    # Ensure strictly positive values.
    return np.maximum(spectrum, 0.01)
39
+
40
+
41
def create_demo_datasets(output_dir="datasets/demo_dataset", n_samples=20, seed=None):
    """Create the two-class (stable / weathered) demo dataset on disk.

    For each class, ``n_samples`` synthetic spectra are generated with
    randomly jittered peak parameters and written as two-column
    (wavenumber, intensity) text files named ``<class>_sample_NN.txt`` under
    ``<output_dir>/<class>/``.

    Args:
        output_dir: Dataset root directory; a relative path is resolved
            against the current working directory. Created if missing.
        n_samples: Number of spectra to generate per class.
        seed: Optional seed for NumPy's global random state, which is what
            the jitter below and ``generate_synthetic_spectrum`` draw from.
            ``None`` leaves the global state untouched.
    """
    if seed is not None:
        np.random.seed(seed)

    # Define wavenumber range (typical for Raman)
    wavenumbers = np.linspace(400, 3500, 200)
    dataset_root = Path(output_dir)

    # Per-class generation parameters.
    #   base:  (mean, std) of the baseline intensity
    #   noise: noise_level passed to generate_synthetic_spectrum
    #   peaks: (center, center_std, height, height_std, width, width_std)
    class_specs = {
        # Stable polymers - higher intensity, sharper peaks
        "stable": {
            "base": (0.4, 0.05),
            "noise": 0.02,
            "peaks": [
                (800, 20, 0.4, 0.05, 30, 5),
                (1200, 30, 0.6, 0.08, 40, 8),
                (1600, 25, 0.3, 0.04, 35, 6),
                (2900, 40, 0.8, 0.1, 60, 10),
            ],
        },
        # Weathered polymers - lower intensity, broader peaks,
        # additional oxidation peaks
        "weathered": {
            "base": (0.25, 0.04),
            "noise": 0.03,
            "peaks": [
                (800, 30, 0.2, 0.04, 45, 10),
                (1200, 40, 0.3, 0.06, 55, 12),
                (1600, 35, 0.15, 0.03, 50, 8),
                (1720, 20, 0.25, 0.04, 40, 7),  # Oxidation peak
                (2900, 50, 0.4, 0.08, 80, 15),
            ],
        },
    }

    class_dirs = {}
    for label, spec in class_specs.items():
        class_dir = dataset_root / label
        class_dir.mkdir(parents=True, exist_ok=True)
        class_dirs[label] = class_dir

        print(f"Generating {label} polymer samples...")
        base_mean, base_std = spec["base"]
        for i in range(n_samples):
            # Draw order (center, height, width per peak, then baseline) is
            # kept fixed so a given seed always yields the same dataset.
            peaks = [
                (
                    center + np.random.normal(0, center_std),
                    height + np.random.normal(0, height_std),
                    width + np.random.normal(0, width_std),
                )
                for center, center_std, height, height_std, width, width_std in spec[
                    "peaks"
                ]
            ]

            spectrum = generate_synthetic_spectrum(
                wavenumbers,
                base_intensity=base_mean + np.random.normal(0, base_std),
                noise_level=spec["noise"],
                peaks=peaks,
            )

            # Save as two-column format
            data = np.column_stack([wavenumbers, spectrum])
            np.savetxt(class_dir / f"{label}_sample_{i:02d}.txt", data, fmt="%.6f")

    # Counts come from the directory contents, so files left over from an
    # earlier run are included in the totals.
    stable_count = sum(1 for _ in class_dirs["stable"].glob("*.txt"))
    weathered_count = sum(1 for _ in class_dirs["weathered"].glob("*.txt"))

    print("✅ Demo dataset created:")
    print(f" Stable samples: {stable_count}")
    print(f" Weathered samples: {weathered_count}")
    print(f" Location: {dataset_root.as_posix()}/")
138
+
139
+
140
if __name__ == "__main__":
    # Script entry point: build the demo dataset with the default settings.
    create_demo_datasets()