JoshKeesee committed on
Commit
682a8f7
·
verified ·
1 Parent(s): b0bed5d

Upload spectro.py

Browse files
Files changed (1) hide show
  1. spectro.py +221 -0
spectro.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Audio processing tools to convert between spectrogram images and waveforms.
3
+ """
4
+ import io
5
+ import typing as T
6
+
7
+ import numpy as np
8
+ from PIL import Image
9
+ import pydub
10
+ from scipy.io import wavfile
11
+ import torch
12
+ import torchaudio
13
+
14
+
15
def wav_bytes_from_spectrogram_image(image: Image.Image) -> T.Tuple[io.BytesIO, float]:
    """
    Reconstruct a WAV audio clip from a spectrogram image. Also returns the duration in seconds.

    The image is decoded with spectrogram_from_image, turned back into a waveform with
    waveform_from_spectrogram, and written as 16-bit PCM into an in-memory buffer.

    Args:
        image: Spectrogram image to decode.

    Returns:
        A tuple of (buffer holding the WAV data, rewound to position 0; clip duration
        in seconds, derived from the number of reconstructed samples).
    """
    print("Starting reconstruction of WAV from spectrogram image...")

    max_volume = 50
    power_for_image = 0.25
    Sxx = spectrogram_from_image(image, max_volume=max_volume, power_for_image=power_for_image)

    sample_rate = 44100  # [Hz]
    clip_duration_ms = 10000  # [ms]

    bins_per_image = image.height
    # The decoded array is laid out as (height, width) and the mel inversion requires
    # n_mels to equal its row count, so the height (not the width) is the mel-bin count.
    # Using the width only worked for square images.
    n_mels = image.height

    # FFT parameters
    window_duration_ms = 100  # [ms]
    padded_duration_ms = 400  # [ms]
    step_size_ms = 10  # [ms]

    # Derived parameters
    # NOTE(review): this multiplies a millisecond count by the sample rate, so it does
    # not look like a true sample count. waveform_from_spectrogram, as written in this
    # file, never reads num_samples, so the value does not affect the output.
    # TODO confirm the intended formula.
    num_samples = int(image.width / float(bins_per_image) * clip_duration_ms) * sample_rate
    n_fft = int(padded_duration_ms / 1000.0 * sample_rate)
    hop_length = int(step_size_ms / 1000.0 * sample_rate)
    win_length = int(window_duration_ms / 1000.0 * sample_rate)

    print("Computing waveform from spectrogram...")
    samples = waveform_from_spectrogram(
        Sxx=Sxx,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        num_samples=num_samples,
        sample_rate=sample_rate,
        mel_scale=True,
        n_mels=n_mels,
        max_mel_iters=200,
        num_griffin_lim_iters=32,
    )

    print("Writing WAV bytes...")
    # Saturate to the int16 range before casting: an unclipped float -> int16 cast wraps
    # out-of-range samples around instead of limiting them.
    pcm_range = np.iinfo(np.int16)
    pcm_samples = np.clip(samples, pcm_range.min, pcm_range.max).astype(np.int16)

    wav_bytes = io.BytesIO()
    wavfile.write(wav_bytes, sample_rate, pcm_samples)
    wav_bytes.seek(0)

    duration_s = float(len(samples)) / sample_rate

    print("Reconstruction complete.")
    return wav_bytes, duration_s
65
+
66
+
67
def spectrogram_from_image(
    image: Image.Image, max_volume: float = 50, power_for_image: float = 0.25
) -> np.ndarray:
    """
    Compute a spectrogram magnitude array from a spectrogram image.

    This is the reverse of image_from_spectrogram: the pixel values are flipped
    vertically, inverted, rescaled so that 255 maps to max_volume, and raised to
    1 / power_for_image to undo the power curve.

    Args:
        image: Spectrogram image. If it has several channels (e.g. RGB), only the
            first channel is used.
        max_volume: Value that the brightest (after inversion) pixel is rescaled to
            before the power curve is reversed.
        power_for_image: Exponent that was applied when the image was produced.

    Returns:
        A 2-D float array with one row per image row and one column per image column.
    """
    print("Converting image to spectrogram...")

    # Convert to a numpy array of floats
    data = np.array(image).astype(np.float32)

    # Take a single channel: multi-channel images (such as the RGB output of
    # image_from_spectrogram) would otherwise yield a 3-D array that the downstream
    # spectrogram code cannot consume.
    if data.ndim == 3:
        data = data[:, :, 0]

    # Flip Y
    data = data[::-1, :]

    # Invert
    data = 255 - data

    # Rescale to max volume
    data = data * max_volume / 255

    # Reverse the power curve
    data = np.power(data, 1 / power_for_image)

    print("Conversion complete.")
    return data
93
+
94
+
95
def spectrogram_from_waveform(
    waveform: np.ndarray,
    sample_rate: int,
    n_fft: int,
    hop_length: int,
    win_length: int,
    mel_scale: bool = True,
    n_mels: int = 512,
) -> np.ndarray:
    """
    Compute a magnitude spectrogram from a waveform.

    The waveform is cast to float32, run through a torchaudio Spectrogram transform,
    and reduced to magnitudes. When mel_scale is true the result is additionally
    passed through a torchaudio MelScale transform (HTK scale, 0-10000 Hz, no
    normalization) with n_mels bins.
    """
    print("Computing spectrogram from waveform...")

    # power=None is passed so the magnitude is taken explicitly below.
    stft = torchaudio.transforms.Spectrogram(
        n_fft=n_fft,
        power=None,
        hop_length=hop_length,
        win_length=win_length,
    )

    # The transform is fed a single-row batch; row 0 of the result is kept.
    batched = torch.from_numpy(waveform.astype(np.float32)).reshape(1, -1)
    magnitudes = np.abs(stft(batched).numpy()[0])

    if mel_scale:
        to_mel = torchaudio.transforms.MelScale(
            n_mels=n_mels,
            sample_rate=sample_rate,
            f_min=0,
            f_max=10000,
            n_stft=n_fft // 2 + 1,
            norm=None,
            mel_scale="htk",
        )
        magnitudes = to_mel(torch.from_numpy(magnitudes)).numpy()

    print("Spectrogram computation complete.")
    return magnitudes
136
+
137
+
138
def waveform_from_spectrogram(
    Sxx: np.ndarray,
    n_fft: int,
    hop_length: int,
    win_length: int,
    num_samples: int,
    sample_rate: int,
    mel_scale: bool = True,
    n_mels: int = 768,
    max_mel_iters: int = 200,
    num_griffin_lim_iters: int = 32,
    device: str = "cpu",
) -> np.ndarray:
    """
    Reconstruct a waveform from a spectrogram.

    Approximate inverse of spectrogram_from_waveform: when mel_scale is true the
    input is first mapped back with a torchaudio InverseMelScale transform, then the
    Griffin-Lim algorithm estimates the missing phase and produces the samples.

    num_samples and max_mel_iters are accepted but are not referenced in the body.
    """
    print("Reconstructing waveform from spectrogram...")

    spec = torch.from_numpy(Sxx).to(device)

    if mel_scale:
        # Same mel settings as the forward transform in spectrogram_from_waveform.
        inverse_mel = torchaudio.transforms.InverseMelScale(
            n_mels=n_mels,
            sample_rate=sample_rate,
            f_min=0,
            f_max=10000,
            n_stft=n_fft // 2 + 1,
            norm=None,
            mel_scale="htk",
        ).to(device)
        spec = inverse_mel(spec)

    phase_estimator = torchaudio.transforms.GriffinLim(
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        power=1.0,
        n_iter=num_griffin_lim_iters,
    ).to(device)

    # Bring the result back to host memory before converting to numpy.
    samples = phase_estimator(spec).cpu().numpy()

    print("Waveform reconstruction complete.")
    return samples
185
+
186
+
187
def mp3_bytes_from_wav_bytes(wav_bytes: io.BytesIO) -> io.BytesIO:
    """
    Re-encode an in-memory WAV clip as MP3 using pydub.

    Returns a new buffer holding the MP3 data, rewound to position 0.
    """
    print("Converting WAV bytes to MP3 bytes...")

    segment = pydub.AudioSegment.from_wav(wav_bytes)

    encoded = io.BytesIO()
    segment.export(encoded, format="mp3")
    # Rewind so callers can read the buffer from the start.
    encoded.seek(0)

    print("Conversion complete.")
    return encoded
195
+
196
def image_from_spectrogram(
    spectrogram: np.ndarray, max_volume: float = 50, power_for_image: float = 0.25
) -> Image.Image:
    """
    Compute a spectrogram image from a spectrogram magnitude array.

    This is the reverse of spectrogram_from_image: the magnitudes are compressed with
    the power curve, rescaled so that max_volume maps to 255, inverted, flipped
    vertically, and returned as an RGB image.

    Args:
        spectrogram: Spectrogram magnitude array.
        max_volume: Compressed magnitude that maps to the full 0-255 pixel range.
            Larger values are saturated rather than wrapped.
        power_for_image: Exponent of the power curve applied to the magnitudes.

    Returns:
        The spectrogram rendered as an RGB PIL image.
    """
    print("Converting spectrogram to image...")

    # Apply the power curve
    data = np.power(spectrogram, power_for_image)

    # Rescale to 0-255
    data = data * 255 / max_volume

    # Invert
    data = 255 - data

    # Saturate to the valid pixel range: magnitudes above
    # max_volume ** (1 / power_for_image) go negative after inversion, and an
    # unclipped float -> uint8 cast would wrap them instead of limiting them.
    data = np.clip(data, 0, 255)

    # Convert to a PIL image
    image = Image.fromarray(data.astype(np.uint8))

    # Flip Y
    image = image.transpose(Image.FLIP_TOP_BOTTOM)

    # Convert to RGB
    image = image.convert("RGB")

    print("Conversion complete.")
    return image