Update custom_interface_app.py
custom_interface_app.py  CHANGED  (+16 -43)
@@ -100,39 +100,12 @@ class ASR(Pretrained):
             seq.append(token)
         output = []
         return seq
-
-
-    def increase_volume(self, waveform, threshold_db=-25):
-        # Measure loudness using RMS
-        loudness_vector = librosa.feature.rms(y=waveform)
-        average_loudness = np.mean(loudness_vector)
-        average_loudness_db = librosa.amplitude_to_db(average_loudness)
-
-        print(f"Average Loudness: {average_loudness_db} dB")
-
-        # Check if loudness is below threshold and apply gain if needed
-        if average_loudness_db < threshold_db:
-            # Calculate gain needed
-            gain_db = threshold_db - average_loudness_db
-            gain = librosa.db_to_amplitude(gain_db)  # Convert dB to amplitude factor
-
-            # Apply gain to the audio signal
-            waveform = waveform * gain
-            loudness_vector = librosa.feature.rms(y=waveform)
-            average_loudness = np.mean(loudness_vector)
-            average_loudness_db = librosa.amplitude_to_db(average_loudness)
-
-            print(f"Average Loudness: {average_loudness_db} dB")
-        return waveform


     def classify_file_w2v2(self, path, device):
         # Load the audio file
         waveform, sr = librosa.load(path, sr=16000)

-        # increase the volume if needed
-        # waveform = self.increase_volume(waveform)
-
         # Get audio length in seconds
         audio_length = len(waveform) / sr

@@ -142,15 +115,16 @@ class ASR(Pretrained):
         max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
         num_segments = int(np.ceil(len(waveform) / max_duration))
         start = 0
-        end = 0
         for i in range(num_segments):
-
-            end
+            end = start + max_duration
+            if end > len(waveform):
+                end = len(waveform)
             segment_part = waveform[start:end]
             segment_len = len(segment_part) / sr
             if segment_len < 1:
                 continue
             segments.append(segment_part)
+            start = end

         for segment in segments:
             segment_tensor = torch.tensor(segment).to(device)
@@ -171,14 +145,14 @@ class ASR(Pretrained):
             outputs = self.encode_batch_w2v2(device, batch, rel_length)
             yield outputs

+
+

     def classify_file_whisper_mkd(self, path, device):
         # Load the audio file
+        # path = "long_sample.wav"
         waveform, sr = librosa.load(path, sr=16000)

-        # increase the volume if needed
-        # waveform = self.increase_volume(waveform)
-
         # Get audio length in seconds
         audio_length = len(waveform) / sr

@@ -188,22 +162,23 @@ class ASR(Pretrained):
         max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
         num_segments = int(np.ceil(len(waveform) / max_duration))
         start = 0
-        end = 0
         for i in range(num_segments):
-
-            end
+            end = start + max_duration
+            if end > len(waveform):
+                end = len(waveform)
             segment_part = waveform[start:end]
             segment_len = len(segment_part) / sr
             if segment_len < 1:
                 continue
             segments.append(segment_part)
+            start = end

         for segment in segments:
             segment_tensor = torch.tensor(segment).to(device)

             # Fake a batch for the segment
             batch = segment_tensor.unsqueeze(0).to(device)
-            rel_length = torch.tensor([1.0]).to(device)
+            rel_length = torch.tensor([1.0]).to(device)

             # Pass the segment through the ASR model
             segment_output = self.encode_batch_whisper(device, batch, rel_length)
@@ -228,9 +203,6 @@ class ASR(Pretrained):
         # Load the audio file
         waveform, sr = librosa.load(path, sr=16000)

-        # increase the volume if needed
-        # waveform = self.increase_volume(waveform)
-
         # Get audio length in seconds
         audio_length = len(waveform) / sr

@@ -240,15 +212,16 @@ class ASR(Pretrained):
         max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
         num_segments = int(np.ceil(len(waveform) / max_duration))
         start = 0
-        end = 0
         for i in range(num_segments):
-
-            end
+            end = start + max_duration
+            if end > len(waveform):
+                end = len(waveform)
             segment_part = waveform[start:end]
             segment_len = len(segment_part) / sr
             if segment_len < 1:
                 continue
             segments.append(segment_part)
+            start = end

         for segment in segments:
             segment_tensor = torch.tensor(segment).to(device)
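For reference, the segmentation logic this change introduces in all three classify_file_* methods can be illustrated as a standalone sketch. The helper name split_into_segments and the dummy waveform below are illustrative only and do not appear in the file; the 20 s maximum and 1 s minimum mirror the values in the diff, and the loop follows the same order of operations (clamp the end, skip sub-second chunks, then advance start).

import numpy as np

def split_into_segments(waveform, sr, max_seconds=20, min_seconds=1):
    """Split a 1-D waveform into chunks of at most max_seconds,
    skipping trailing chunks shorter than min_seconds (hypothetical helper,
    mirroring the loop added in this commit)."""
    max_duration = max_seconds * sr  # maximum segment length in samples
    num_segments = int(np.ceil(len(waveform) / max_duration))
    segments = []
    start = 0
    for _ in range(num_segments):
        end = start + max_duration   # tentative end of this segment
        if end > len(waveform):      # clamp the final segment to the signal length
            end = len(waveform)
        segment_part = waveform[start:end]
        segment_len = len(segment_part) / sr
        if segment_len < min_seconds:
            continue                 # drop a too-short (trailing) chunk
        segments.append(segment_part)
        start = end                  # advance the window
    return segments

# A 45 s dummy signal at 16 kHz yields two 20 s segments and one 5 s tail.
dummy = np.zeros(45 * 16000, dtype=np.float32)
print([len(s) / 16000 for s in split_into_segments(dummy, 16000)])  # [20.0, 20.0, 5.0]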