Porjaz committed (verified)
Commit 51c1dfe · 1 Parent(s): ef931c3

Update custom_interface_app.py

Files changed (1):
  1. custom_interface_app.py (+16 -43)
custom_interface_app.py CHANGED
@@ -100,39 +100,12 @@ class ASR(Pretrained):
             seq.append(token)
         output = []
         return seq
-
-
-    def increase_volume(self, waveform, threshold_db=-25):
-        # Measure loudness using RMS
-        loudness_vector = librosa.feature.rms(y=waveform)
-        average_loudness = np.mean(loudness_vector)
-        average_loudness_db = librosa.amplitude_to_db(average_loudness)
-
-        print(f"Average Loudness: {average_loudness_db} dB")
-
-        # Check if loudness is below threshold and apply gain if needed
-        if average_loudness_db < threshold_db:
-            # Calculate gain needed
-            gain_db = threshold_db - average_loudness_db
-            gain = librosa.db_to_amplitude(gain_db)  # Convert dB to amplitude factor
-
-            # Apply gain to the audio signal
-            waveform = waveform * gain
-            loudness_vector = librosa.feature.rms(y=waveform)
-            average_loudness = np.mean(loudness_vector)
-            average_loudness_db = librosa.amplitude_to_db(average_loudness)
-
-            print(f"Average Loudness: {average_loudness_db} dB")
-        return waveform
 
 
     def classify_file_w2v2(self, path, device):
         # Load the audio file
         waveform, sr = librosa.load(path, sr=16000)
 
-        # increase the volume if needed
-        # waveform = self.increase_volume(waveform)
-
         # Get audio length in seconds
         audio_length = len(waveform) / sr
 
@@ -142,15 +115,16 @@ class ASR(Pretrained):
         max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
         num_segments = int(np.ceil(len(waveform) / max_duration))
         start = 0
-        end = 0
         for i in range(num_segments):
-            start = start + end
-            end = start + max_duration * sr
+            end = start + max_duration
+            if end > len(waveform):
+                end = len(waveform)
             segment_part = waveform[start:end]
             segment_len = len(segment_part) / sr
             if segment_len < 1:
                 continue
             segments.append(segment_part)
+            start = end
 
         for segment in segments:
             segment_tensor = torch.tensor(segment).to(device)
@@ -171,14 +145,14 @@ class ASR(Pretrained):
             outputs = self.encode_batch_w2v2(device, batch, rel_length)
             yield outputs
 
+
+
 
     def classify_file_whisper_mkd(self, path, device):
         # Load the audio file
+        # path = "long_sample.wav"
         waveform, sr = librosa.load(path, sr=16000)
 
-        # increase the volume if needed
-        # waveform = self.increase_volume(waveform)
-
         # Get audio length in seconds
         audio_length = len(waveform) / sr
 
@@ -188,22 +162,23 @@ class ASR(Pretrained):
         max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
         num_segments = int(np.ceil(len(waveform) / max_duration))
         start = 0
-        end = 0
         for i in range(num_segments):
-            start = start + end
-            end = start + max_duration * sr
+            end = start + max_duration
+            if end > len(waveform):
+                end = len(waveform)
             segment_part = waveform[start:end]
             segment_len = len(segment_part) / sr
             if segment_len < 1:
                 continue
             segments.append(segment_part)
+            start = end
 
         for segment in segments:
             segment_tensor = torch.tensor(segment).to(device)
 
             # Fake a batch for the segment
             batch = segment_tensor.unsqueeze(0).to(device)
-            rel_length = torch.tensor([1.0]).to(device)  # Adjust if necessary
+            rel_length = torch.tensor([1.0]).to(device)
 
             # Pass the segment through the ASR model
             segment_output = self.encode_batch_whisper(device, batch, rel_length)
@@ -228,9 +203,6 @@ class ASR(Pretrained):
         # Load the audio file
         waveform, sr = librosa.load(path, sr=16000)
 
-        # increase the volume if needed
-        # waveform = self.increase_volume(waveform)
-
         # Get audio length in seconds
         audio_length = len(waveform) / sr
 
@@ -240,15 +212,16 @@ class ASR(Pretrained):
         max_duration = 20 * sr  # Maximum segment duration in samples (20 seconds)
         num_segments = int(np.ceil(len(waveform) / max_duration))
         start = 0
-        end = 0
         for i in range(num_segments):
-            start = start + end
-            end = start + max_duration * sr
+            end = start + max_duration
+            if end > len(waveform):
+                end = len(waveform)
             segment_part = waveform[start:end]
             segment_len = len(segment_part) / sr
             if segment_len < 1:
                 continue
             segments.append(segment_part)
+            start = end
 
         for segment in segments:
             segment_tensor = torch.tensor(segment).to(device)
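Note on the fix: the old segmentation loop initialized end = 0 and then computed start = start + end followed by end = start + max_duration * sr. Because max_duration is already 20 * sr samples, the extra * sr factor overshoots by 16,000x, so the first slice spanned the entire file and every later slice was empty; long files were never actually chunked. The corrected loop clamps end to len(waveform) and advances start = end after appending. A minimal standalone sketch of the same chunking logic (the chunk_waveform helper is hypothetical, not part of this file, and advances start unconditionally):

import numpy as np

def chunk_waveform(waveform, sr=16000, max_seconds=20, min_seconds=1):
    # Split a 1-D waveform into consecutive chunks of at most max_seconds,
    # dropping any chunk shorter than min_seconds (mirrors the fixed loop).
    max_duration = max_seconds * sr  # chunk size in samples
    num_segments = int(np.ceil(len(waveform) / max_duration))
    segments = []
    start = 0
    for _ in range(num_segments):
        end = min(start + max_duration, len(waveform))
        segment = waveform[start:end]
        if len(segment) / sr >= min_seconds:
            segments.append(segment)
        start = end
    return segments

# For a 50 s clip at 16 kHz this yields boundaries [0:320000],
# [320000:640000], [640000:800000]; the trailing 10 s chunk is kept,
# while a sub-second remainder would be dropped, matching the
# "if segment_len < 1: continue" check in the file.

Since each classify_file_* method yields per-segment outputs, callers can iterate over transcriptions as they are produced rather than waiting for the whole file.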
 
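For reference, the removed increase_volume helper was a simple RMS make-up-gain step: measure average loudness with librosa.feature.rms, convert it to dB with librosa.amplitude_to_db, and, if it sits below threshold_db, multiply the waveform by librosa.db_to_amplitude(threshold_db - average_loudness_db). Since db_to_amplitude(x) equals 10 ** (x / 20), the arithmetic works out as in this sketch (the measured loudness value is illustrative, not taken from the source):

# Worked example of the removed gain calculation (illustrative values)
average_loudness_db = -40.0                   # measured RMS loudness in dB
threshold_db = -25.0                          # default of the removed helper
gain_db = threshold_db - average_loudness_db  # 15 dB of make-up gain
gain = 10 ** (gain_db / 20)                   # what librosa.db_to_amplitude computes
print(round(gain, 2))                         # 5.62, i.e. waveform * 5.62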