Athspi committed
Commit dbffdf4 · verified · 1 Parent(s): c885037

Update app.py

Files changed (1):
  1. app.py +84 -232
app.py CHANGED
@@ -3,16 +3,7 @@ import whisper
 import torch
 import os
 from pydub import AudioSegment, silence
-from faster_whisper import WhisperModel
-import numpy as np
-from scipy.io import wavfile
-from scipy.signal import correlate
-import tempfile
-import logging
-
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
+from faster_whisper import WhisperModel  # Import faster-whisper
 
 # Mapping of model names to Whisper model sizes
 MODELS = {
@@ -131,51 +122,32 @@ LANGUAGE_NAME_TO_CODE = {
 # Reverse mapping of language codes to full language names
 CODE_TO_LANGUAGE_NAME = {v: k for k, v in LANGUAGE_NAME_TO_CODE.items()}
 
-def convert_to_wav(audio_file):
-    """Convert any audio file to WAV format."""
-    audio = AudioSegment.from_file(audio_file)
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-        wav_path = temp_wav.name
-    audio.export(wav_path, format="wav")
-    return wav_path
-
-def resample_audio(audio_segment, target_sample_rate):
-    """Resample an audio segment to the target sample rate."""
-    return audio_segment.set_frame_rate(target_sample_rate)
-
 def detect_language(audio_file):
     """Detect the language of the audio file."""
-    if audio_file is None:
-        return "Error: No audio file uploaded."
+    # Define device and compute type for faster-whisper
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    compute_type = "float32" if device == "cuda" else "int8"
 
-    try:
-        # Define device and compute type for faster-whisper
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        compute_type = "float32" if device == "cuda" else "int8"
-
-        # Load the faster-whisper model for language detection
-        model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
-
-        # Convert audio to 16kHz mono for better compatibility
-        audio = AudioSegment.from_file(audio_file)
-        audio = audio.set_frame_rate(16000).set_channels(1)
-        processed_audio_path = "processed_audio.wav"
-        audio.export(processed_audio_path, format="wav")
-
-        # Detect the language using faster-whisper
-        segments, info = model.transcribe(processed_audio_path, task="translate", language=None)
-        detected_language_code = info.language
-
-        # Get the full language name from the code
-        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-
-        # Clean up processed audio file
-        os.remove(processed_audio_path)
-
-        return f"Detected Language: {detected_language}"
-    except Exception as e:
-        logger.error(f"Error in detect_language: {str(e)}", exc_info=True)
-        return f"Error: {str(e)}"
+    # Load the faster-whisper model for language detection
+    model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
+
+    # Convert audio to 16kHz mono for better compatibility
+    audio = AudioSegment.from_file(audio_file)
+    audio = audio.set_frame_rate(16000).set_channels(1)
+    processed_audio_path = "processed_audio.wav"
+    audio.export(processed_audio_path, format="wav")
+
+    # Detect the language using faster-whisper
+    segments, info = model.transcribe(processed_audio_path, task="translate", language=None)
+    detected_language_code = info.language
+
+    # Get the full language name from the code
+    detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+
+    # Clean up processed audio file
+    os.remove(processed_audio_path)
+
+    return f"Detected Language: {detected_language}"
 
 def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     """
@@ -189,183 +161,81 @@ def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     Returns:
         str: Path to the output audio file with silence removed.
     """
-    if audio_file is None:
-        return None
+    # Load the audio file
+    audio = AudioSegment.from_file(audio_file)
 
-    try:
-        # Convert audio to WAV format
-        wav_path = convert_to_wav(audio_file)
-
-        # Load the audio file
-        audio = AudioSegment.from_file(wav_path)
-
-        # Detect silent chunks
-        silent_chunks = silence.detect_silence(
-            audio,
-            min_silence_len=min_silence_len,
-            silence_thresh=silence_threshold
-        )
-
-        # Remove silent chunks
-        non_silent_audio = AudioSegment.empty()
-        start = 0
-        for chunk in silent_chunks:
-            non_silent_audio += audio[start:chunk[0]] # Add non-silent part
-            start = chunk[1] # Move to the end of the silent chunk
-        non_silent_audio += audio[start:] # Add the remaining part
-
-        # Export the processed audio
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
-            output_path = temp_output.name
-        non_silent_audio.export(output_path, format="wav")
-
-        # Clean up temporary WAV file
-        os.remove(wav_path)
-
-        return output_path
-    except Exception as e:
-        logger.error(f"Error in remove_silence: {str(e)}")
-        return f"Error: {str(e)}"
+    # Detect silent chunks
+    silent_chunks = silence.detect_silence(
+        audio,
+        min_silence_len=min_silence_len,
+        silence_thresh=silence_threshold
+    )
+
+    # Remove silent chunks
+    non_silent_audio = AudioSegment.empty()
+    start = 0
+    for chunk in silent_chunks:
+        non_silent_audio += audio[start:chunk[0]] # Add non-silent part
+        start = chunk[1] # Move to the end of the silent chunk
+    non_silent_audio += audio[start:] # Add the remaining part
+
+    # Export the processed audio
+    output_path = "silence_removed_audio.wav"
+    non_silent_audio.export(output_path, format="wav")
+
+    return output_path
 
-def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
-    """
-    Detect the target audio in the main audio and trim the main audio to include only the detected segments.
-
-    Args:
-        main_audio (str): Path to the main audio file.
-        target_audio (str): Path to the target audio file.
-        threshold (float): Detection threshold (0 to 1). Higher values mean stricter detection.
-
-    Returns:
-        str: Path to the trimmed audio file.
-        str: Detected timestamps in the format "start-end (in seconds)".
-    """
-    if main_audio is None or target_audio is None:
-        return None, "Error: Please upload both main and target audio files."
-
-    try:
-        # Convert audio files to WAV format
-        main_wav_path = convert_to_wav(main_audio)
-        target_wav_path = convert_to_wav(target_audio)
-
-        # Load audio files
-        main_rate, main_data = wavfile.read(main_wav_path)
-        target_rate, target_data = wavfile.read(target_wav_path)
-
-        # Ensure both audio files have the same sample rate
-        if main_rate != target_rate:
-            logger.warning(f"Sample rates differ: main_audio={main_rate}, target_audio={target_rate}. Resampling target audio.")
-            target_segment = AudioSegment.from_file(target_wav_path)
-            target_segment = resample_audio(target_segment, main_rate)
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_resampled:
-                resampled_path = temp_resampled.name
-            target_segment.export(resampled_path, format="wav")
-            target_rate, target_data = wavfile.read(resampled_path)
-
-        # Normalize audio data
-        main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
-        target_data = target_data.astype(np.float32) / np.iinfo(target_data.dtype).max
-
-        # Perform cross-correlation to detect the target audio in the main audio
-        correlation = correlate(main_data, target_data, mode='valid')
-        correlation = np.abs(correlation)
-        max_corr = np.max(correlation)
-
-        # Find the peak in the cross-correlation result
-        peak_index = np.argmax(correlation)
-        peak_value = correlation[peak_index]
-
-        # Check if the peak value exceeds the threshold
-        if peak_value < threshold * max_corr:
-            return None, "Error: Target audio not detected in the main audio."
-
-        # Calculate the start and end times of the target audio in the main audio
-        start_time = peak_index / main_rate
-        end_time = (peak_index + len(target_data)) / main_rate
-
-        # Trim the main audio to include only the detected segment
-        main_audio_segment = AudioSegment.from_file(main_wav_path)
-        start_ms = int(start_time * 1000)
-        end_ms = int(end_time * 1000)
-        trimmed_audio = main_audio_segment[start_ms:end_ms]
-
-        # Export the trimmed audio
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
-            output_path = temp_output.name
-        trimmed_audio.export(output_path, format="wav")
-
-        # Format timestamps
-        timestamps_str = f"{start_time:.2f}-{end_time:.2f}"
-
-        # Clean up temporary WAV files
-        os.remove(main_wav_path)
-        os.remove(target_wav_path)
-        if 'resampled_path' in locals():
-            os.remove(resampled_path)
-
-        return output_path, timestamps_str
-    except Exception as e:
-        logger.error(f"Error in detect_and_trim_audio: {str(e)}")
-        return None, f"Error: {str(e)}"
 
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
     """Transcribe the audio file."""
-    if audio_file is None:
-        return "Error: No audio file uploaded."
-
-    try:
-        # Convert audio to 16kHz mono for better compatibility
-        audio = AudioSegment.from_file(audio_file)
-        audio = audio.set_frame_rate(16000).set_channels(1)
-        processed_audio_path = "processed_audio.wav"
-        audio.export(processed_audio_path, format="wav")
-
-        # Load the appropriate model
-        if model_size == "Faster Whisper Large v3":
-            # Define device and compute type for faster-whisper
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            compute_type = "float32" if device == "cuda" else "int8"
-
-            # Use faster-whisper for the Systran model
-            model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
-            segments, info = model.transcribe(
-                processed_audio_path,
-                task="transcribe",
-                word_timestamps=True,
-                repetition_penalty=1.1,
-                temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
-            )
-            transcription = " ".join([segment.text for segment in segments])
-            detected_language_code = info.language
-            detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-        else:
-            # Use the standard Whisper model
-            model = whisper.load_model(MODELS[model_size])
-
-            # Transcribe the audio
-            if language == "Auto Detect":
-                result = model.transcribe(processed_audio_path, fp16=False) # Auto-detect language
-                detected_language_code = result.get("language", "unknown")
-                detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-            else:
-                language_code = LANGUAGE_NAME_TO_CODE.get(language, "en") # Default to English if not found
-                result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
-                detected_language = language
-
-            transcription = result["text"]
-
-        # Clean up processed audio file
-        os.remove(processed_audio_path)
-
-        # Return transcription and detected language
-        return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
-    except Exception as e:
-        logger.error(f"Error in transcribe_audio: {str(e)}")
-        return f"Error: {str(e)}"
+    # Convert audio to 16kHz mono for better compatibility
+    audio = AudioSegment.from_file(audio_file)
+    audio = audio.set_frame_rate(16000).set_channels(1)
+    processed_audio_path = "processed_audio.wav"
+    audio.export(processed_audio_path, format="wav")
+
+    # Load the appropriate model
+    if model_size == "Faster Whisper Large v3":
+        # Define device and compute type for faster-whisper
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        compute_type = "float32" if device == "cuda" else "int8"
+
+        # Use faster-whisper for the Systran model
+        model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
+        segments, info = model.transcribe(
+            processed_audio_path,
+            task="transcribe",
+            word_timestamps=True,
+            repetition_penalty=1.1,
+            temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
+        )
+        transcription = " ".join([segment.text for segment in segments])
+        detected_language_code = info.language
+        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+    else:
+        # Use the standard Whisper model
+        model = whisper.load_model(MODELS[model_size])
+
+        # Transcribe the audio
+        if language == "Auto Detect":
+            result = model.transcribe(processed_audio_path, fp16=False) # Auto-detect language
+            detected_language_code = result.get("language", "unknown")
+            detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+        else:
+            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en") # Default to English if not found
+            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+            detected_language = language
+
+        transcription = result["text"]
+
+    # Clean up processed audio file
+    os.remove(processed_audio_path)
+
+    # Return transcription and detected language
+    return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
 
 # Define the Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Audio Processing Tool")
+    gr.Markdown("# Audio Transcription and Language Detector") # Updated title
 
     with gr.Tab("Detect Language"):
         gr.Markdown("Upload an audio file to detect its language.")
@@ -406,19 +276,6 @@ with gr.Blocks() as demo:
         silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
         silence_button = gr.Button("Remove Silence")
 
-    with gr.Tab("Detect and Trim Audio"):
-        gr.Markdown("Upload a main audio file and a target audio file. The app will detect the target audio in the main audio and trim it.")
-        main_audio_input = gr.Audio(type="filepath", label="Upload Main Audio File")
-        target_audio_input = gr.Audio(type="filepath", label="Upload Target Audio File")
-        threshold_slider = gr.Slider(
-            minimum=0.1, maximum=1.0, value=0.5, step=0.1,
-            label="Detection Threshold",
-            info="Higher values mean stricter detection."
-        )
-        trimmed_audio_output = gr.Audio(label="Trimmed Audio", type="filepath")
-        timestamps_output = gr.Textbox(label="Detected Timestamps (in seconds)")
-        detect_button = gr.Button("Detect and Trim")
-
     # Link buttons to functions
     detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
@@ -431,11 +288,6 @@ with gr.Blocks() as demo:
         inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
         outputs=silence_output
     )
-    detect_button.click(
-        detect_and_trim_audio,
-        inputs=[main_audio_input, target_audio_input, threshold_slider],
-        outputs=[trimmed_audio_output, timestamps_output]
-    )
 
 # Launch the Gradio interface
 demo.launch()
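
For the non-faster-whisper choices, transcribe_audio() falls back to the standard openai-whisper API, where transcribe() returns a dict containing both the text and the detected language code. A minimal sketch of that branch in isolation (the "base" model size and sample.wav path are placeholders; the app maps its UI names to sizes via MODELS):

import whisper

model = whisper.load_model("base")                   # placeholder size; the app uses MODELS[model_size]
result = model.transcribe("sample.wav", fp16=False)  # fp16=False matches the call in this commit
print(result.get("language", "unknown"))             # detected language code, e.g. "en"
print(result["text"])                                # the transcription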
 
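
The Gradio wiring that remains after this commit follows the usual Blocks pattern: each tab declares its input and output components, and a button's click() call maps them onto one of the functions above. A minimal self-contained sketch of that pattern (toy echo function; not code from this commit):

import gradio as gr

def echo(text):
    return text

with gr.Blocks() as demo:
    box_in = gr.Textbox(label="Input")
    box_out = gr.Textbox(label="Output")
    run_button = gr.Button("Run")
    run_button.click(echo, inputs=box_in, outputs=box_out)  # same click wiring the app uses

if __name__ == "__main__":
    demo.launch()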