Athspi committed
Commit 2eebdd2 · verified · 1 Parent(s): 4f33135

Update app.py

Files changed (1)
  1. app.py +70 -8
app.py CHANGED
@@ -260,6 +260,69 @@ def detect_voice_activity(audio_file, threshold=0.02):
 
     return output_path
 
+def detect_and_trim_audio(audio_file, threshold=0.02):
+    """
+    Detect voice activity in the audio file, trim the audio to include only voice segments,
+    and return the timestamps of the detected segments.
+
+    Args:
+        audio_file (str): Path to the input audio file.
+        threshold (float): Amplitude threshold for voice detection. Default is 0.02.
+
+    Returns:
+        str: Path to the output audio file with only voice segments.
+        list: List of timestamps (start, end) for the detected segments.
+    """
+    # Convert the input audio to WAV format
+    wav_path = convert_to_wav(audio_file)
+
+    # Load the WAV file
+    sample_rate, data = wavfile.read(wav_path)
+
+    # If the audio is stereo, convert it to mono by averaging the channels
+    if len(data.shape) > 1:
+        data = np.mean(data, axis=1)
+
+    # Normalize the audio data to the range [-1, 1]
+    if data.dtype != np.float32:
+        data = data.astype(np.float32) / np.iinfo(data.dtype).max
+
+    # Detect voice activity
+    voice_segments = []
+    is_voice = False
+    start = 0
+    for i, sample in enumerate(data):
+        if abs(sample) > threshold and not is_voice:
+            is_voice = True
+            start = i
+        elif abs(sample) <= threshold and is_voice:
+            is_voice = False
+            voice_segments.append((start, i))
+
+    # If the last segment is voice, add it
+    if is_voice:
+        voice_segments.append((start, len(data)))
+
+    # Trim the audio to include only voice segments
+    trimmed_audio = np.array([], dtype=np.float32)
+    for segment in voice_segments:
+        trimmed_audio = np.concatenate((trimmed_audio, data[segment[0]:segment[1]]))
+
+    # Convert the trimmed audio back to 16-bit integer format
+    trimmed_audio_int16 = np.int16(trimmed_audio * 32767)
+
+    # Export the trimmed audio
+    output_path = "voice_trimmed_audio.wav"
+    wavfile.write(output_path, sample_rate, trimmed_audio_int16)
+
+    # Calculate timestamps in seconds
+    timestamps = [(start / sample_rate, end / sample_rate) for start, end in voice_segments]
+
+    # Clean up the converted WAV file
+    os.remove(wav_path)
+
+    return output_path, timestamps
+
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
     """Transcribe the audio file."""
     # Convert audio to 16kHz mono for better compatibility
@@ -352,16 +415,15 @@ with gr.Blocks() as demo:
         silence_button = gr.Button("Remove Silence")
 
     with gr.Tab("Voice Detection and Trimming"):
-        gr.Markdown("Upload two audio files to detect voice activity and trim the audio.")
-        voice_audio_input1 = gr.Audio(type="filepath", label="Upload Audio File 1")
-        voice_audio_input2 = gr.Audio(type="filepath", label="Upload Audio File 2")
+        gr.Markdown("Upload an audio file to detect voice activity and trim the audio.")
+        voice_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
         voice_threshold_slider = gr.Slider(
             minimum=0.01, maximum=0.1, value=0.02, step=0.01,
             label="Voice Detection Threshold",
             info="Higher values detect louder sounds as voice."
         )
-        voice_output1 = gr.Audio(label="Trimmed Audio 1", type="filepath")
-        voice_output2 = gr.Audio(label="Trimmed Audio 2", type="filepath")
+        voice_output = gr.Audio(label="Trimmed Audio", type="filepath")
+        timestamps_output = gr.Textbox(label="Detected Timestamps (seconds)")
         voice_button = gr.Button("Detect and Trim Voice")
 
     # Link buttons to functions
@@ -377,9 +439,9 @@
         outputs=silence_output
     )
     voice_button.click(
-        lambda audio1, audio2, threshold: (detect_voice_activity(audio1, threshold), detect_voice_activity(audio2, threshold)),
-        inputs=[voice_audio_input1, voice_audio_input2, voice_threshold_slider],
-        outputs=[voice_output1, voice_output2]
+        detect_and_trim_audio,
+        inputs=[voice_audio_input, voice_threshold_slider],
+        outputs=[voice_output, timestamps_output]
     )
 
 # Launch the Gradio interface
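
For a quick sanity check outside the Gradio UI, the new function can be called directly in app.py's environment; the input file name and printed timestamps below are illustrative only:

    # Illustrative call (hypothetical input file "sample.mp3"):
    output_path, timestamps = detect_and_trim_audio("sample.mp3", threshold=0.02)
    print(output_path)   # -> voice_trimmed_audio.wav
    print(timestamps)    # -> e.g. [(0.52, 1.87), (2.40, 3.10)]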
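
One caveat worth flagging in detect_and_trim_audio: for stereo input, np.mean(data, axis=1) produces float64 samples, so the subsequent np.iinfo(data.dtype) call raises ValueError, because np.iinfo only accepts integer dtypes. A minimal sketch of a safer load step that normalizes before the downmix (load_normalized_mono is a hypothetical helper, not part of this commit):

    import numpy as np
    from scipy.io import wavfile

    def load_normalized_mono(wav_path):
        """Hypothetical helper: load a WAV file as mono float32 in [-1, 1]."""
        sample_rate, data = wavfile.read(wav_path)
        # Normalize while the samples still have their integer dtype;
        # np.mean() on integer input returns float64, which np.iinfo() rejects.
        if np.issubdtype(data.dtype, np.integer):
            data = data.astype(np.float32) / np.iinfo(data.dtype).max
        else:
            data = data.astype(np.float32)
        # Downmix stereo (or multi-channel) audio to mono afterwards.
        if data.ndim > 1:
            data = data.mean(axis=1)
        return sample_rate, data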
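
Separately, the per-sample Python loop is correct but slow on long recordings, since it executes one interpreter-level iteration per sample. The same segment boundaries can be found with NumPy edge detection; find_voice_segments below is a sketch of that approach, not part of the commit:

    import numpy as np

    def find_voice_segments(data, threshold=0.02):
        """Hypothetical vectorized equivalent of the detection loop:
        returns (start, end) sample index pairs where |data| > threshold."""
        mask = np.abs(data) > threshold
        # Pad with False on both ends so every voice run has a rising and
        # a falling edge, then locate the edges with diff().
        padded = np.concatenate(([False], mask, [False])).astype(np.int8)
        edges = np.diff(padded)
        starts = np.flatnonzero(edges == 1)    # False -> True transitions
        ends = np.flatnonzero(edges == -1)     # True -> False transitions
        return list(zip(starts.tolist(), ends.tolist()))

With the segments in hand, the trimmed signal could then be assembled in a single np.concatenate([data[s:e] for s, e in segments]) call, rather than re-concatenating inside the loop, which copies the growing array once per segment.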