Athspi committed
Commit 573f5cd · verified · 1 Parent(s): dc75979

Update app.py

Files changed (1):
  app.py +49 -74
app.py CHANGED
@@ -2,11 +2,10 @@ import gradio as gr
 import whisper
 import torch
 import os
-import numpy as np
 from pydub import AudioSegment, silence
 from faster_whisper import WhisperModel # Import faster-whisper
-import noisereduce as nr # Import noisereduce for background noise removal
-from spleeter.separator import Separator # Import Spleeter for music separation
+import numpy as np
+from scipy.io import wavfile

 # Mapping of model names to Whisper model sizes
 MODELS = {
@@ -188,63 +187,48 @@ def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):

     return output_path

-def remove_background_noise(audio_file, noise_reduce_level=0.5):
+def detect_voice_activity(audio_file, threshold=0.02):
     """
-    Remove background noise from the audio file using AI-based noise reduction.
+    Detect voice activity in the audio file and trim the audio to include only voice segments.

     Args:
         audio_file (str): Path to the input audio file.
-        noise_reduce_level (float): Noise reduction level (0.0 to 1.0). Default is 0.5.
+        threshold (float): Amplitude threshold for voice detection. Default is 0.02.

     Returns:
-        str: Path to the output audio file with background noise removed.
+        str: Path to the output audio file with only voice segments.
     """
     # Load the audio file
-    audio = AudioSegment.from_file(audio_file)
-
-    # Convert audio to numpy array for noisereduce
-    samples = np.array(audio.get_array_of_samples())
-    sample_rate = audio.frame_rate
-
-    # Perform noise reduction
-    reduced_noise = nr.reduce_noise(
-        y=samples,
-        sr=sample_rate,
-        prop_decrease=noise_reduce_level
-    )
+    sample_rate, data = wavfile.read(audio_file)

-    # Convert back to AudioSegment
-    reduced_audio = AudioSegment(
-        reduced_noise.tobytes(),
-        frame_rate=sample_rate,
-        sample_width=audio.sample_width,
-        channels=audio.channels
-    )
-
-    # Export the processed audio
-    output_path = "noise_reduced_audio.wav"
-    reduced_audio.export(output_path, format="wav")
+    # Normalize the audio data
+    if data.dtype != np.float32:
+        data = data.astype(np.float32) / np.iinfo(data.dtype).max

-    return output_path
-
-def remove_background_music(audio_file):
-    """
-    Remove background music from the audio file using Spleeter.
-
-    Args:
-        audio_file (str): Path to the input audio file.
-
-    Returns:
-        str: Path to the output audio file with background music removed.
-    """
-    # Initialize Spleeter separator (2 stems: vocals and accompaniment)
-    separator = Separator('spleeter:2stems')
-
-    # Separate vocals from background music
-    separator.separate_to_file(audio_file, "output")
-
-    # Load the separated vocals
-    output_path = os.path.join("output", os.path.basename(audio_file).replace(".wav", ""), "vocals.wav")
+    # Detect voice activity
+    voice_segments = []
+    is_voice = False
+    start = 0
+    for i, sample in enumerate(data):
+        if abs(sample) > threshold and not is_voice:
+            is_voice = True
+            start = i
+        elif abs(sample) <= threshold and is_voice:
+            is_voice = False
+            voice_segments.append((start, i))
+
+    # If the last segment is voice, add it
+    if is_voice:
+        voice_segments.append((start, len(data)))
+
+    # Trim the audio to include only voice segments
+    trimmed_audio = np.array([], dtype=np.float32)
+    for segment in voice_segments:
+        trimmed_audio = np.concatenate((trimmed_audio, data[segment[0]:segment[1]]))
+
+    # Export the trimmed audio
+    output_path = "voice_trimmed_audio.wav"
+    wavfile.write(output_path, sample_rate, trimmed_audio)

     return output_path

@@ -339,22 +323,18 @@ with gr.Blocks() as demo:
         silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
         silence_button = gr.Button("Remove Silence")

-    with gr.Tab("Remove Background Noise"):
-        gr.Markdown("Upload an audio file to remove background noise.")
-        noise_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
-        noise_reduce_slider = gr.Slider(
-            minimum=0.0, maximum=1.0, value=0.5, step=0.1,
-            label="Noise Reduction Level",
-            info="Higher values remove more noise."
+    with gr.Tab("Voice Detection and Trimming"):
+        gr.Markdown("Upload two audio files to detect voice activity and trim the audio.")
+        voice_audio_input1 = gr.Audio(type="filepath", label="Upload Audio File 1")
+        voice_audio_input2 = gr.Audio(type="filepath", label="Upload Audio File 2")
+        voice_threshold_slider = gr.Slider(
+            minimum=0.01, maximum=0.1, value=0.02, step=0.01,
+            label="Voice Detection Threshold",
+            info="Higher values detect louder sounds as voice."
         )
-        noise_output = gr.Audio(label="Processed Audio (Noise Removed)", type="filepath")
-        noise_button = gr.Button("Remove Background Noise")
-
-    with gr.Tab("Remove Background Music"):
-        gr.Markdown("Upload an audio file to remove background music.")
-        music_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
-        music_output = gr.Audio(label="Processed Audio (Music Removed)", type="filepath")
-        music_button = gr.Button("Remove Background Music")
+        voice_output1 = gr.Audio(label="Trimmed Audio 1", type="filepath")
+        voice_output2 = gr.Audio(label="Trimmed Audio 2", type="filepath")
+        voice_button = gr.Button("Detect and Trim Voice")

     # Link buttons to functions
     detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
@@ -368,15 +348,10 @@ with gr.Blocks() as demo:
         inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
         outputs=silence_output
     )
-    noise_button.click(
-        remove_background_noise,
-        inputs=[noise_audio_input, noise_reduce_slider],
-        outputs=noise_output
-    )
-    music_button.click(
-        remove_background_music,
-        inputs=music_audio_input,
-        outputs=music_output
+    voice_button.click(
+        lambda audio1, audio2, threshold: (detect_voice_activity(audio1, threshold), detect_voice_activity(audio2, threshold)),
+        inputs=[voice_audio_input1, voice_audio_input2, voice_threshold_slider],
+        outputs=[voice_output1, voice_output2]
    )

 # Launch the Gradio interface
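
The new detect_voice_activity path can also be exercised outside the Gradio UI. A minimal sketch, assuming the updated app.py is importable as `app` and mono input (the per-sample loop expects scalar samples, so a 2-D stereo array would not work):

import numpy as np
from scipy.io import wavfile

from app import detect_voice_activity  # function added in this commit; assumes app.py is on the import path

# Build a small mono test clip: 0.5 s silence, 0.5 s of a 440 Hz tone, 0.5 s silence.
sr = 16000
t = np.linspace(0.0, 0.5, sr // 2, endpoint=False)
tone = (0.3 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)
clip = np.concatenate([np.zeros(sr // 2, dtype=np.float32), tone, np.zeros(sr // 2, dtype=np.float32)])
wavfile.write("test_clip.wav", sr, clip)

# With threshold=0.02 the tone (peak amplitude 0.3) is kept and the
# zero-amplitude padding is dropped, so the output is roughly 0.5 s long.
out_path = detect_voice_activity("test_clip.wav", threshold=0.02)
sr_out, trimmed = wavfile.read(out_path)
print(f"original: {clip.shape[0]} samples, trimmed: {trimmed.shape[0]} samples")

Note that the per-sample Python loop is slow on long recordings; a vectorized mask such as np.abs(data) > threshold would give the same segmentation in a fraction of the time.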