Update app.py
app.py
CHANGED
@@ -2,11 +2,10 @@ import gradio as gr
 import whisper
 import torch
 import os
-import numpy as np
 from pydub import AudioSegment, silence
 from faster_whisper import WhisperModel  # Import faster-whisper
-import
-from
+import numpy as np
+from scipy.io import wavfile
 
 # Mapping of model names to Whisper model sizes
 MODELS = {
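Note on the import swap: the new voice-activity code reads audio with scipy.io.wavfile instead of the removed noisereduce path. wavfile.read returns the sample rate plus a raw NumPy array whose dtype mirrors the file's on-disk encoding, which is why the added function normalizes integer samples before comparing them to a float threshold. A minimal sketch of that pattern (the file name is a placeholder, not part of the commit):

import numpy as np
from scipy.io import wavfile

# wavfile.read returns (sample_rate, data); data is int16 for 16-bit PCM,
# int32 for 32-bit PCM, and float32 for IEEE-float WAV files.
rate, data = wavfile.read("example.wav")  # placeholder path

# Scale integer PCM into [-1.0, 1.0] so a fixed amplitude threshold is meaningful.
if data.dtype != np.float32:
    data = data.astype(np.float32) / np.iinfo(data.dtype).max

print(rate, data.dtype, data.shape)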
@@ -188,63 +187,48 @@ def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
 
     return output_path
 
-def
+def detect_voice_activity(audio_file, threshold=0.02):
     """
-
+    Detect voice activity in the audio file and trim the audio to include only voice segments.
 
     Args:
         audio_file (str): Path to the input audio file.
-
+        threshold (float): Amplitude threshold for voice detection. Default is 0.02.
 
     Returns:
-        str: Path to the output audio file with
+        str: Path to the output audio file with only voice segments.
     """
     # Load the audio file
-
-
-    # Convert audio to numpy array for noisereduce
-    samples = np.array(audio.get_array_of_samples())
-    sample_rate = audio.frame_rate
-
-    # Perform noise reduction
-    reduced_noise = nr.reduce_noise(
-        y=samples,
-        sr=sample_rate,
-        prop_decrease=noise_reduce_level
-    )
+    sample_rate, data = wavfile.read(audio_file)
 
-    #
-
-
-        frame_rate=sample_rate,
-        sample_width=audio.sample_width,
-        channels=audio.channels
-    )
-
-    # Export the processed audio
-    output_path = "noise_reduced_audio.wav"
-    reduced_audio.export(output_path, format="wav")
+    # Normalize the audio data
+    if data.dtype != np.float32:
+        data = data.astype(np.float32) / np.iinfo(data.dtype).max
 
+    # Detect voice activity
+    voice_segments = []
+    is_voice = False
+    start = 0
+    for i, sample in enumerate(data):
+        if abs(sample) > threshold and not is_voice:
+            is_voice = True
+            start = i
+        elif abs(sample) <= threshold and is_voice:
+            is_voice = False
+            voice_segments.append((start, i))
+
+    # If the last segment is voice, add it
+    if is_voice:
+        voice_segments.append((start, len(data)))
+
+    # Trim the audio to include only voice segments
+    trimmed_audio = np.array([], dtype=np.float32)
+    for segment in voice_segments:
+        trimmed_audio = np.concatenate((trimmed_audio, data[segment[0]:segment[1]]))
+
+    # Export the trimmed audio
+    output_path = "voice_trimmed_audio.wav"
+    wavfile.write(output_path, sample_rate, trimmed_audio)
 
     return output_path
 
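For orientation: detect_voice_activity walks the normalized waveform one sample at a time, opens a segment when the absolute amplitude rises above the threshold, closes it when the amplitude falls back below, and concatenates the kept segments before writing them out. A rough usage sketch on a synthetic mono clip (the file name and test signal are illustrative only; it assumes detect_voice_activity from this app.py is in scope, and that the input is mono, since scipy returns a 2-D array for stereo files which the per-sample loop does not handle):

import numpy as np
from scipy.io import wavfile

# One second of 16 kHz mono audio: silence, a 440 Hz burst at 0.5 amplitude, then silence.
sr = 16000
clip = np.zeros(sr, dtype=np.float32)
clip[6000:10000] = 0.5 * np.sin(2 * np.pi * 440.0 * np.arange(4000) / sr)
wavfile.write("synthetic_test.wav", sr, clip)

trimmed_path = detect_voice_activity("synthetic_test.wav", threshold=0.02)
_, trimmed = wavfile.read(trimmed_path)

# Only the burst should survive; samples near the zero crossings fall under the
# threshold, so the result is slightly shorter than the 4000-sample burst.
print(len(trimmed))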
@@ -339,22 +323,18 @@ with gr.Blocks() as demo:
         silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
         silence_button = gr.Button("Remove Silence")
 
-    with gr.Tab("
-        gr.Markdown("Upload
-
-
-
-
-
+    with gr.Tab("Voice Detection and Trimming"):
+        gr.Markdown("Upload two audio files to detect voice activity and trim the audio.")
+        voice_audio_input1 = gr.Audio(type="filepath", label="Upload Audio File 1")
+        voice_audio_input2 = gr.Audio(type="filepath", label="Upload Audio File 2")
+        voice_threshold_slider = gr.Slider(
+            minimum=0.01, maximum=0.1, value=0.02, step=0.01,
+            label="Voice Detection Threshold",
+            info="Higher values detect louder sounds as voice."
         )
-
-
-
-    with gr.Tab("Remove Background Music"):
-        gr.Markdown("Upload an audio file to remove background music.")
-        music_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
-        music_output = gr.Audio(label="Processed Audio (Music Removed)", type="filepath")
-        music_button = gr.Button("Remove Background Music")
+        voice_output1 = gr.Audio(label="Trimmed Audio 1", type="filepath")
+        voice_output2 = gr.Audio(label="Trimmed Audio 2", type="filepath")
+        voice_button = gr.Button("Detect and Trim Voice")
 
     # Link buttons to functions
     detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
@@ -368,15 +348,10 @@ with gr.Blocks() as demo:
         inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
         outputs=silence_output
     )
-
-
-        inputs=[
-        outputs=
-    )
-    music_button.click(
-        remove_background_music,
-        inputs=music_audio_input,
-        outputs=music_output
+    voice_button.click(
+        lambda audio1, audio2, threshold: (detect_voice_activity(audio1, threshold), detect_voice_activity(audio2, threshold)),
+        inputs=[voice_audio_input1, voice_audio_input2, voice_threshold_slider],
+        outputs=[voice_output1, voice_output2]
     )
 
     # Launch the Gradio interface
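On the event wiring: Gradio passes the values of the three listed input components to the handler positionally and spreads the returned tuple across the two output components, which is exactly what the inline lambda does. An equivalent named helper (hypothetical, not part of the commit) may be easier to read:

def trim_both(audio1, audio2, threshold):
    # Apply the same voice-activity trimming to each uploaded file and return
    # one output path per declared output component.
    return (
        detect_voice_activity(audio1, threshold),
        detect_voice_activity(audio2, threshold),
    )

voice_button.click(
    trim_both,
    inputs=[voice_audio_input1, voice_audio_input2, voice_threshold_slider],
    outputs=[voice_output1, voice_output2],
)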