Update app.py
app.py
CHANGED
@@ -260,6 +260,69 @@ def detect_voice_activity(audio_file, threshold=0.02):
 
     return output_path
 
+def detect_and_trim_audio(audio_file, threshold=0.02):
+    """
+    Detect voice activity in the audio file, trim the audio to include only voice segments,
+    and return the timestamps of the detected segments.
+
+    Args:
+        audio_file (str): Path to the input audio file.
+        threshold (float): Amplitude threshold for voice detection. Default is 0.02.
+
+    Returns:
+        str: Path to the output audio file with only voice segments.
+        list: List of timestamps (start, end) for the detected segments.
+    """
+    # Convert the input audio to WAV format
+    wav_path = convert_to_wav(audio_file)
+
+    # Load the WAV file
+    sample_rate, data = wavfile.read(wav_path)
+
+    # Normalize integer samples to [-1, 1] first, so the threshold is applied on a known scale
+    if np.issubdtype(data.dtype, np.integer):
+        data = data.astype(np.float32) / np.iinfo(data.dtype).max
+
+    # If the audio is stereo, convert it to mono by averaging the channels
+    if len(data.shape) > 1:
+        data = np.mean(data, axis=1)
+
+    # Detect voice activity
+    voice_segments = []
+    is_voice = False
+    start = 0
+    for i, sample in enumerate(data):
+        if abs(sample) > threshold and not is_voice:
+            is_voice = True
+            start = i
+        elif abs(sample) <= threshold and is_voice:
+            is_voice = False
+            voice_segments.append((start, i))
+
+    # If the last segment is voice, add it
+    if is_voice:
+        voice_segments.append((start, len(data)))
+
+    # Trim the audio to include only voice segments
+    trimmed_audio = np.array([], dtype=np.float32)
+    for segment in voice_segments:
+        trimmed_audio = np.concatenate((trimmed_audio, data[segment[0]:segment[1]]))
+
+    # Convert the trimmed audio back to 16-bit integer format
+    trimmed_audio_int16 = np.int16(trimmed_audio * 32767)
+
+    # Export the trimmed audio
+    output_path = "voice_trimmed_audio.wav"
+    wavfile.write(output_path, sample_rate, trimmed_audio_int16)
+
+    # Calculate timestamps in seconds
+    timestamps = [(start / sample_rate, end / sample_rate) for start, end in voice_segments]
+
+    # Clean up the converted WAV file
+    os.remove(wav_path)
+
+    return output_path, timestamps
+
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
     """Transcribe the audio file."""
     # Convert audio to 16kHz mono for better compatibility
@@ -352,16 +415,15 @@ with gr.Blocks() as demo:
         silence_button = gr.Button("Remove Silence")
 
     with gr.Tab("Voice Detection and Trimming"):
-        gr.Markdown("Upload
-
-        voice_audio_input2 = gr.Audio(type="filepath", label="Upload Audio File 2")
+        gr.Markdown("Upload an audio file to detect voice activity and trim the audio.")
+        voice_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
         voice_threshold_slider = gr.Slider(
            minimum=0.01, maximum=0.1, value=0.02, step=0.01,
            label="Voice Detection Threshold",
            info="Higher values detect louder sounds as voice."
         )
-
-
+        voice_output = gr.Audio(label="Trimmed Audio", type="filepath")
+        timestamps_output = gr.Textbox(label="Detected Timestamps (seconds)")
         voice_button = gr.Button("Detect and Trim Voice")
 
     # Link buttons to functions
@@ -377,9 +439,9 @@ with gr.Blocks() as demo:
        outputs=silence_output
     )
     voice_button.click(
-
-        inputs=[
-        outputs=[
+        detect_and_trim_audio,
+        inputs=[voice_audio_input, voice_threshold_slider],
+        outputs=[voice_output, timestamps_output]
     )
 
     # Launch the Gradio interface
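For reference, the new helper can also be exercised outside the Gradio UI. A minimal sketch, assuming it runs inside app.py (so convert_to_wav and the numpy/scipy.io.wavfile/os imports are already in scope); "speech.mp3" is a hypothetical input path:

    # Hypothetical input path; any format convert_to_wav accepts should work.
    trimmed_path, timestamps = detect_and_trim_audio("speech.mp3", threshold=0.02)
    print("Trimmed audio written to:", trimmed_path)
    for start, end in timestamps:
        print(f"voice segment: {start:.2f}s - {end:.2f}s")

This (path, timestamps) pair is exactly what the voice_button.click wiring above feeds into voice_output and timestamps_output.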
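One design note on the detection step: iterating sample-by-sample in Python is slow for long recordings. A vectorized NumPy equivalent of the same thresholding is sketched below; it is only an illustration, not part of the commit, and returns the same (start, end) sample indices as the loop in detect_and_trim_audio:

    import numpy as np

    def find_voice_segments(data, threshold):
        # Boolean mask of samples whose amplitude exceeds the threshold
        active = np.abs(data) > threshold
        # Pad with False so every run of True has a rising and a falling edge
        padded = np.concatenate(([False], active, [False]))
        # np.diff is +1 at run starts and -1 one past run ends, so the
        # nonzero positions alternate start, end, start, end, ...
        edges = np.flatnonzero(np.diff(padded.astype(np.int8)))
        return list(zip(edges[::2], edges[1::2]))

Ends are exclusive, matching the (start, i) pairs the original loop appends, so the result can replace voice_segments unchanged.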