Athspi committed
Commit 6bfef72 · verified · 1 Parent(s): 0a51f5f

Update app.py

Files changed (1)
  1. app.py +48 -3
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
 import whisper
 import torch
 import os
-from pydub import AudioSegment
+from pydub import AudioSegment, silence
 from faster_whisper import WhisperModel  # Import faster-whisper
 
 # Mapping of model names to Whisper model sizes
 
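A note on the import change above: pydub's silence helpers are plain dBFS-threshold scans over the waveform, not model-based detection. A minimal sketch of what the newly imported silence.detect_silence returns (the file name is hypothetical):

from pydub import AudioSegment, silence

clip = AudioSegment.from_file("sample.wav")  # hypothetical input file
# Returns [start_ms, end_ms] pairs for stretches quieter than silence_thresh
spans = silence.detect_silence(clip, min_silence_len=500, silence_thresh=-40)
print(spans)  # e.g. [[0, 730], [4210, 5060]]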
@@ -149,8 +149,48 @@ def detect_language(audio_file):
 
     return f"Detected Language: {detected_language}"
 
-def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
+def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
+    """
+    Remove silence from the audio file using pydub's threshold-based silence detection.
+
+    Args:
+        audio_file (str): Path to the input audio file.
+        silence_threshold (int): Silence threshold in dBFS. Default is -40.
+        min_silence_len (int): Minimum length of silence to remove, in milliseconds. Default is 500.
+
+    Returns:
+        str: Path to the output audio file with silence removed.
+    """
+    # Load the audio file
+    audio = AudioSegment.from_file(audio_file)
+
+    # Detect silent chunks as [start_ms, end_ms] pairs
+    silent_chunks = silence.detect_silence(
+        audio,
+        min_silence_len=min_silence_len,
+        silence_thresh=silence_threshold
+    )
+
+    # Stitch together the audio between the silent chunks
+    non_silent_audio = AudioSegment.empty()
+    start = 0
+    for chunk in silent_chunks:
+        non_silent_audio += audio[start:chunk[0]]  # Add non-silent part
+        start = chunk[1]  # Move to the end of the silent chunk
+    non_silent_audio += audio[start:]  # Add the remaining part
+
+    # Export the processed audio
+    output_path = "silence_removed_audio.wav"
+    non_silent_audio.export(output_path, format="wav")
+
+    return output_path
+
+def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3", remove_silence_flag=False):
     """Transcribe the audio file."""
+    # Remove silence if the flag is enabled
+    if remove_silence_flag:
+        audio_file = remove_silence(audio_file)
+
     # Convert audio to 16kHz mono for better compatibility
     audio = AudioSegment.from_file(audio_file)
     audio = audio.set_frame_rate(16000).set_channels(1)
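A side note on the new remove_silence above: pydub also ships silence.split_on_silence, which implements essentially the same detect-and-stitch loop. A sketch of the equivalent call with the commit's parameter values (split_on_silence keeps 100 ms of padding around each chunk by default, so its output differs slightly; the input path is hypothetical):

from pydub import AudioSegment, silence

audio = AudioSegment.from_file("input.wav")  # hypothetical path
chunks = silence.split_on_silence(audio, min_silence_len=500, silence_thresh=-40)
trimmed = sum(chunks, AudioSegment.empty())  # concatenate the non-silent chunks
trimmed.export("silence_removed_audio.wav", format="wav")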
 
@@ -221,12 +261,17 @@ with gr.Blocks() as demo:
             value="Faster Whisper Large v3",  # Default to "Faster Whisper Large v3"
             interactive=True  # Allow model selection by default
         )
+        remove_silence_checkbox = gr.Checkbox(label="Remove Silence", value=False)
         transcribe_output = gr.Textbox(label="Transcription and Detected Language")
         transcribe_button = gr.Button("Transcribe Audio")
 
     # Link buttons to functions
     detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
-    transcribe_button.click(transcribe_audio, inputs=[transcribe_audio_input, language_dropdown, model_dropdown], outputs=transcribe_output)
+    transcribe_button.click(
+        transcribe_audio,
+        inputs=[transcribe_audio_input, language_dropdown, model_dropdown, remove_silence_checkbox],
+        outputs=transcribe_output
+    )
 
     # Launch the Gradio interface
     demo.launch()
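To exercise the new checkbox path without the UI, the handler can be called directly, e.g. from within app.py itself; a minimal sketch (the audio path and argument values are hypothetical):

# Mirrors what the Gradio click handler now passes through
result = transcribe_audio(
    "meeting.wav",                   # hypothetical input file
    language="Auto Detect",
    model_size="Faster Whisper Large v3",
    remove_silence_flag=True,        # new flag: trim silence before transcribing
)
print(result)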