Athspi committed (verified)
Commit 8ceb7c6 · 1 Parent(s): 380d6cf

Update app.py

Files changed (1)
  1. app.py +183 -137
app.py CHANGED
@@ -125,32 +125,42 @@ LANGUAGE_NAME_TO_CODE = {
 # Reverse mapping of language codes to full language names
 CODE_TO_LANGUAGE_NAME = {v: k for k, v in LANGUAGE_NAME_TO_CODE.items()}

+def convert_to_wav(audio_file):
+    """Convert any audio file to WAV format."""
+    audio = AudioSegment.from_file(audio_file)
+    wav_path = "temp_audio.wav"
+    audio.export(wav_path, format="wav")
+    return wav_path
+
 def detect_language(audio_file):
     """Detect the language of the audio file."""
-    # Define device and compute type for faster-whisper
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    compute_type = "float32" if device == "cuda" else "int8"
-
-    # Load the faster-whisper model for language detection
-    model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
-
-    # Convert audio to 16kHz mono for better compatibility
-    audio = AudioSegment.from_file(audio_file)
-    audio = audio.set_frame_rate(16000).set_channels(1)
-    processed_audio_path = "processed_audio.wav"
-    audio.export(processed_audio_path, format="wav")
-
-    # Detect the language using faster-whisper
-    segments, info = model.transcribe(processed_audio_path, task="translate", language=None)
-    detected_language_code = info.language
-
-    # Get the full language name from the code
-    detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-
-    # Clean up processed audio file
-    os.remove(processed_audio_path)
+    if audio_file is None:
+        return "Error: No audio file uploaded."

-    return f"Detected Language: {detected_language}"
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_to_wav(audio_file)
+
+        # Define device and compute type for faster-whisper
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        compute_type = "float32" if device == "cuda" else "int8"
+
+        # Load the faster-whisper model for language detection
+        model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
+
+        # Detect the language using faster-whisper
+        segments, info = model.transcribe(wav_path, task="translate", language=None)
+        detected_language_code = info.language
+
+        # Get the full language name from the code
+        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+
+        # Clean up temporary WAV file
+        os.remove(wav_path)
+
+        return f"Detected Language: {detected_language}"
+    except Exception as e:
+        return f"Error: {str(e)}"

 def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     """
@@ -164,29 +174,41 @@ def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     Returns:
         str: Path to the output audio file with silence removed.
     """
-    # Load the audio file
-    audio = AudioSegment.from_file(audio_file)
-
-    # Detect silent chunks
-    silent_chunks = silence.detect_silence(
-        audio,
-        min_silence_len=min_silence_len,
-        silence_thresh=silence_threshold
-    )
-
-    # Remove silent chunks
-    non_silent_audio = AudioSegment.empty()
-    start = 0
-    for chunk in silent_chunks:
-        non_silent_audio += audio[start:chunk[0]] # Add non-silent part
-        start = chunk[1] # Move to the end of the silent chunk
-    non_silent_audio += audio[start:] # Add the remaining part
-
-    # Export the processed audio
-    output_path = "silence_removed_audio.wav"
-    non_silent_audio.export(output_path, format="wav")
+    if audio_file is None:
+        return None

-    return output_path
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_to_wav(audio_file)
+
+        # Load the audio file
+        audio = AudioSegment.from_file(wav_path)
+
+        # Detect silent chunks
+        silent_chunks = silence.detect_silence(
+            audio,
+            min_silence_len=min_silence_len,
+            silence_thresh=silence_threshold
+        )
+
+        # Remove silent chunks
+        non_silent_audio = AudioSegment.empty()
+        start = 0
+        for chunk in silent_chunks:
+            non_silent_audio += audio[start:chunk[0]] # Add non-silent part
+            start = chunk[1] # Move to the end of the silent chunk
+        non_silent_audio += audio[start:] # Add the remaining part
+
+        # Export the processed audio
+        output_path = "silence_removed_audio.wav"
+        non_silent_audio.export(output_path, format="wav")
+
+        # Clean up temporary WAV file
+        os.remove(wav_path)
+
+        return output_path
+    except Exception as e:
+        return f"Error: {str(e)}"

 def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
     """
@@ -201,109 +223,133 @@ def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
         str: Path to the trimmed audio file.
         str: Detected timestamps in the format "start-end (in seconds)".
     """
-    # Load audio files
-    main_rate, main_data = wavfile.read(main_audio)
-    target_rate, target_data = wavfile.read(target_audio)
-
-    # Ensure both audio files have the same sample rate
-    if main_rate != target_rate:
-        raise ValueError("Sample rates of the main audio and target audio must match.")
+    if main_audio is None or target_audio is None:
+        return None, "Error: Please upload both main and target audio files."

-    # Normalize audio data
-    main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
-    target_data = target_data.astype(np.float32) / np.iinfo(target_data.dtype).max
-
-    # Perform cross-correlation to detect the target audio in the main audio
-    correlation = correlate(main_data, target_data, mode='valid')
-    correlation = np.abs(correlation)
-    max_corr = np.max(correlation)
-
-    # Detect segments where the target audio is present
-    detected_segments = []
-    for i, corr_value in enumerate(correlation):
-        if corr_value >= threshold * max_corr:
-            start_time = i / main_rate
-            end_time = (i + len(target_data)) / main_rate
-            detected_segments.append((start_time, end_time))
-
-    # Merge overlapping or nearby segments
-    merged_segments = []
-    for segment in detected_segments:
-        if not merged_segments:
-            merged_segments.append(segment)
-        else:
-            last_segment = merged_segments[-1]
-            if segment[0] <= last_segment[1] + 1.0: # Merge if within 1 second
-                merged_segments[-1] = (last_segment[0], max(last_segment[1], segment[1]))
-            else:
+    try:
+        # Convert audio files to WAV format
+        main_wav_path = convert_to_wav(main_audio)
+        target_wav_path = convert_to_wav(target_audio)
+
+        # Load audio files
+        main_rate, main_data = wavfile.read(main_wav_path)
+        target_rate, target_data = wavfile.read(target_wav_path)
+
+        # Ensure both audio files have the same sample rate
+        if main_rate != target_rate:
+            raise ValueError("Sample rates of the main audio and target audio must match.")
+
+        # Normalize audio data
+        main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
+        target_data = target_data.astype(np.float32) / np.iinfo(target_data.dtype).max
+
+        # Perform cross-correlation to detect the target audio in the main audio
+        correlation = correlate(main_data, target_data, mode='valid')
+        correlation = np.abs(correlation)
+        max_corr = np.max(correlation)
+
+        # Detect segments where the target audio is present
+        detected_segments = []
+        for i, corr_value in enumerate(correlation):
+            if corr_value >= threshold * max_corr:
+                start_time = i / main_rate
+                end_time = (i + len(target_data)) / main_rate
+                detected_segments.append((start_time, end_time))
+
+        # Merge overlapping or nearby segments
+        merged_segments = []
+        for segment in detected_segments:
+            if not merged_segments:
                 merged_segments.append(segment)
-
-    # Trim the main audio to include only the detected segments
-    main_audio_segment = AudioSegment.from_file(main_audio)
-    trimmed_audio = AudioSegment.empty()
-    timestamps = []
-    for segment in merged_segments:
-        start_ms = int(segment[0] * 1000)
-        end_ms = int(segment[1] * 1000)
-        trimmed_audio += main_audio_segment[start_ms:end_ms]
-        timestamps.append(f"{segment[0]:.2f}-{segment[1]:.2f}")
-
-    # Export the trimmed audio
-    output_path = "trimmed_audio.wav"
-    trimmed_audio.export(output_path, format="wav")
-
-    # Format timestamps
-    timestamps_str = "\n".join(timestamps)
-
-    return output_path, timestamps_str
+            else:
+                last_segment = merged_segments[-1]
+                if segment[0] <= last_segment[1] + 1.0: # Merge if within 1 second
+                    merged_segments[-1] = (last_segment[0], max(last_segment[1], segment[1]))
+                else:
+                    merged_segments.append(segment)
+
+        # Trim the main audio to include only the detected segments
+        main_audio_segment = AudioSegment.from_file(main_wav_path)
+        trimmed_audio = AudioSegment.empty()
+        timestamps = []
+        for segment in merged_segments:
+            start_ms = int(segment[0] * 1000)
+            end_ms = int(segment[1] * 1000)
+            trimmed_audio += main_audio_segment[start_ms:end_ms]
+            timestamps.append(f"{segment[0]:.2f}-{segment[1]:.2f}")
+
+        # Export the trimmed audio
+        output_path = "trimmed_audio.wav"
+        trimmed_audio.export(output_path, format="wav")
+
+        # Format timestamps
+        timestamps_str = "\n".join(timestamps)
+
+        # Clean up temporary WAV files
+        os.remove(main_wav_path)
+        os.remove(target_wav_path)
+
+        return output_path, timestamps_str
+    except Exception as e:
+        return None, f"Error: {str(e)}"

 def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
     """Transcribe the audio file."""
-    # Convert audio to 16kHz mono for better compatibility
-    audio = AudioSegment.from_file(audio_file)
-    audio = audio.set_frame_rate(16000).set_channels(1)
-    processed_audio_path = "processed_audio.wav"
-    audio.export(processed_audio_path, format="wav")
+    if audio_file is None:
+        return "Error: No audio file uploaded."

-    # Load the appropriate model
-    if model_size == "Faster Whisper Large v3":
-        # Define device and compute type for faster-whisper
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        compute_type = "float32" if device == "cuda" else "int8"
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_to_wav(audio_file)

-        # Use faster-whisper for the Systran model
-        model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
-        segments, info = model.transcribe(
-            processed_audio_path,
-            task="transcribe",
-            word_timestamps=True,
-            repetition_penalty=1.1,
-            temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
-        )
-        transcription = " ".join([segment.text for segment in segments])
-        detected_language_code = info.language
-        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-    else:
-        # Use the standard Whisper model
-        model = whisper.load_model(MODELS[model_size])
+        # Convert audio to 16kHz mono for better compatibility
+        audio = AudioSegment.from_file(wav_path)
+        audio = audio.set_frame_rate(16000).set_channels(1)
+        processed_audio_path = "processed_audio.wav"
+        audio.export(processed_audio_path, format="wav")

-        # Transcribe the audio
-        if language == "Auto Detect":
-            result = model.transcribe(processed_audio_path, fp16=False) # Auto-detect language
-            detected_language_code = result.get("language", "unknown")
+        # Load the appropriate model
+        if model_size == "Faster Whisper Large v3":
+            # Define device and compute type for faster-whisper
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            compute_type = "float32" if device == "cuda" else "int8"
+
+            # Use faster-whisper for the Systran model
+            model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
+            segments, info = model.transcribe(
+                processed_audio_path,
+                task="transcribe",
+                word_timestamps=True,
+                repetition_penalty=1.1,
+                temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
+            )
+            transcription = " ".join([segment.text for segment in segments])
+            detected_language_code = info.language
             detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
         else:
-            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en") # Default to English if not found
-            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
-            detected_language = language
+            # Use the standard Whisper model
+            model = whisper.load_model(MODELS[model_size])
+
+            # Transcribe the audio
+            if language == "Auto Detect":
+                result = model.transcribe(processed_audio_path, fp16=False) # Auto-detect language
+                detected_language_code = result.get("language", "unknown")
+                detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+            else:
+                language_code = LANGUAGE_NAME_TO_CODE.get(language, "en") # Default to English if not found
+                result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+                detected_language = language
+
+            transcription = result["text"]

-        transcription = result["text"]
-
-    # Clean up processed audio file
-    os.remove(processed_audio_path)
-
-    # Return transcription and detected language
-    return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
+        # Clean up processed audio file
+        os.remove(processed_audio_path)
+        os.remove(wav_path)
+
+        # Return transcription and detected language
+        return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
+    except Exception as e:
+        return f"Error: {str(e)}"

 # Define the Gradio interface
 with gr.Blocks() as demo:
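For reference, the pattern this commit applies to every handler — guard against a missing upload, convert the input to a temporary WAV with pydub, do the work inside try/except, and delete the temporary file — can be exercised on its own. A minimal sketch (illustrative only; the duration helper and file names below are assumptions, not part of app.py):

import os

from pydub import AudioSegment


def convert_to_wav(audio_file):
    """Convert any audio file to WAV format (same helper the commit adds)."""
    audio = AudioSegment.from_file(audio_file)
    wav_path = "temp_audio.wav"
    audio.export(wav_path, format="wav")
    return wav_path


def audio_duration(audio_file):
    """Hypothetical handler using the guard / convert / clean-up pattern."""
    if audio_file is None:
        return "Error: No audio file uploaded."
    try:
        wav_path = convert_to_wav(audio_file)
        seconds = len(AudioSegment.from_file(wav_path)) / 1000.0  # pydub lengths are in milliseconds
        os.remove(wav_path)  # clean up the temporary WAV file
        return f"Duration: {seconds:.2f} s"
    except Exception as e:
        return f"Error: {str(e)}"


if __name__ == "__main__":
    # Requires ffmpeg on PATH for pydub to decode non-WAV inputs.
    print(audio_duration("example.mp3"))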