Update app.py
app.py
CHANGED
@@ -125,32 +125,42 @@ LANGUAGE_NAME_TO_CODE = {
 # Reverse mapping of language codes to full language names
 CODE_TO_LANGUAGE_NAME = {v: k for k, v in LANGUAGE_NAME_TO_CODE.items()}
 
+def convert_to_wav(audio_file):
+    """Convert any audio file to WAV format."""
+    audio = AudioSegment.from_file(audio_file)
+    wav_path = "temp_audio.wav"
+    audio.export(wav_path, format="wav")
+    return wav_path
+
 def detect_language(audio_file):
     """Detect the language of the audio file."""
-    # Define device and compute type for faster-whisper
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    compute_type = "float32" if device == "cuda" else "int8"
-
-    # Load the faster-whisper model for language detection
-    model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
-
-    # Convert audio to 16kHz mono for better compatibility
-    audio = AudioSegment.from_file(audio_file)
-    audio = audio.set_frame_rate(16000).set_channels(1)
-    processed_audio_path = "processed_audio.wav"
-    audio.export(processed_audio_path, format="wav")
-
-    # Detect the language using faster-whisper
-    segments, info = model.transcribe(processed_audio_path, task="translate", language=None)
-    detected_language_code = info.language
-
-    # Get the full language name from the code
-    detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-
-    # Clean up processed audio file
-    os.remove(processed_audio_path)
-
-    return f"Detected Language: {detected_language}"
+    if audio_file is None:
+        return "Error: No audio file uploaded."
+
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_to_wav(audio_file)
+
+        # Define device and compute type for faster-whisper
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        compute_type = "float32" if device == "cuda" else "int8"
+
+        # Load the faster-whisper model for language detection
+        model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
+
+        # Detect the language using faster-whisper
+        segments, info = model.transcribe(wav_path, task="translate", language=None)
+        detected_language_code = info.language
+
+        # Get the full language name from the code
+        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+
+        # Clean up temporary WAV file
+        os.remove(wav_path)
+
+        return f"Detected Language: {detected_language}"
+    except Exception as e:
+        return f"Error: {str(e)}"
 
 def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     """
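An editorial aside, not part of the commit: the detection call above is easy to exercise standalone. A minimal sketch, assuming faster-whisper is installed; the "large-v3" checkpoint name stands in for the repo's MODELS["Faster Whisper Large v3"] entry, and "temp_audio.wav" for whatever convert_to_wav produced:

```python
from faster_whisper import WhisperModel

# CPU fallback settings, mirroring the device/compute_type choice in the diff
model = WhisperModel("large-v3", device="cpu", compute_type="int8")

# language=None lets faster-whisper auto-detect; info is returned eagerly,
# while segments is a lazy generator that only decodes when iterated
segments, info = model.transcribe("temp_audio.wav", task="translate", language=None)
print(info.language, info.language_probability)
```

Because segments is lazy, detect_language pays for language identification but not for a full transcription, which is why discarding segments there is cheap.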
@@ -164,29 +174,41 @@ def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     Returns:
         str: Path to the output audio file with silence removed.
     """
-    # Load the audio file
-    audio = AudioSegment.from_file(audio_file)
-
-    # Detect silent chunks
-    silent_chunks = silence.detect_silence(
-        audio,
-        min_silence_len=min_silence_len,
-        silence_thresh=silence_threshold
-    )
-
-    # Remove silent chunks
-    non_silent_audio = AudioSegment.empty()
-    start = 0
-    for chunk in silent_chunks:
-        non_silent_audio += audio[start:chunk[0]]  # Add non-silent part
-        start = chunk[1]  # Move to the end of the silent chunk
-    non_silent_audio += audio[start:]  # Add the remaining part
-
-    # Export the processed audio
-    output_path = "silence_removed_audio.wav"
-    non_silent_audio.export(output_path, format="wav")
+    if audio_file is None:
+        return None
 
-    return output_path
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_to_wav(audio_file)
+
+        # Load the audio file
+        audio = AudioSegment.from_file(wav_path)
+
+        # Detect silent chunks
+        silent_chunks = silence.detect_silence(
+            audio,
+            min_silence_len=min_silence_len,
+            silence_thresh=silence_threshold
+        )
+
+        # Remove silent chunks
+        non_silent_audio = AudioSegment.empty()
+        start = 0
+        for chunk in silent_chunks:
+            non_silent_audio += audio[start:chunk[0]]  # Add non-silent part
+            start = chunk[1]  # Move to the end of the silent chunk
+        non_silent_audio += audio[start:]  # Add the remaining part
+
+        # Export the processed audio
+        output_path = "silence_removed_audio.wav"
+        non_silent_audio.export(output_path, format="wav")
+
+        # Clean up temporary WAV file
+        os.remove(wav_path)
+
+        return output_path
+    except Exception as e:
+        return f"Error: {str(e)}"
 
 def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
     """
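A toy illustration, editorial rather than from the commit, of the data this loop consumes: pydub's silence.detect_silence returns [start_ms, end_ms] pairs, and the stitching loop keeps everything between them. It assumes pydub with ffmpeg available:

```python
from pydub import AudioSegment, silence
from pydub.generators import Sine

# 500 ms tone, 800 ms gap, 500 ms tone
tone = Sine(440).to_audio_segment(duration=500)
audio = tone + AudioSegment.silent(duration=800) + tone

# One silent stretch, roughly [[500, 1300]]
chunks = silence.detect_silence(audio, min_silence_len=500, silence_thresh=-40)

# Same stitching as remove_silence: keep the audio between silent stretches
kept = AudioSegment.empty()
start = 0
for chunk in chunks:
    kept += audio[start:chunk[0]]
    start = chunk[1]
kept += audio[start:]

print(chunks, len(kept))  # kept should be close to 1000 ms
```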
@@ -201,109 +223,133 @@ def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
         str: Path to the trimmed audio file.
         str: Detected timestamps in the format "start-end (in seconds)".
     """
-    # Load audio files
-    main_rate, main_data = wavfile.read(main_audio)
-    target_rate, target_data = wavfile.read(target_audio)
-
-    # Ensure both audio files have the same sample rate
-    if main_rate != target_rate:
-        raise ValueError("Sample rates of the main audio and target audio must match.")
+    if main_audio is None or target_audio is None:
+        return None, "Error: Please upload both main and target audio files."
 
+    try:
+        # Convert audio files to WAV format
+        main_wav_path = convert_to_wav(main_audio)
+        target_wav_path = convert_to_wav(target_audio)
+
+        # Load audio files
+        main_rate, main_data = wavfile.read(main_wav_path)
+        target_rate, target_data = wavfile.read(target_wav_path)
+
+        # Ensure both audio files have the same sample rate
+        if main_rate != target_rate:
+            raise ValueError("Sample rates of the main audio and target audio must match.")
+
+        # Normalize audio data
+        main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
+        target_data = target_data.astype(np.float32) / np.iinfo(target_data.dtype).max
+
+        # Perform cross-correlation to detect the target audio in the main audio
+        correlation = correlate(main_data, target_data, mode='valid')
+        correlation = np.abs(correlation)
+        max_corr = np.max(correlation)
+
+        # Detect segments where the target audio is present
+        detected_segments = []
+        for i, corr_value in enumerate(correlation):
+            if corr_value >= threshold * max_corr:
+                start_time = i / main_rate
+                end_time = (i + len(target_data)) / main_rate
+                detected_segments.append((start_time, end_time))
+
+        # Merge overlapping or nearby segments
+        merged_segments = []
+        for segment in detected_segments:
+            if not merged_segments:
+                merged_segments.append(segment)
+            else:
+                last_segment = merged_segments[-1]
+                if segment[0] <= last_segment[1] + 1.0:  # Merge if within 1 second
+                    merged_segments[-1] = (last_segment[0], max(last_segment[1], segment[1]))
+                else:
+                    merged_segments.append(segment)
+
+        # Trim the main audio to include only the detected segments
+        main_audio_segment = AudioSegment.from_file(main_wav_path)
+        trimmed_audio = AudioSegment.empty()
+        timestamps = []
+        for segment in merged_segments:
+            start_ms = int(segment[0] * 1000)
+            end_ms = int(segment[1] * 1000)
+            trimmed_audio += main_audio_segment[start_ms:end_ms]
+            timestamps.append(f"{segment[0]:.2f}-{segment[1]:.2f}")
+
+        # Export the trimmed audio
+        output_path = "trimmed_audio.wav"
+        trimmed_audio.export(output_path, format="wav")
+
+        # Format timestamps
+        timestamps_str = "\n".join(timestamps)
+
+        # Clean up temporary WAV files
+        os.remove(main_wav_path)
+        os.remove(target_wav_path)
+
+        return output_path, timestamps_str
+    except Exception as e:
+        return None, f"Error: {str(e)}"
 
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
     """Transcribe the audio file."""
-    # Convert audio to 16kHz mono for better compatibility
-    audio = AudioSegment.from_file(audio_file)
-    audio = audio.set_frame_rate(16000).set_channels(1)
-    processed_audio_path = "processed_audio.wav"
-    audio.export(processed_audio_path, format="wav")
-
-    # Load the appropriate model
-    if model_size == "Faster Whisper Large v3":
-        # Define device and compute type for faster-whisper
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        compute_type = "float32" if device == "cuda" else "int8"
-
-        # Use faster-whisper for the Systran model
-        model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
-        segments, info = model.transcribe(
-            processed_audio_path,
-            task="transcribe",
-            word_timestamps=True,
-            repetition_penalty=1.1,
-            temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
-        )
-        transcription = " ".join([segment.text for segment in segments])
-        detected_language_code = info.language
-        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-    else:
-        # Use the standard Whisper model
-        model = whisper.load_model(MODELS[model_size])
-
-        # Transcribe the audio
-        if language == "Auto Detect":
-            result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
-            detected_language_code = result.get("language", "unknown")
-            detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-        else:
-            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
-            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
-            detected_language = language
-
-        transcription = result["text"]
-
-    # Clean up processed audio file
-    os.remove(processed_audio_path)
-
-    # Return transcription and detected language
-    return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
+    if audio_file is None:
+        return "Error: No audio file uploaded."
 
+    try:
+        # Convert audio to WAV format
+        wav_path = convert_to_wav(audio_file)
+
+        # Convert audio to 16kHz mono for better compatibility
+        audio = AudioSegment.from_file(wav_path)
+        audio = audio.set_frame_rate(16000).set_channels(1)
+        processed_audio_path = "processed_audio.wav"
+        audio.export(processed_audio_path, format="wav")
+
+        # Load the appropriate model
+        if model_size == "Faster Whisper Large v3":
+            # Define device and compute type for faster-whisper
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            compute_type = "float32" if device == "cuda" else "int8"
+
+            # Use faster-whisper for the Systran model
+            model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
+            segments, info = model.transcribe(
+                processed_audio_path,
+                task="transcribe",
+                word_timestamps=True,
+                repetition_penalty=1.1,
+                temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
+            )
+            transcription = " ".join([segment.text for segment in segments])
+            detected_language_code = info.language
+            detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+        else:
+            # Use the standard Whisper model
+            model = whisper.load_model(MODELS[model_size])
+
+            # Transcribe the audio
+            if language == "Auto Detect":
+                result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
+                detected_language_code = result.get("language", "unknown")
+                detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+            else:
+                language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
+                result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+                detected_language = language
+
+            transcription = result["text"]
+
+        # Clean up processed audio file
+        os.remove(processed_audio_path)
+        os.remove(wav_path)
+
+        # Return transcription and detected language
+        return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
+    except Exception as e:
+        return f"Error: {str(e)}"
 
 # Define the Gradio interface
 with gr.Blocks() as demo:
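A quick numeric check, again editorial, of the correlation-peak idea behind detect_and_trim_audio: embed a known burst in a silent buffer and recover its offset. It assumes numpy and scipy; the arrays stand in for decoded WAV samples:

```python
import numpy as np
from scipy.signal import correlate

rate = 16000
rng = np.random.default_rng(0)

# 3 s of silence with a 0.5 s noise burst embedded at t = 1.0 s
target = rng.standard_normal(rate // 2).astype(np.float32)
main = np.zeros(rate * 3, dtype=np.float32)
main[rate:rate + len(target)] = target

# As in the diff: the peak of |cross-correlation| marks the target's offset
corr = np.abs(correlate(main, target, mode="valid"))
i = int(np.argmax(corr))
print(i / rate, (i + len(target)) / rate)  # ~1.0 1.5
```

Note that the function's loop flags every sample above threshold * max_corr, so a single match yields many overlapping candidate segments; the 1-second merge pass that follows is what collapses them into one span.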
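Finally, a hedged sketch of how functions with these signatures might be mounted in the Blocks UI the diff's trailing context points at; the component layout is illustrative, not taken from app.py, and a stub stands in for detect_language so the snippet runs on its own:

```python
import gradio as gr

# Stub with the same contract as detect_language (filepath in, string out)
def detect_language_stub(audio_path):
    if audio_path is None:
        return "Error: No audio file uploaded."
    return "Detected Language: English"

with gr.Blocks() as demo:
    audio_in = gr.Audio(type="filepath", label="Audio")
    lang_out = gr.Textbox(label="Detected language")
    gr.Button("Detect").click(detect_language_stub, inputs=audio_in, outputs=lang_out)

demo.launch()
```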