Spaces:

Athspi
/

Ai-audio

Running

App Files Files Community

Athspi commited on Jan 12

Commit

dbffdf4

verified ·

1 Parent(s): c885037

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -232

app.py CHANGED Viewed

@@ -3,16 +3,7 @@ import whisper
 import torch
 import os
 from pydub import AudioSegment, silence
-from faster_whisper import WhisperModel
-import numpy as np
-from scipy.io import wavfile
-from scipy.signal import correlate
-import tempfile
-import logging
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
 # Mapping of model names to Whisper model sizes
 MODELS = {
@@ -131,51 +122,32 @@ LANGUAGE_NAME_TO_CODE = {
 # Reverse mapping of language codes to full language names
 CODE_TO_LANGUAGE_NAME = {v: k for k, v in LANGUAGE_NAME_TO_CODE.items()}
-def convert_to_wav(audio_file):
-    """Convert any audio file to WAV format."""
-    audio = AudioSegment.from_file(audio_file)
-    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
-        wav_path = temp_wav.name
-        audio.export(wav_path, format="wav")
-    return wav_path
-def resample_audio(audio_segment, target_sample_rate):
-    """Resample an audio segment to the target sample rate."""
-    return audio_segment.set_frame_rate(target_sample_rate)
 def detect_language(audio_file):
     """Detect the language of the audio file."""
-    if audio_file is None:
-        return "Error: No audio file uploaded."
-    try:
-        # Define device and compute type for faster-whisper
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        compute_type = "float32" if device == "cuda" else "int8"
-        # Load the faster-whisper model for language detection
-        model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
-        # Convert audio to 16kHz mono for better compatibility
-        audio = AudioSegment.from_file(audio_file)
-        audio = audio.set_frame_rate(16000).set_channels(1)
-        processed_audio_path = "processed_audio.wav"
-        audio.export(processed_audio_path, format="wav")
-        # Detect the language using faster-whisper
-        segments, info = model.transcribe(processed_audio_path, task="translate", language=None)
-        detected_language_code = info.language
-        # Get the full language name from the code
-        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-        # Clean up processed audio file
-        os.remove(processed_audio_path)
-        return f"Detected Language: {detected_language}"
-    except Exception as e:
-        logger.error(f"Error in detect_language: {str(e)}", exc_info=True)
-        return f"Error: {str(e)}"
 def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     """
@@ -189,183 +161,81 @@ def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     Returns:
         str: Path to the output audio file with silence removed.
     """
-    if audio_file is None:
-        return None
-    try:
-        # Convert audio to WAV format
-        wav_path = convert_to_wav(audio_file)
-        # Load the audio file
-        audio = AudioSegment.from_file(wav_path)
-        # Detect silent chunks
-        silent_chunks = silence.detect_silence(
-            audio,
-            min_silence_len=min_silence_len,
-            silence_thresh=silence_threshold
-        )
-        # Remove silent chunks
-        non_silent_audio = AudioSegment.empty()
-        start = 0
-        for chunk in silent_chunks:
-            non_silent_audio += audio[start:chunk[0]]  # Add non-silent part
-            start = chunk[1]  # Move to the end of the silent chunk
-        non_silent_audio += audio[start:]  # Add the remaining part
-        # Export the processed audio
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
-            output_path = temp_output.name
-            non_silent_audio.export(output_path, format="wav")
-        # Clean up temporary WAV file
-        os.remove(wav_path)
-        return output_path
-    except Exception as e:
-        logger.error(f"Error in remove_silence: {str(e)}")
-        return f"Error: {str(e)}"
-def detect_and_trim_audio(main_audio, target_audio, threshold=0.5):
-    """
-    Detect the target audio in the main audio and trim the main audio to include only the detected segments.
-    Args:
-        main_audio (str): Path to the main audio file.
-        target_audio (str): Path to the target audio file.
-        threshold (float): Detection threshold (0 to 1). Higher values mean stricter detection.
-    Returns:
-        str: Path to the trimmed audio file.
-        str: Detected timestamps in the format "start-end (in seconds)".
-    """
-    if main_audio is None or target_audio is None:
-        return None, "Error: Please upload both main and target audio files."
-    try:
-        # Convert audio files to WAV format
-        main_wav_path = convert_to_wav(main_audio)
-        target_wav_path = convert_to_wav(target_audio)
-        # Load audio files
-        main_rate, main_data = wavfile.read(main_wav_path)
-        target_rate, target_data = wavfile.read(target_wav_path)
-        # Ensure both audio files have the same sample rate
-        if main_rate != target_rate:
-            logger.warning(f"Sample rates differ: main_audio={main_rate}, target_audio={target_rate}. Resampling target audio.")
-            target_segment = AudioSegment.from_file(target_wav_path)
-            target_segment = resample_audio(target_segment, main_rate)
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_resampled:
-                resampled_path = temp_resampled.name
-                target_segment.export(resampled_path, format="wav")
-            target_rate, target_data = wavfile.read(resampled_path)
-        # Normalize audio data
-        main_data = main_data.astype(np.float32) / np.iinfo(main_data.dtype).max
-        target_data = target_data.astype(np.float32) / np.iinfo(target_data.dtype).max
-        # Perform cross-correlation to detect the target audio in the main audio
-        correlation = correlate(main_data, target_data, mode='valid')
-        correlation = np.abs(correlation)
-        max_corr = np.max(correlation)
-        # Find the peak in the cross-correlation result
-        peak_index = np.argmax(correlation)
-        peak_value = correlation[peak_index]
-        # Check if the peak value exceeds the threshold
-        if peak_value < threshold * max_corr:
-            return None, "Error: Target audio not detected in the main audio."
-        # Calculate the start and end times of the target audio in the main audio
-        start_time = peak_index / main_rate
-        end_time = (peak_index + len(target_data)) / main_rate
-        # Trim the main audio to include only the detected segment
-        main_audio_segment = AudioSegment.from_file(main_wav_path)
-        start_ms = int(start_time * 1000)
-        end_ms = int(end_time * 1000)
-        trimmed_audio = main_audio_segment[start_ms:end_ms]
-        # Export the trimmed audio
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_output:
-            output_path = temp_output.name
-            trimmed_audio.export(output_path, format="wav")
-        # Format timestamps
-        timestamps_str = f"{start_time:.2f}-{end_time:.2f}"
-        # Clean up temporary WAV files
-        os.remove(main_wav_path)
-        os.remove(target_wav_path)
-        if 'resampled_path' in locals():
-            os.remove(resampled_path)
-        return output_path, timestamps_str
-    except Exception as e:
-        logger.error(f"Error in detect_and_trim_audio: {str(e)}")
-        return None, f"Error: {str(e)}"
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
     """Transcribe the audio file."""
-    if audio_file is None:
-        return "Error: No audio file uploaded."
-    try:
-        # Convert audio to 16kHz mono for better compatibility
-        audio = AudioSegment.from_file(audio_file)
-        audio = audio.set_frame_rate(16000).set_channels(1)
-        processed_audio_path = "processed_audio.wav"
-        audio.export(processed_audio_path, format="wav")
-        # Load the appropriate model
-        if model_size == "Faster Whisper Large v3":
-            # Define device and compute type for faster-whisper
-            device = "cuda" if torch.cuda.is_available() else "cpu"
-            compute_type = "float32" if device == "cuda" else "int8"
-            # Use faster-whisper for the Systran model
-            model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
-            segments, info = model.transcribe(
-                processed_audio_path,
-                task="transcribe",
-                word_timestamps=True,
-                repetition_penalty=1.1,
-                temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
-            )
-            transcription = " ".join([segment.text for segment in segments])
-            detected_language_code = info.language
             detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
         else:
-            # Use the standard Whisper model
-            model = whisper.load_model(MODELS[model_size])
-            # Transcribe the audio
-            if language == "Auto Detect":
-                result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
-                detected_language_code = result.get("language", "unknown")
-                detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
-            else:
-                language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
-                result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
-                detected_language = language
-            transcription = result["text"]
-        # Clean up processed audio file
-        os.remove(processed_audio_path)
-        # Return transcription and detected language
-        return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
-    except Exception as e:
-        logger.error(f"Error in transcribe_audio: {str(e)}")
-        return f"Error: {str(e)}"
 # Define the Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Audio Processing Tool")
     with gr.Tab("Detect Language"):
         gr.Markdown("Upload an audio file to detect its language.")
@@ -406,19 +276,6 @@ with gr.Blocks() as demo:
         silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
         silence_button = gr.Button("Remove Silence")
-    with gr.Tab("Detect and Trim Audio"):
-        gr.Markdown("Upload a main audio file and a target audio file. The app will detect the target audio in the main audio and trim it.")
-        main_audio_input = gr.Audio(type="filepath", label="Upload Main Audio File")
-        target_audio_input = gr.Audio(type="filepath", label="Upload Target Audio File")
-        threshold_slider = gr.Slider(
-            minimum=0.1, maximum=1.0, value=0.5, step=0.1,
-            label="Detection Threshold",
-            info="Higher values mean stricter detection."
-        )
-        trimmed_audio_output = gr.Audio(label="Trimmed Audio", type="filepath")
-        timestamps_output = gr.Textbox(label="Detected Timestamps (in seconds)")
-        detect_button = gr.Button("Detect and Trim")
     # Link buttons to functions
     detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
     transcribe_button.click(
@@ -431,11 +288,6 @@ with gr.Blocks() as demo:
         inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
         outputs=silence_output
     )
-    detect_button.click(
-        detect_and_trim_audio,
-        inputs=[main_audio_input, target_audio_input, threshold_slider],
-        outputs=[trimmed_audio_output, timestamps_output]
-    )
 # Launch the Gradio interface
 demo.launch()

 import torch
 import os
 from pydub import AudioSegment, silence
+from faster_whisper import WhisperModel  # Import faster-whisper
 # Mapping of model names to Whisper model sizes
 MODELS = {
 # Reverse mapping of language codes to full language names
 CODE_TO_LANGUAGE_NAME = {v: k for k, v in LANGUAGE_NAME_TO_CODE.items()}
 def detect_language(audio_file):
     """Detect the language of the audio file."""
+    # Define device and compute type for faster-whisper
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    compute_type = "float32" if device == "cuda" else "int8"
+    # Load the faster-whisper model for language detection
+    model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)
+    # Convert audio to 16kHz mono for better compatibility
+    audio = AudioSegment.from_file(audio_file)
+    audio = audio.set_frame_rate(16000).set_channels(1)
+    processed_audio_path = "processed_audio.wav"
+    audio.export(processed_audio_path, format="wav")
+    # Detect the language using faster-whisper
+    segments, info = model.transcribe(processed_audio_path, task="translate", language=None)
+    detected_language_code = info.language
+    # Get the full language name from the code
+    detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+    # Clean up processed audio file
+    os.remove(processed_audio_path)
+    return f"Detected Language: {detected_language}"
 def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
     """
     Returns:
         str: Path to the output audio file with silence removed.
     """
+    # Load the audio file
+    audio = AudioSegment.from_file(audio_file)
+    # Detect silent chunks
+    silent_chunks = silence.detect_silence(
+        audio,
+        min_silence_len=min_silence_len,
+        silence_thresh=silence_threshold
+    )
+    # Remove silent chunks
+    non_silent_audio = AudioSegment.empty()
+    start = 0
+    for chunk in silent_chunks:
+        non_silent_audio += audio[start:chunk[0]]  # Add non-silent part
+        start = chunk[1]  # Move to the end of the silent chunk
+    non_silent_audio += audio[start:]  # Add the remaining part
+    # Export the processed audio
+    output_path = "silence_removed_audio.wav"
+    non_silent_audio.export(output_path, format="wav")
+    return output_path
 def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
     """Transcribe the audio file."""
+    # Convert audio to 16kHz mono for better compatibility
+    audio = AudioSegment.from_file(audio_file)
+    audio = audio.set_frame_rate(16000).set_channels(1)
+    processed_audio_path = "processed_audio.wav"
+    audio.export(processed_audio_path, format="wav")
+    # Load the appropriate model
+    if model_size == "Faster Whisper Large v3":
+        # Define device and compute type for faster-whisper
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        compute_type = "float32" if device == "cuda" else "int8"
+        # Use faster-whisper for the Systran model
+        model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)
+        segments, info = model.transcribe(
+            processed_audio_path,
+            task="transcribe",
+            word_timestamps=True,
+            repetition_penalty=1.1,
+            temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
+        )
+        transcription = " ".join([segment.text for segment in segments])
+        detected_language_code = info.language
+        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
+    else:
+        # Use the standard Whisper model
+        model = whisper.load_model(MODELS[model_size])
+        # Transcribe the audio
+        if language == "Auto Detect":
+            result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
+            detected_language_code = result.get("language", "unknown")
             detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
         else:
+            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
+            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+            detected_language = language
+        transcription = result["text"]
+    # Clean up processed audio file
+    os.remove(processed_audio_path)
+    # Return transcription and detected language
+    return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
 # Define the Gradio interface
 with gr.Blocks() as demo:
+    gr.Markdown("# Audio Transcription and Language Detector")  # Updated title
     with gr.Tab("Detect Language"):
         gr.Markdown("Upload an audio file to detect its language.")
         silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
         silence_button = gr.Button("Remove Silence")
     # Link buttons to functions
     detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
     transcribe_button.click(
         inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
         outputs=silence_output
     )
 # Launch the Gradio interface
 demo.launch()