import gradio as gr
import whisper
import torch
import os
from pydub import AudioSegment, silence
from faster_whisper import WhisperModel  # Import faster-whisper
from spleeter.separator import Separator  # Import Spleeter for music separation

# Mapping of model names to Whisper model sizes
MODELS = {
    "Tiny (Fastest)": "tiny",
    "Base (Faster)": "base",
    "Small (Balanced)": "small",
    "Medium (Accurate)": "medium",
    "Large (Most Accurate)": "large",
    "Faster Whisper Large v3": "Systran/faster-whisper-large-v3",  # Renamed and set as default
}

# Mapping of full language names to language codes
LANGUAGE_NAME_TO_CODE = {
    "Auto Detect": "Auto Detect",
    "English": "en", "Chinese": "zh", "German": "de", "Spanish": "es", "Russian": "ru",
    "Korean": "ko", "French": "fr", "Japanese": "ja", "Portuguese": "pt", "Turkish": "tr",
    "Polish": "pl", "Catalan": "ca", "Dutch": "nl", "Arabic": "ar", "Swedish": "sv",
    "Italian": "it", "Indonesian": "id", "Hindi": "hi", "Finnish": "fi", "Vietnamese": "vi",
    "Hebrew": "he", "Ukrainian": "uk", "Greek": "el", "Malay": "ms", "Czech": "cs",
    "Romanian": "ro", "Danish": "da", "Hungarian": "hu", "Tamil": "ta", "Norwegian": "no",
    "Thai": "th", "Urdu": "ur", "Croatian": "hr", "Bulgarian": "bg", "Lithuanian": "lt",
    "Latin": "la", "Maori": "mi", "Malayalam": "ml", "Welsh": "cy", "Slovak": "sk",
    "Telugu": "te", "Persian": "fa", "Latvian": "lv", "Bengali": "bn", "Serbian": "sr",
    "Azerbaijani": "az", "Slovenian": "sl", "Kannada": "kn", "Estonian": "et", "Macedonian": "mk",
    "Breton": "br", "Basque": "eu", "Icelandic": "is", "Armenian": "hy", "Nepali": "ne",
    "Mongolian": "mn", "Bosnian": "bs", "Kazakh": "kk", "Albanian": "sq", "Swahili": "sw",
    "Galician": "gl", "Marathi": "mr", "Punjabi": "pa", "Sinhala": "si",  # Sinhala support
    "Khmer": "km", "Shona": "sn", "Yoruba": "yo", "Somali": "so", "Afrikaans": "af",
    "Occitan": "oc", "Georgian": "ka", "Belarusian": "be", "Tajik": "tg", "Sindhi": "sd",
    "Gujarati": "gu", "Amharic": "am", "Yiddish": "yi", "Lao": "lo", "Uzbek": "uz",
    "Faroese": "fo", "Haitian Creole": "ht", "Pashto": "ps", "Turkmen": "tk", "Nynorsk": "nn",
    "Maltese": "mt", "Sanskrit": "sa", "Luxembourgish": "lb", "Burmese": "my", "Tibetan": "bo",
    "Tagalog": "tl", "Malagasy": "mg", "Assamese": "as", "Tatar": "tt", "Hawaiian": "haw",
    "Lingala": "ln", "Hausa": "ha", "Bashkir": "ba", "Javanese": "jw", "Sundanese": "su",
}

# Reverse mapping of language codes to full language names
CODE_TO_LANGUAGE_NAME = {v: k for k, v in LANGUAGE_NAME_TO_CODE.items()}


def detect_language(audio_file):
    """Detect the language of the audio file."""
    # Define device and compute type for faster-whisper
    device = "cuda" if torch.cuda.is_available() else "cpu"
    compute_type = "float32" if device == "cuda" else "int8"

    # Load the faster-whisper model for language detection
    model = WhisperModel(MODELS["Faster Whisper Large v3"], device=device, compute_type=compute_type)

    # Convert audio to 16kHz mono for better compatibility
    audio = AudioSegment.from_file(audio_file)
    audio = audio.set_frame_rate(16000).set_channels(1)
    processed_audio_path = "processed_audio.wav"
    audio.export(processed_audio_path, format="wav")

    # Detect the language using faster-whisper (language=None triggers auto-detection)
    segments, info = model.transcribe(processed_audio_path, task="translate", language=None)
    detected_language_code = info.language

    # Get the full language name from the code
    detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")

    # Clean up processed audio file
    os.remove(processed_audio_path)

    return f"Detected Language: {detected_language}"

def remove_silence(audio_file, silence_threshold=-40, min_silence_len=500):
    """
    Remove silence from the audio file using pydub's threshold-based silence detection.

    Args:
        audio_file (str): Path to the input audio file.
        silence_threshold (int): Silence threshold in dBFS. Default is -40 dB.
        min_silence_len (int): Minimum length of silence to remove, in milliseconds. Default is 500 ms.

    Returns:
        str: Path to the output audio file with silence removed.
    """
    # Load the audio file
    audio = AudioSegment.from_file(audio_file)

    # Detect silent chunks
    silent_chunks = silence.detect_silence(
        audio, min_silence_len=min_silence_len, silence_thresh=silence_threshold
    )

    # Remove silent chunks
    non_silent_audio = AudioSegment.empty()
    start = 0
    for chunk in silent_chunks:
        non_silent_audio += audio[start:chunk[0]]  # Add non-silent part
        start = chunk[1]  # Move to the end of the silent chunk
    non_silent_audio += audio[start:]  # Add the remaining part

    # Export the processed audio
    output_path = "silence_removed_audio.wav"
    non_silent_audio.export(output_path, format="wav")
    return output_path


def remove_background_music(audio_file):
    """
    Remove background music from the audio file using Spleeter.

    Args:
        audio_file (str): Path to the input audio file.

    Returns:
        str: Path to the output audio file with background music removed.
    """
    # Initialize Spleeter separator (2 stems: vocals and accompaniment)
    separator = Separator('spleeter:2stems')

    # Separate the audio into vocals and accompaniment
    output_folder = "output"
    separator.separate_to_file(audio_file, output_folder)

    # Load the separated vocals
    base_name = os.path.splitext(os.path.basename(audio_file))[0]
    vocals_path = os.path.join(output_folder, base_name, "vocals.wav")

    # Return the path to the vocals file
    return vocals_path


def transcribe_audio(audio_file, language="Auto Detect", model_size="Faster Whisper Large v3"):
    """Transcribe the audio file."""
    # Convert audio to 16kHz mono for better compatibility
    audio = AudioSegment.from_file(audio_file)
    audio = audio.set_frame_rate(16000).set_channels(1)
    processed_audio_path = "processed_audio.wav"
    audio.export(processed_audio_path, format="wav")

    # Load the appropriate model
    if model_size == "Faster Whisper Large v3":
        # Define device and compute type for faster-whisper
        device = "cuda" if torch.cuda.is_available() else "cpu"
        compute_type = "float32" if device == "cuda" else "int8"

        # Use faster-whisper for the Systran model
        model = WhisperModel(MODELS[model_size], device=device, compute_type=compute_type)

        # Honour the selected language; None lets faster-whisper auto-detect it
        language_code = None if language == "Auto Detect" else LANGUAGE_NAME_TO_CODE.get(language)
        segments, info = model.transcribe(
            processed_audio_path,
            task="transcribe",
            language=language_code,
            word_timestamps=True,
            repetition_penalty=1.1,
            temperature=[0.0, 0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0],
        )
        transcription = " ".join(segment.text for segment in segments)
        detected_language_code = info.language
        detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
    else:
        # Use the standard Whisper model
        model = whisper.load_model(MODELS[model_size])

        # Transcribe the audio
        if language == "Auto Detect":
            result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
            detected_language_code = result.get("language", "unknown")
            detected_language = CODE_TO_LANGUAGE_NAME.get(detected_language_code, "Unknown Language")
        else:
            language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
            detected_language = language
        transcription = result["text"]

    # Clean up processed audio file
    os.remove(processed_audio_path)
    # Return transcription and detected language
    return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"


# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Audio Transcription and Language Detection")

    with gr.Tab("Detect Language"):
        gr.Markdown("Upload an audio file to detect its language.")
        detect_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        detect_language_output = gr.Textbox(label="Detected Language")
        detect_button = gr.Button("Detect Language")

    with gr.Tab("Transcribe Audio"):
        gr.Markdown("Upload an audio file, select a language (or choose 'Auto Detect'), and choose a model for transcription.")
        transcribe_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        language_dropdown = gr.Dropdown(
            choices=list(LANGUAGE_NAME_TO_CODE.keys()),  # Full language names
            label="Select Language",
            value="Auto Detect"
        )
        model_dropdown = gr.Dropdown(
            choices=list(MODELS.keys()),  # Model options
            label="Select Model",
            value="Faster Whisper Large v3",  # Default to "Faster Whisper Large v3"
            interactive=True  # Allow model selection by default
        )
        transcribe_output = gr.Textbox(label="Transcription and Detected Language")
        transcribe_button = gr.Button("Transcribe Audio")

    with gr.Tab("Remove Silence"):
        gr.Markdown("Upload an audio file to remove silence.")
        silence_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        silence_threshold_slider = gr.Slider(
            minimum=-60,
            maximum=-20,
            value=-40,
            step=1,
            label="Silence Threshold (dB)",
            info="Only audio quieter than this threshold is treated as silence."
        )
        min_silence_len_slider = gr.Slider(
            minimum=100,
            maximum=2000,
            value=500,
            step=100,
            label="Minimum Silence Length (ms)",
            info="Minimum duration of silence to remove."
        )
        silence_output = gr.Audio(label="Processed Audio (Silence Removed)", type="filepath")
        silence_button = gr.Button("Remove Silence")

    with gr.Tab("Remove Background Music"):
        gr.Markdown("Upload an audio file to remove background music.")
        bg_music_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        bg_music_output = gr.Audio(label="Processed Audio (Background Music Removed)", type="filepath")
        bg_music_button = gr.Button("Remove Background Music")

    # Link buttons to functions
    detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
    transcribe_button.click(
        transcribe_audio,
        inputs=[transcribe_audio_input, language_dropdown, model_dropdown],
        outputs=transcribe_output
    )
    silence_button.click(
        remove_silence,
        inputs=[silence_audio_input, silence_threshold_slider, min_silence_len_slider],
        outputs=silence_output
    )
    bg_music_button.click(
        remove_background_music,
        inputs=bg_music_audio_input,
        outputs=bg_music_output
    )

# Launch the Gradio interface
demo.launch()
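
# Dependency note (an assumption based on the imports above, not stated in the source):
# the script appears to rely on the usual PyPI packages gradio, openai-whisper,
# faster-whisper, torch, pydub and spleeter, plus an ffmpeg binary on PATH for
# audio decoding. A typical install might look like:
#   pip install gradio openai-whisper faster-whisper torch pydub spleeter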