Athspi committed
Commit 7fe102a · verified · 1 Parent(s): b25ba0c

Update app.py

Files changed (1): app.py (+191 -31)
app.py CHANGED
@@ -1,36 +1,196 @@
  import gradio as gr
- from faster_whisper import WhisperModel
-
- # Load the Faster Whisper model
- model = WhisperModel("large-v3", device="cpu")  # Use "cuda" for GPU acceleration
-
- # Define the transcription function
- def transcribe_audio(audio_file):
-     """
-     Transcribes the audio file using the Faster Whisper model.
-     """
-     try:
-         segments, info = model.transcribe(audio_file, beam_size=5)  # Adjust beam_size as needed
-         transcription = "\n".join(
-             [f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}" for segment in segments]
-         )
-         return transcription
-     except Exception as e:
-         return f"Error: {str(e)}"
-
- # Create the Gradio interface
- interface = gr.Interface(
-     fn=transcribe_audio,  # Function to process the input
-     inputs=gr.Audio(type="filepath", label="Upload Audio"),  # Corrected input component
-     outputs=gr.Textbox(label="Transcription"),  # Output: Textbox for the transcription
-     title="Audio-to-Text Transcription",
-     description=(
-         "Upload an audio file and get the transcription using the Faster Whisper model "
-         "large-v3. Supports high-quality transcription with beam search."
-     ),
-     allow_flagging="never",
- )
-
- # Launch the Gradio app
- if __name__ == "__main__":
-     interface.launch(server_name="0.0.0.0", server_port=7860, share=True)
  import gradio as gr
+ import whisper
+ import os
+ from pydub import AudioSegment
+
+ # Mapping of model names to Whisper model sizes
+ MODELS = {
+     "Tiny (Fastest)": "tiny",
+     "Base (Faster)": "base",
+     "Small (Balanced)": "small",
+     "Medium (Accurate)": "medium",
+     "Large (Most Accurate)": "large"
+ }
+
+ # Mapping of full language names to language codes
+ LANGUAGE_NAME_TO_CODE = {
+     "Auto Detect": "Auto Detect",
+     "English": "en",
+     "Chinese": "zh",
+     "German": "de",
+     "Spanish": "es",
+     "Russian": "ru",
+     "Korean": "ko",
+     "French": "fr",
+     "Japanese": "ja",
+     "Portuguese": "pt",
+     "Turkish": "tr",
+     "Polish": "pl",
+     "Catalan": "ca",
+     "Dutch": "nl",
+     "Arabic": "ar",
+     "Swedish": "sv",
+     "Italian": "it",
+     "Indonesian": "id",
+     "Hindi": "hi",
+     "Finnish": "fi",
+     "Vietnamese": "vi",
+     "Hebrew": "he",
+     "Ukrainian": "uk",
+     "Greek": "el",
+     "Malay": "ms",
+     "Czech": "cs",
+     "Romanian": "ro",
+     "Danish": "da",
+     "Hungarian": "hu",
+     "Tamil": "ta",
+     "Norwegian": "no",
+     "Thai": "th",
+     "Urdu": "ur",
+     "Croatian": "hr",
+     "Bulgarian": "bg",
+     "Lithuanian": "lt",
+     "Latin": "la",
+     "Maori": "mi",
+     "Malayalam": "ml",
+     "Welsh": "cy",
+     "Slovak": "sk",
+     "Telugu": "te",
+     "Persian": "fa",
+     "Latvian": "lv",
+     "Bengali": "bn",
+     "Serbian": "sr",
+     "Azerbaijani": "az",
+     "Slovenian": "sl",
+     "Kannada": "kn",
+     "Estonian": "et",
+     "Macedonian": "mk",
+     "Breton": "br",
+     "Basque": "eu",
+     "Icelandic": "is",
+     "Armenian": "hy",
+     "Nepali": "ne",
+     "Mongolian": "mn",
+     "Bosnian": "bs",
+     "Kazakh": "kk",
+     "Albanian": "sq",
+     "Swahili": "sw",
+     "Galician": "gl",
+     "Marathi": "mr",
+     "Punjabi": "pa",
+     "Sinhala": "si",  # Sinhala support
+     "Khmer": "km",
+     "Shona": "sn",
+     "Yoruba": "yo",
+     "Somali": "so",
+     "Afrikaans": "af",
+     "Occitan": "oc",
+     "Georgian": "ka",
+     "Belarusian": "be",
+     "Tajik": "tg",
+     "Sindhi": "sd",
+     "Gujarati": "gu",
+     "Amharic": "am",
+     "Yiddish": "yi",
+     "Lao": "lo",
+     "Uzbek": "uz",
+     "Faroese": "fo",
+     "Haitian Creole": "ht",
+     "Pashto": "ps",
+     "Turkmen": "tk",
+     "Nynorsk": "nn",
+     "Maltese": "mt",
+     "Sanskrit": "sa",
+     "Luxembourgish": "lb",
+     "Burmese": "my",
+     "Tibetan": "bo",
+     "Tagalog": "tl",
+     "Malagasy": "mg",
+     "Assamese": "as",
+     "Tatar": "tt",
+     "Hawaiian": "haw",
+     "Lingala": "ln",
+     "Hausa": "ha",
+     "Bashkir": "ba",
+     "Javanese": "jw",
+     "Sundanese": "su",
+ }
+
+ def detect_language(audio_file):
+     """Detect the language of the audio file."""
+     # Load the Whisper model (use "base" for faster detection)
+     model = whisper.load_model("base")
+
+     # Convert audio to 16kHz mono for better compatibility with Whisper
+     audio = AudioSegment.from_file(audio_file)
+     audio = audio.set_frame_rate(16000).set_channels(1)
+     processed_audio_path = "processed_audio.wav"
+     audio.export(processed_audio_path, format="wav")
+
+     # Detect the language from the first 30 seconds with Whisper's
+     # built-in language detector
+     audio_data = whisper.load_audio(processed_audio_path)
+     audio_data = whisper.pad_or_trim(audio_data)
+     mel = whisper.log_mel_spectrogram(audio_data).to(model.device)
+     _, probs = model.detect_language(mel)
+     detected_language = max(probs, key=probs.get)
+
+     # Clean up the processed audio file
+     os.remove(processed_audio_path)
+
+     return f"Detected Language: {detected_language}"
+
+ def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
+     """Transcribe the audio file."""
+     # Load the selected Whisper model
+     model = whisper.load_model(MODELS[model_size])
+
+     # Convert audio to 16kHz mono for better compatibility with Whisper
+     audio = AudioSegment.from_file(audio_file)
+     audio = audio.set_frame_rate(16000).set_channels(1)
+     processed_audio_path = "processed_audio.wav"
+     audio.export(processed_audio_path, format="wav")
+
+     # Transcribe the audio
+     if language == "Auto Detect":
+         result = model.transcribe(processed_audio_path, fp16=False)  # Auto-detect language
+         detected_language = result.get("language", "unknown")
+     else:
+         language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
+         result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
+         detected_language = language_code
+
+     # Clean up the processed audio file
+     os.remove(processed_audio_path)
+
+     # Return the transcription and the detected language
+     return f"Detected Language: {detected_language}\n\nTranscription:\n{result['text']}"
+
+ # Define the Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# Audio Transcription and Language Detection")
+
+     with gr.Tab("Detect Language"):
+         gr.Markdown("Upload an audio file to detect its language.")
+         detect_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
+         detect_language_output = gr.Textbox(label="Detected Language")
+         detect_button = gr.Button("Detect Language")
+
+     with gr.Tab("Transcribe Audio"):
+         gr.Markdown("Upload an audio file, select a language (or choose 'Auto Detect'), and choose a model for transcription.")
+         transcribe_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
+         language_dropdown = gr.Dropdown(
+             choices=list(LANGUAGE_NAME_TO_CODE.keys()),  # Full language names
+             label="Select Language",
+             value="Auto Detect"
+         )
+         model_dropdown = gr.Dropdown(
+             choices=list(MODELS.keys()),  # Model options
+             label="Select Model",
+             value="Base (Faster)"  # Default to the "Base" model
+         )
+         transcribe_output = gr.Textbox(label="Transcription and Detected Language")
+         transcribe_button = gr.Button("Transcribe Audio")
+
+     # Link buttons to functions
+     detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
+     transcribe_button.click(transcribe_audio, inputs=[transcribe_audio_input, language_dropdown, model_dropdown], outputs=transcribe_output)
+
+ # Launch the Gradio interface
+ demo.launch()
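
For a quick local check of this commit, a minimal sketch under assumptions not stated in the diff: app.py needs the gradio, openai-whisper, and pydub packages installed, and pydub relies on an ffmpeg binary on the PATH to decode non-WAV uploads. The snippet below exercises the same Whisper calls the app wraps, without launching the Gradio UI; "sample.wav" is a hypothetical file name.

# Standalone sketch of the pipeline behind transcribe_audio
# ("sample.wav" is assumed to exist; not part of the commit).
import whisper

model = whisper.load_model("tiny")  # smallest model, fast enough for a smoke test
result = model.transcribe("sample.wav", fp16=False)  # fp16=False matches the CPU-friendly setting above
print(result["language"])  # auto-detected language code, e.g. "en"
print(result["text"])      # the full transcription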