Spaces:

Athspi
/

Ai-audio

Running

App Files Files Community

Athspi committed on Jan 12

Commit

1e0f1bc

verified ·

1 Parent(s): 5a84705

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -223

app.py CHANGED Viewed

@@ -1,224 +1,28 @@
 import gradio as gr
-import whisper
-import os
-from pydub import AudioSegment
-from transformers import WhisperForConditionalGeneration, WhisperProcessor
-# Mapping of model names to Whisper model sizes
-MODELS = {
-    "Tiny (Fastest)": "tiny",
-    "Base (Faster)": "base",
-    "Small (Balanced)": "small",
-    "Medium (Accurate)": "medium",
-    "Large (Most Accurate)": "large",
-    "Fine-Tuned Hindi": "yash-04/whisper-base-hindi",  # Hindi fine-tuned model
-    "Fine-Tuned Tamil": "mahimairaja/whisper-base-tamil"  # Tamil fine-tuned model
-}
-# Mapping of full language names to language codes
-LANGUAGE_NAME_TO_CODE = {
-    "Auto Detect": "Auto Detect",
-    "English": "en",
-    "Chinese": "zh",
-    "German": "de",
-    "Spanish": "es",
-    "Russian": "ru",
-    "Korean": "ko",
-    "French": "fr",
-    "Japanese": "ja",
-    "Portuguese": "pt",
-    "Turkish": "tr",
-    "Polish": "pl",
-    "Catalan": "ca",
-    "Dutch": "nl",
-    "Arabic": "ar",
-    "Swedish": "sv",
-    "Italian": "it",
-    "Indonesian": "id",
-    "Hindi": "hi",
-    "Finnish": "fi",
-    "Vietnamese": "vi",
-    "Hebrew": "he",
-    "Ukrainian": "uk",
-    "Greek": "el",
-    "Malay": "ms",
-    "Czech": "cs",
-    "Romanian": "ro",
-    "Danish": "da",
-    "Hungarian": "hu",
-    "Tamil": "ta",
-    "Norwegian": "no",
-    "Thai": "th",
-    "Urdu": "ur",
-    "Croatian": "hr",
-    "Bulgarian": "bg",
-    "Lithuanian": "lt",
-    "Latin": "la",
-    "Maori": "mi",
-    "Malayalam": "ml",
-    "Welsh": "cy",
-    "Slovak": "sk",
-    "Telugu": "te",
-    "Persian": "fa",
-    "Latvian": "lv",
-    "Bengali": "bn",
-    "Serbian": "sr",
-    "Azerbaijani": "az",
-    "Slovenian": "sl",
-    "Kannada": "kn",
-    "Estonian": "et",
-    "Macedonian": "mk",
-    "Breton": "br",
-    "Basque": "eu",
-    "Icelandic": "is",
-    "Armenian": "hy",
-    "Nepali": "ne",
-    "Mongolian": "mn",
-    "Bosnian": "bs",
-    "Kazakh": "kk",
-    "Albanian": "sq",
-    "Swahili": "sw",
-    "Galician": "gl",
-    "Marathi": "mr",
-    "Punjabi": "pa",
-    "Sinhala": "si",
-    "Khmer": "km",
-    "Shona": "sn",
-    "Yoruba": "yo",
-    "Somali": "so",
-    "Afrikaans": "af",
-    "Occitan": "oc",
-    "Georgian": "ka",
-    "Belarusian": "be",
-    "Tajik": "tg",
-    "Sindhi": "sd",
-    "Gujarati": "gu",
-    "Amharic": "am",
-    "Yiddish": "yi",
-    "Lao": "lo",
-    "Uzbek": "uz",
-    "Faroese": "fo",
-    "Haitian Creole": "ht",
-    "Pashto": "ps",
-    "Turkmen": "tk",
-    "Nynorsk": "nn",
-    "Maltese": "mt",
-    "Sanskrit": "sa",
-    "Luxembourgish": "lb",
-    "Burmese": "my",
-    "Tibetan": "bo",
-    "Tagalog": "tl",
-    "Malagasy": "mg",
-    "Assamese": "as",
-    "Tatar": "tt",
-    "Hawaiian": "haw",
-    "Lingala": "ln",
-    "Hausa": "ha",
-    "Bashkir": "ba",
-    "Javanese": "jw",
-    "Sundanese": "su",
-}
-def detect_language(audio_file):
-    """Detect the language of the audio file."""
-    # Load the Whisper model (use "base" for faster detection)
-    model = whisper.load_model("base")
-    # Convert audio to 16kHz mono for better compatibility with Whisper
-    audio = AudioSegment.from_file(audio_file)
-    audio = audio.set_frame_rate(16000).set_channels(1)
-    processed_audio_path = "processed_audio.wav"
-    audio.export(processed_audio_path, format="wav")
-    # Detect the language
-    result = model.transcribe(processed_audio_path, task="detect_language", fp16=False)
-    detected_language = result.get("language", "unknown")
-    # Clean up processed audio file
-    os.remove(processed_audio_path)
-    return f"Detected Language: {detected_language}"
-def transcribe_audio(audio_file, language="Auto Detect", model_size="Base (Faster)"):
-    """Transcribe the audio file."""
-    # Map language to fine-tuned model
-    language_to_model = {
-        "Hindi": "yash-04/whisper-base-hindi",
-        "Tamil": "mahimairaja/whisper-base-tamil",
-        # Add more mappings as needed
-    }
-    # Load the selected Whisper model
-    if language in language_to_model:
-        model_name = language_to_model[language]
-        model = WhisperForConditionalGeneration.from_pretrained(model_name)
-        processor = WhisperProcessor.from_pretrained(model_name)
-    else:
-        model = whisper.load_model(MODELS[model_size])
-        processor = None  # Use default Whisper processor
-    # Convert audio to 16kHz mono for better compatibility with Whisper
-    audio = AudioSegment.from_file(audio_file)
-    audio = audio.set_frame_rate(16000).set_channels(1)
-    processed_audio_path = "processed_audio.wav"
-    audio.export(processed_audio_path, format="wav")
-    # Transcribe the audio
-    if language == "Auto Detect":
-        if processor:
-            inputs = processor(processed_audio_path, return_tensors="pt", sampling_rate=16000)
-            result = model.generate(inputs.input_features)
-            transcription = processor.batch_decode(result, skip_special_tokens=True)[0]
-        else:
-            result = model.transcribe(processed_audio_path, fp16=False)
-            transcription = result["text"]
-        detected_language = result.get("language", "unknown")
-    else:
-        language_code = LANGUAGE_NAME_TO_CODE.get(language, "en")  # Default to English if not found
-        if processor:
-            inputs = processor(processed_audio_path, return_tensors="pt", sampling_rate=16000)
-            result = model.generate(inputs.input_features, language=language_code)
-            transcription = processor.batch_decode(result, skip_special_tokens=True)[0]
-        else:
-            result = model.transcribe(processed_audio_path, language=language_code, fp16=False)
-            transcription = result["text"]
-        detected_language = language_code
-    # Clean up processed audio file
-    os.remove(processed_audio_path)
-    # Return transcription and detected language
-    return f"Detected Language: {detected_language}\n\nTranscription:\n{transcription}"
-# Define the Gradio interface
-with gr.Blocks() as demo:
-    gr.Markdown("# Audio Transcription and Language Detection")
-    with gr.Tab("Detect Language"):
-        gr.Markdown("Upload an audio file to detect its language.")
-        detect_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
-        detect_language_output = gr.Textbox(label="Detected Language")
-        detect_button = gr.Button("Detect Language")
-    with gr.Tab("Transcribe Audio"):
-        gr.Markdown("Upload an audio file, select a language (or choose 'Auto Detect'), and choose a model for transcription.")
-        transcribe_audio_input = gr.Audio(type="filepath", label="Upload Audio File")
-        language_dropdown = gr.Dropdown(
-            choices=list(LANGUAGE_NAME_TO_CODE.keys()),  # Full language names
-            label="Select Language",
-            value="Auto Detect"
-        )
-        model_dropdown = gr.Dropdown(
-            choices=list(MODELS.keys()),  # Model options
-            label="Select Model",
-            value="Base (Faster)"  # Default to "Base" model
-        )
-        transcribe_output = gr.Textbox(label="Transcription and Detected Language")
-        transcribe_button = gr.Button("Transcribe Audio")
-    # Link buttons to functions
-    detect_button.click(detect_language, inputs=detect_audio_input, outputs=detect_language_output)
-    transcribe_button.click(transcribe_audio, inputs=[transcribe_audio_input, language_dropdown, model_dropdown], outputs=transcribe_output)
-# Launch the Gradio interface
-demo.launch()

 import gradio as gr
+from transformers import pipeline
+# Load the Whisper model from Hugging Face
+# model_name is the model identifier passed to transformers.pipeline below.
+model_name = "Subhaka/whisper-small-Sinhala-Fine_Tune"
+# Built at module level, so the speech-recognition pipeline is constructed
+# when this module is imported and the same object is reused by every
+# transcribe_audio() call.
+transcriber = pipeline("automatic-speech-recognition", model=model_name)
+# Define a transcription function
+def transcribe_audio(audio_file):
+    """Transcribe one audio input with the module-level ``transcriber``.
+
+    Args:
+        audio_file: Value supplied by the Gradio audio input. The interface
+            is configured with ``type="filepath"``, so this is presumably a
+            path string (possibly None when nothing was uploaded) — TODO
+            confirm against the Gradio version in use.
+
+    Returns:
+        The ``"text"`` entry of the pipeline result on success, otherwise a
+        string of the form ``"Error: <message>"``.
+    """
+    try:
+        transcription = transcriber(audio_file)["text"]
+        return transcription
+    except Exception as e:
+        # Broad catch: any failure is turned into text and returned, so it is
+        # shown in the output Textbox instead of propagating to the caller.
+        return f"Error: {str(e)}"
+# Create Gradio interface
+# One audio input wired to transcribe_audio(); the string it returns is
+# displayed in a single textbox.
+interface = gr.Interface(
+    fn=transcribe_audio,
+    # NOTE(review): `source=` appears to have been replaced by
+    # `sources=[...]` in Gradio 4 — confirm the Gradio version this Space pins.
+    inputs=gr.Audio(source="upload", type="filepath", label="Upload Audio"),
+    outputs=gr.Textbox(label="Transcription"),
+    title="Sinhala Audio-to-Text Transcription",
+    description="Upload an audio file and get the transcription in Sinhala using the Whisper model fine-tuned by Subhaka.",
+    # NOTE(review): newer Gradio releases reportedly deprecate
+    # `allow_flagging` in favour of `flagging_mode` — confirm before upgrading.
+    allow_flagging="never"
+)
+# Launch the Gradio app
+# Guarded so the server only starts when this file is run as a script, not
+# when it is imported.
+if __name__ == "__main__":
+    # Serves on host 0.0.0.0, port 7860.
+    # NOTE(review): share=True asks Gradio for a public share link; this is
+    # reportedly not supported on Hugging Face Spaces — confirm whether it
+    # should stay.
+    interface.launch(server_name="0.0.0.0", server_port=7860, share=True)