import torch
import gradio as gr
from transformers import pipeline

# Model identifier on Hugging Face
model_id = "ilyes25/wav2vec2-large-mms-1b-DZ"

# Pick the device (GPU if available, otherwise CPU) and a matching dtype
device = 0 if torch.cuda.is_available() else -1
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Build the ASR pipeline
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model_id,
    device=device,
    torch_dtype=torch_dtype,
    framework="pt",
)

# Display name -> language/adapter code expected by the model
LANGUAGES = {
    "Kabyle": "kab",
    "Arabe": "ar",
    "Français": "fr",
}


def transcribe_audio(audio_path, lang_name):
    lang = LANGUAGES[lang_name]
    # Switch the MMS model to the selected language: load the matching
    # adapter weights and point the tokenizer at the right vocabulary
    pipe.model.load_adapter(lang)
    pipe.tokenizer.set_target_lang(lang)
    result = pipe(audio_path)
    # Strip residual special tokens from the decoded text (the token strings
    # were lost in the original source; "<s>"/"</s>" are assumed here)
    return result["text"].replace("<s>", "").replace("</s>", "").strip()


# Gradio interface with two tabs (microphone and file upload)
with gr.Blocks() as app:
    gr.Markdown("## Transcription Speech-to-Text avec sélection de langue")
    with gr.Tabs():
        with gr.TabItem("Utiliser le Microphone"):
            mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Enregistrez votre audio")
            lang_dropdown = gr.Dropdown(choices=list(LANGUAGES.keys()), label="Langue", value="Kabyle")
            mic_output = gr.Textbox(label="Transcription")
            mic_button = gr.Button("Transcrire")
            mic_button.click(transcribe_audio, inputs=[mic_input, lang_dropdown], outputs=mic_output)
        with gr.TabItem("Téléverser un Fichier Audio"):
            file_input = gr.Audio(sources=["upload"], type="filepath", label="Téléversez votre audio")
            lang_dropdown_file = gr.Dropdown(choices=list(LANGUAGES.keys()), label="Langue", value="Kabyle")
            file_output = gr.Textbox(label="Transcription")
            file_button = gr.Button("Transcrire")
            file_button.click(transcribe_audio, inputs=[file_input, lang_dropdown_file], outputs=file_output)

if __name__ == "__main__":
    app.launch()
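
# A minimal smoke test for the transcription function without launching the
# Gradio UI. This is a sketch: "sample.wav" is a hypothetical path to a local
# audio file, and "app" assumes this script is saved as app.py. Note that
# importing the module still builds the pipeline, so the model is downloaded
# on first use.
#
#     from app import transcribe_audio
#     print(transcribe_audio("sample.wav", "Kabyle"))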