import gradio as gr
import torchaudio
from transformers import pipeline
import torch
from datasets import load_dataset

# Model 1: Wolof audio -> Wolof text transcription
pipe_wolof = pipeline(
    task="automatic-speech-recognition",
    model="bilalfaye/wav2vec2-large-mms-1b-wolof",
    processor="bilalfaye/wav2vec2-large-mms-1b-wolof",
    device="cuda" if torch.cuda.is_available() else "cpu"
)

# Function 1: transcribe a Wolof audio file to Wolof text
def transcribe_audio_wolof(audio):
    # Load the audio with torchaudio
    waveform, sample_rate = torchaudio.load(audio)

    # Downmix stereo to mono
    if waveform.shape[0] > 1:
        mono_audio = waveform.mean(dim=0, keepdim=True)
    else:
        mono_audio = waveform

    # Resample to 16 kHz if necessary (expected rate of the wav2vec2 model)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        mono_audio = resampler(mono_audio)
        sample_rate = 16000

    # Convert to a NumPy array
    mono_audio = mono_audio.squeeze(0).numpy()

    # Transcribe the audio
    result = pipe_wolof({"array": mono_audio, "sampling_rate": sample_rate})
    return result["text"]

# Model 2: Wolof text -> Wolof audio
synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")

# Load speaker embeddings for the male and female voices
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding_male = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
speaker_embedding_female = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Function 2: synthesize Wolof text into Wolof audio
def text_to_speech_wolof(text, voice_type):
    embedding = speaker_embedding_male if voice_type == "Male" else speaker_embedding_female
    speech = synthesiser_wolof(text, forward_params={"speaker_embeddings": embedding})
    return speech["sampling_rate"], speech["audio"]

# Gradio interface
with gr.Blocks() as app:
    with gr.Tab("Audio -> Text Transcription"):
        gr.Markdown("### Wolof audio to text transcription")
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Record or upload an audio file")
        transcription_output = gr.Textbox(label="Transcribed text")
        transcribe_button = gr.Button("Transcribe")
        transcribe_button.click(transcribe_audio_wolof, inputs=audio_input, outputs=transcription_output)

    with gr.Tab("Text -> Speech Synthesis"):
        gr.Markdown("### Wolof text to audio conversion")
        text_input = gr.Textbox(label="Enter Wolof text")
        voice_selector = gr.Radio(["Male", "Female"], label="Voice type", value="Male")
        audio_output = gr.Audio(label="Synthesized speech")
        synthesize_button = gr.Button("Synthesize")
        synthesize_button.click(text_to_speech_wolof, inputs=[text_input, voice_selector], outputs=audio_output)

# Launch the application
app.launch(debug=True, share=True)