import gradio as gr
import torchaudio
from transformers import pipeline
import torch
from datasets import load_dataset
# Model 1: Wolof audio -> Wolof text transcription
# (the matching processor/tokenizer is loaded automatically from the model repo,
# so it does not need to be passed explicitly)
pipe_wolof = pipeline(
    task="automatic-speech-recognition",
    model="bilalfaye/wav2vec2-large-mms-1b-wolof",
    device="cuda" if torch.cuda.is_available() else "cpu",
)
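# Note: MMS wav2vec2 checkpoints expect 16 kHz mono input, hence the
# downmixing and resampling performed in transcribe_audio_wolof below.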
# Function 1: transcribe Wolof audio to Wolof text
def transcribe_audio_wolof(audio):
    # Load the audio file with torchaudio
    waveform, sample_rate = torchaudio.load(audio)

    # Downmix stereo to mono if needed
    if waveform.shape[0] > 1:
        mono_audio = waveform.mean(dim=0, keepdim=True)
    else:
        mono_audio = waveform

    # Resample to 16 kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        mono_audio = resampler(mono_audio)
        sample_rate = 16000

    # Convert to a 1-D numpy array
    mono_audio = mono_audio.squeeze(0).numpy()

    # Run the ASR pipeline on the prepared audio
    result = pipe_wolof({"array": mono_audio, "sampling_rate": sample_rate})
    return result["text"]
# Model 2: Wolof text -> Wolof audio
synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")
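# SpeechT5 requires a speaker x-vector at inference time to condition the
# voice; it is passed via forward_params in text_to_speech_wolof below.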
# Load speaker embeddings for the male and female voices
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding_male = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
speaker_embedding_female = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
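# Note: index 7306 is the x-vector used in the official SpeechT5 examples
# (a female US English speaker); index 0 serves here as the male voice.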
# Function 2: convert Wolof text to Wolof audio
def text_to_speech_wolof(text, voice_type):
    embedding = speaker_embedding_male if voice_type == "Male" else speaker_embedding_female
    speech = synthesiser_wolof(text, forward_params={"speaker_embeddings": embedding})
    return speech["sampling_rate"], speech["audio"]
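# Gradio's Audio output component accepts a (sample_rate, numpy_array) tuple,
# which is exactly the shape returned above.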
# Gradio interface
with gr.Blocks() as app:
    with gr.Tab("Audio -> Text Transcription"):
        gr.Markdown("### Wolof audio-to-text transcription")
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Record or upload an audio file")
        transcription_output = gr.Textbox(label="Transcribed text")
        transcribe_button = gr.Button("Transcribe")
        transcribe_button.click(transcribe_audio_wolof, inputs=audio_input, outputs=transcription_output)

    with gr.Tab("Text -> Speech Synthesis"):
        gr.Markdown("### Wolof text-to-audio conversion")
        text_input = gr.Textbox(label="Enter Wolof text")
        voice_selector = gr.Radio(["Male", "Female"], label="Voice type", value="Male")
        audio_output = gr.Audio(label="Synthesized speech")
        synthesize_button = gr.Button("Synthesize")
        synthesize_button.click(text_to_speech_wolof, inputs=[text_input, voice_selector], outputs=audio_output)

# Launch the application (share=True exposes a temporary public link)
app.launch(debug=True, share=True)