import gradio as gr
import torch
import torchaudio
from datasets import load_dataset
from transformers import pipeline

# Model 1: Wolof audio -> Wolof text transcription
# (the pipeline loads the matching processor from the model repo automatically,
# so no separate processor argument is needed)
pipe_wolof = pipeline(
    task="automatic-speech-recognition",
    model="bilalfaye/wav2vec2-large-mms-1b-wolof",
    device="cuda" if torch.cuda.is_available() else "cpu",
)
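
# Note: wav2vec2 / MMS checkpoints are trained on 16 kHz mono audio, so
# transcribe_audio_wolof below downmixes and resamples before inference.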

# Function 1: transcribe Wolof audio to Wolof text
def transcribe_audio_wolof(audio):
    # Load the audio file with torchaudio
    waveform, sample_rate = torchaudio.load(audio)

    # Downmix stereo to mono
    if waveform.shape[0] > 1:
        mono_audio = waveform.mean(dim=0, keepdim=True)
    else:
        mono_audio = waveform

    # Resample to 16 kHz if needed
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        mono_audio = resampler(mono_audio)
        sample_rate = 16000

    # Convert to a 1-D numpy array
    mono_audio = mono_audio.squeeze(0).numpy()

    # Run the ASR pipeline
    result = pipe_wolof({"array": mono_audio, "sampling_rate": sample_rate})
    return result["text"]
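
# Example call (hypothetical path): transcribe_audio_wolof("clip.wav")
# returns the recognized Wolof text as a plain string.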

# Model 2: Wolof text -> Wolof audio
synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")

# Load speaker embeddings for the male and female voices
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding_male = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
speaker_embedding_female = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
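
# SpeechT5 conditions generation on a 512-dim x-vector speaker embedding;
# index 7306 is the female voice used in the Hugging Face SpeechT5 examples,
# and index 0 serves as the male voice here.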

# Function 2: convert Wolof text to Wolof speech
def text_to_speech_wolof(text, voice_type):
    embedding = speaker_embedding_male if voice_type == "Male" else speaker_embedding_female
    speech = synthesiser_wolof(text, forward_params={"speaker_embeddings": embedding})
    return speech["sampling_rate"], speech["audio"]
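
# The (sampling_rate, waveform) tuple matches the output format gr.Audio
# accepts from a function, so Gradio can play the result directly.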

# Gradio interface
with gr.Blocks() as app:
    with gr.Tab("Audio -> Text Transcription"):
        gr.Markdown("### Wolof audio to text transcription")
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Record or upload an audio file")
        transcription_output = gr.Textbox(label="Transcribed text")
        transcribe_button = gr.Button("Transcribe")
        transcribe_button.click(transcribe_audio_wolof, inputs=audio_input, outputs=transcription_output)

    with gr.Tab("Text -> Speech Synthesis"):
        gr.Markdown("### Wolof text to speech conversion")
        text_input = gr.Textbox(label="Enter Wolof text")
        voice_selector = gr.Radio(["Male", "Female"], label="Voice type", value="Male")
        audio_output = gr.Audio(label="Synthesized speech")
        synthesize_button = gr.Button("Synthesize")
        synthesize_button.click(text_to_speech_wolof, inputs=[text_input, voice_selector], outputs=audio_output)

# Launch the app
app.launch(debug=True, share=True)
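
# share=True exposes the app through a temporary public Gradio link;
# debug=True keeps the process attached so tracebacks show in the console.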