import gradio as gr
import torchaudio
from transformers import pipeline
import torch
from datasets import load_dataset

# Model 1: Wolof audio -> Wolof text transcription
pipe_wolof = pipeline(
    task="automatic-speech-recognition",
    model="bilalfaye/wav2vec2-large-mms-1b-wolof",
    processor="bilalfaye/wav2vec2-large-mms-1b-wolof",
    device="cuda" if torch.cuda.is_available() else "cpu"
)

# Function 1: transcribe a Wolof audio file to Wolof text
def transcribe_audio_wolof(audio):
    # Load the audio with torchaudio
    waveform, sample_rate = torchaudio.load(audio)

    # Downmix stereo to mono
    if waveform.shape[0] > 1:
        mono_audio = waveform.mean(dim=0, keepdim=True)
    else:
        mono_audio = waveform

    # Resample to 16 kHz if necessary (expected rate of the wav2vec2 model)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        mono_audio = resampler(mono_audio)
        sample_rate = 16000

    # Convert to a NumPy array
    mono_audio = mono_audio.squeeze(0).numpy()

    # Transcribe the audio
    result = pipe_wolof({"array": mono_audio, "sampling_rate": sample_rate})
    return result["text"]

# Model 2: Wolof text -> Wolof audio
synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")

# Load speaker embeddings for the male and female voices
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding_male = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
speaker_embedding_female = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# Function 2: synthesize Wolof text into Wolof audio
def text_to_speech_wolof(text, voice_type):
    embedding = speaker_embedding_male if voice_type == "Male" else speaker_embedding_female
    speech = synthesiser_wolof(text, forward_params={"speaker_embeddings": embedding})
    return speech["sampling_rate"], speech["audio"]

# Gradio interface
with gr.Blocks() as app:
    with gr.Tab("Audio -> Text Transcription"):
        gr.Markdown("### Wolof audio to text transcription")
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Record or upload an audio file")
        transcription_output = gr.Textbox(label="Transcribed text")
        transcribe_button = gr.Button("Transcribe")
        transcribe_button.click(transcribe_audio_wolof, inputs=audio_input, outputs=transcription_output)

    with gr.Tab("Text -> Speech Synthesis"):
        gr.Markdown("### Wolof text to audio conversion")
        text_input = gr.Textbox(label="Enter Wolof text")
        voice_selector = gr.Radio(["Male", "Female"], label="Voice type", value="Male")
        audio_output = gr.Audio(label="Synthesized speech")
        synthesize_button = gr.Button("Synthesize")
        synthesize_button.click(text_to_speech_wolof, inputs=[text_input, voice_selector], outputs=audio_output)

# Launch the application
app.launch(debug=True, share=True)