import gradio as gr
import torchaudio
from transformers import pipeline
import torch
from datasets import load_dataset
# Model 1: Wolof audio -> Wolof text transcription
# (the matching processor/tokenizer is loaded automatically from the model repo,
# so it does not need to be passed explicitly)
pipe_wolof = pipeline(
    task="automatic-speech-recognition",
    model="bilalfaye/wav2vec2-large-mms-1b-wolof",
    device="cuda" if torch.cuda.is_available() else "cpu",
)
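# Note: MMS wav2vec2 checkpoints expect 16 kHz mono input, hence the
# downmixing and resampling performed in transcribe_audio_wolof below.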
# Function 1: transcribe Wolof audio to Wolof text
def transcribe_audio_wolof(audio):
    # Load the audio file with torchaudio
    waveform, sample_rate = torchaudio.load(audio)

    # Downmix stereo to mono if needed
    if waveform.shape[0] > 1:
        mono_audio = waveform.mean(dim=0, keepdim=True)
    else:
        mono_audio = waveform

    # Resample to 16 kHz if necessary
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        mono_audio = resampler(mono_audio)
        sample_rate = 16000

    # Convert to a 1-D numpy array
    mono_audio = mono_audio.squeeze(0).numpy()

    # Run the ASR pipeline on the prepared audio
    result = pipe_wolof({"array": mono_audio, "sampling_rate": sample_rate})
    return result["text"]
# Model 2: Wolof text -> Wolof audio
synthesiser_wolof = pipeline("text-to-speech", "bilalfaye/speecht5_tts-wolof")
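# SpeechT5 requires a speaker x-vector at inference time to condition the
# voice; it is passed via forward_params in text_to_speech_wolof below.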
# Load speaker embeddings for the male and female voices
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding_male = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)
speaker_embedding_female = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
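# Note: index 7306 is the x-vector used in the official SpeechT5 examples
# (a female US English speaker); index 0 serves here as the male voice.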
# Function 2: convert Wolof text to Wolof audio
def text_to_speech_wolof(text, voice_type):
    embedding = speaker_embedding_male if voice_type == "Male" else speaker_embedding_female
    speech = synthesiser_wolof(text, forward_params={"speaker_embeddings": embedding})
    return speech["sampling_rate"], speech["audio"]
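# Gradio's Audio output component accepts a (sample_rate, numpy_array) tuple,
# which is exactly the shape returned above.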
# Gradio interface
with gr.Blocks() as app:
    with gr.Tab("Audio -> Text Transcription"):
        gr.Markdown("### Wolof audio-to-text transcription")
        audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Record or upload an audio file")
        transcription_output = gr.Textbox(label="Transcribed text")
        transcribe_button = gr.Button("Transcribe")
        transcribe_button.click(transcribe_audio_wolof, inputs=audio_input, outputs=transcription_output)

    with gr.Tab("Text -> Speech Synthesis"):
        gr.Markdown("### Wolof text-to-audio conversion")
        text_input = gr.Textbox(label="Enter Wolof text")
        voice_selector = gr.Radio(["Male", "Female"], label="Voice type", value="Male")
        audio_output = gr.Audio(label="Synthesized speech")
        synthesize_button = gr.Button("Synthesize")
        synthesize_button.click(text_to_speech_wolof, inputs=[text_input, voice_selector], outputs=audio_output)

# Launch the application (share=True exposes a temporary public link)
app.launch(debug=True, share=True)