# Clona_voz / app.py
# Author: Segizu — "sin docker" build (commit 445117f)
import gradio as gr
import torch
import numpy as np
# Import the voice-cloning modules (from Real-Time-Voice-Cloning)
from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder
# Load the pre-trained models (adjust the paths for your environment).
# Side effects: initializes module-level state inside `encoder` and `vocoder`
# via their load_model() calls, and binds the module-level `synthesizer`
# instance that clone_voice() uses below.
encoder_model_path = "encoder/saved_models/pretrained.pt"
synthesizer_model_path = "synthesizer/saved_models/pretrained/pretrained.pt"
vocoder_model_path = "vocoder/saved_models/pretrained/pretrained.pt"
encoder.load_model(encoder_model_path)
synthesizer = Synthesizer(synthesizer_model_path)
vocoder.load_model(vocoder_model_path)
def clone_voice(reference_audio, text):
    """Synthesize `text` in the voice of the reference recording.

    Parameters:
        reference_audio: tuple (sample_rate, numpy_array) as produced by
            gr.Audio(type="numpy") — int16 PCM, possibly multi-channel.
        text: the sentence to synthesize in the cloned voice.

    Returns:
        (sample_rate, waveform) tuple suitable for a gr.Audio output.
    """
    sample_rate, audio = reference_audio
    # gr.Audio(type="numpy") yields int16 PCM; the encoder expects a mono
    # float waveform in [-1, 1]. Downmix channels and normalize the dtype.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)
    if np.issubdtype(audio.dtype, np.integer):
        audio = audio.astype(np.float32) / np.iinfo(audio.dtype).max
    else:
        audio = audio.astype(np.float32)
    # Bug fix: pass the actual source sample rate so preprocess_wav resamples
    # to the encoder's expected rate. The original call dropped `sample_rate`,
    # silently assuming the upload already matched the model's rate.
    preprocessed_wav = encoder.preprocess_wav(audio, source_sr=sample_rate)
    # Extract the speaker embedding from the reference audio.
    embed = encoder.embed_utterance(preprocessed_wav)
    # Synthesize the mel spectrogram from the text and the extracted embedding.
    specs = synthesizer.synthesize_spectrograms([text], [embed])
    # Generate the waveform from the spectrogram with the vocoder.
    generated_wav = vocoder.infer_waveform(specs[0])
    # Pad one second of silence to avoid clicks/truncation at the tail.
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
    # Return the generated waveform along with its sample rate.
    return (synthesizer.sample_rate, generated_wav)
# Gradio UI: a reference-audio upload plus the text to speak in the cloned
# voice; the output is the generated audio clip from clone_voice().
# NOTE(review): `source=` on gr.Audio exists only in Gradio 3.x — it was
# renamed to `sources=[...]` in Gradio 4.x. Confirm the pinned Gradio version.
demo = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Audio(source="upload", type="numpy", label="Voz de Referencia"),
        gr.Textbox(label="Texto a Clonar")
    ],
    outputs=gr.Audio(label="Voz Clonada")
)
if __name__ == "__main__":
    demo.launch()