without docker
main.py → app.py
RENAMED (+7 -20)
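This commit strips the Hugging Face ZeroGPU scaffolding (import spaces, the @spaces.GPU decorator, and the zero device probe) so the script runs as a plain Gradio app. For context, a minimal sketch of the pattern being removed, reconstructed from the deleted lines below; the claim that the tensor reports a CUDA device inside the decorated function is standard ZeroGPU behavior on Spaces, not something shown in this diff:

# Sketch of the ZeroGPU pattern this commit removes (Hugging Face Spaces only).
# A GPU is attached to the process only while a @spaces.GPU function is running.
import spaces
import torch

zero = torch.Tensor([0]).cuda()
print("Outside the function, zero.device:", zero.device)  # shows 'cpu' outside the GPU context

@spaces.GPU
def gpu_probe():
    # Inside the decorated function the Space has a real GPU attached.
    print("Inside the function, zero.device:", zero.device)  # shows 'cuda:0'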
@@ -1,19 +1,13 @@
 import gradio as gr
-import spaces
 import torch
 import numpy as np
 
-#
-zero = torch.Tensor([0]).cuda()
-print("Outside the function, zero.device:", zero.device)  # Shows 'cpu' outside the GPU context
-
-# Import the voice cloner modules
-# These modules come from the Real-Time-Voice-Cloning repository, which you must have installed.
+# Import the voice cloner modules (from Real-Time-Voice-Cloning)
 from encoder import inference as encoder
 from synthesizer.inference import Synthesizer
 from vocoder import inference as vocoder
 
-# Load the pre-trained models (
+# Load the pre-trained models (adjust the paths to your environment)
 encoder_model_path = "encoder/saved_models/pretrained.pt"
 synthesizer_model_path = "synthesizer/saved_models/pretrained/pretrained.pt"
 vocoder_model_path = "vocoder/saved_models/pretrained/pretrained.pt"
@@ -22,36 +16,29 @@ encoder.load_model(encoder_model_path)
 synthesizer = Synthesizer(synthesizer_model_path)
 vocoder.load_model(vocoder_model_path)
 
-@spaces.GPU
 def clone_voice(reference_audio, text):
-    #
-    print("Inside the function, zero.device:", zero.device)
-
-    # reference_audio is expected to be a (sample_rate, np.array) tuple coming from gr.Audio
+    # reference_audio is expected to be a (sample_rate, numpy_array) tuple coming from gr.Audio
     sample_rate, audio = reference_audio
 
-    # Preprocess the audio to obtain the
+    # Preprocess the audio to obtain the appropriate format for the encoder
     preprocessed_wav = encoder.preprocess_wav(audio)
 
     # Extract the speaker embedding from the reference audio
     embed = encoder.embed_utterance(preprocessed_wav)
 
-    # Synthesize the spectrogram from the text and
+    # Synthesize the spectrogram from the text and the extracted embedding
     specs = synthesizer.synthesize_spectrograms([text], [embed])
 
     # Generate the audio waveform from the spectrogram with the vocoder
     generated_wav = vocoder.infer_waveform(specs[0])
 
-    # Optional: pad the generated audio
+    # Optional: pad the generated audio to avoid clicks or truncation
     generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")
 
     # Return the generated waveform together with the sample rate
     return (synthesizer.sample_rate, generated_wav)
 
-# Gradio interface configuration
-# - Input 1: Audio (reference voice) to upload.
-# - Input 2: Text to synthesize with the cloned voice.
-# - Output: Resulting audio.
+# Gradio interface configuration
 demo = gr.Interface(
     fn=clone_voice,
     inputs=[
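Because clone_voice is now an ordinary function with no Spaces decorator, it can be smoke-tested outside Gradio. A hypothetical local test, assuming the soundfile package is installed and a short reference.wav exists on disk:

# Hypothetical local test of clone_voice; the soundfile dependency and the
# file names are assumptions for illustration, not part of the repository.
import soundfile as sf

wav, sr = sf.read("reference.wav")  # sf.read returns (data, sample_rate)
out_sr, out_wav = clone_voice((sr, wav), "Hello, this is a cloned voice.")
sf.write("cloned.wav", out_wav, out_sr)  # save the synthesized audio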