sin docker
- app.py +64 -0
- requirements.txt +7 -0
app.py
ADDED
@@ -0,0 +1,64 @@
import gradio as gr
import spaces
import torch
import numpy as np

# Demonstration of CUDA use under ZeroGPU
zero = torch.Tensor([0]).cuda()
print("Outside the function, zero.device:", zero.device)  # Shows 'cpu' outside the GPU context

# Import the voice-cloning modules.
# These modules come from the Real-Time-Voice-Cloning repository, which must be installed.
from encoder import inference as encoder
from synthesizer.inference import Synthesizer
from vocoder import inference as vocoder

# Load the pretrained models (adjust the paths as needed)
encoder_model_path = "encoder/saved_models/pretrained.pt"
synthesizer_model_path = "synthesizer/saved_models/pretrained/pretrained.pt"
vocoder_model_path = "vocoder/saved_models/pretrained/pretrained.pt"

encoder.load_model(encoder_model_path)
synthesizer = Synthesizer(synthesizer_model_path)
vocoder.load_model(vocoder_model_path)

@spaces.GPU
def clone_voice(reference_audio, text):
    # Inside the function, the GPU context is active
    print("Inside the function, zero.device:", zero.device)

    # reference_audio is expected to be a (sample_rate, np.array) tuple from gr.Audio
    sample_rate, audio = reference_audio

    # gr.Audio with type="numpy" delivers int16 samples; convert to float in [-1, 1]
    if audio.dtype != np.float32:
        audio = audio.astype(np.float32) / 32768.0

    # Preprocess the audio into the wav the encoder expects, resampling from the source rate
    preprocessed_wav = encoder.preprocess_wav(audio, source_sr=sample_rate)

    # Extract the speaker embedding from the reference audio
    embed = encoder.embed_utterance(preprocessed_wav)

    # Synthesize the spectrogram from the text and the extracted embedding
    specs = synthesizer.synthesize_spectrograms([text], [embed])

    # Generate the audio waveform from the spectrogram with the vocoder
    generated_wav = vocoder.infer_waveform(specs[0])

    # Optional: pad the generated audio (this can help avoid clicks or truncation)
    generated_wav = np.pad(generated_wav, (0, synthesizer.sample_rate), mode="constant")

    # Return the generated waveform together with its sample rate
    return (synthesizer.sample_rate, generated_wav)

# Gradio interface configuration:
# - Input 1: Audio (reference voice) to upload.
# - Input 2: Text to synthesize with the cloned voice.
# - Output: Resulting audio.
demo = gr.Interface(
    fn=clone_voice,
    inputs=[
        gr.Audio(sources=["upload"], type="numpy", label="Reference Voice"),  # Gradio 4.x uses sources=[...]
        gr.Textbox(label="Text to Clone"),
    ],
    outputs=gr.Audio(label="Cloned Voice"),
)

demo.launch()
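For a quick sanity check of clone_voice outside the Gradio UI, a minimal sketch like the one below can be run in the same session. It assumes a CUDA-capable machine (app.py moves a tensor to the GPU at import time; the @spaces.GPU decorator is a no-op outside Spaces), the soundfile package, and a short sample.wav next to app.py; none of these are part of this commit.

# Hypothetical smoke test for clone_voice; soundfile and sample.wav are assumptions.
import soundfile as sf

wav, sr = sf.read("sample.wav")  # sf.read returns (data, samplerate)
out_sr, out_wav = clone_voice((sr, wav), "Hello, this is a cloned voice.")
sf.write("cloned.wav", out_wav, out_sr)  # save the result for listening
print("Wrote cloned.wav at", out_sr, "Hz")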
requirements.txt
ADDED
@@ -0,0 +1,7 @@
gradio
spaces
torch
numpy
# encoder, synthesizer, and vocoder are local modules from the
# Real-Time-Voice-Cloning repository, not pip packages;
# clone that repository alongside app.py instead of listing them here.
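Because those last three modules come from a Real-Time-Voice-Cloning checkout rather than pip, a small import check (a sketch, using the module names from app.py's imports) can confirm they resolve before the Space starts:

# Sketch: verify the Real-Time-Voice-Cloning modules are importable.
import importlib

for name in ("encoder.inference", "synthesizer.inference", "vocoder.inference"):
    importlib.import_module(name)
print("All Real-Time-Voice-Cloning modules found.")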