Create app.py
app.py
ADDED
@@ -0,0 +1,56 @@
import io
import os

from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir

import gradio as gr
from scipy.io.wavfile import write

# Agree to the Coqui model license non-interactively, then let the TTS API
# download the XTTS v1 files into the user data dir.
os.environ["COQUI_TOS_AGREED"] = "1"
TTS("tts_models/multilingual/multi-dataset/xtts_v1")

# Load the downloaded checkpoint manually so the low-level streaming
# inference API can be used.
model_path = os.path.join(
    get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1"
)
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=True,
)
model.cuda()


def stream_audio(synthesis_text):
    # Condition the model on the bundled reference clip, then stream audio
    # chunks as they are generated.
    gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(
        audio_path="female.wav"
    )

    chunks = model.inference_stream(
        synthesis_text,
        "en",
        gpt_cond_latent,
        speaker_embedding,
        stream_chunk_size=10,
        overlap_wav_len=512,
    )

    for i, chunk in enumerate(chunks):
        print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
        # Wrap each chunk in a standalone 24 kHz WAV payload. scipy's write()
        # seeks back to the start of file-like objects when it finishes, so
        # read() returns the whole buffer.
        byte_io = io.BytesIO()
        write(byte_io, 24000, chunk.detach().cpu().numpy().squeeze())
        yield byte_io.read()


demo = gr.Interface(
    fn=stream_audio,
    inputs=gr.Textbox(),
    outputs=gr.Audio(autoplay=True, streaming=True),
)

if __name__ == "__main__":
    demo.queue().launch(debug=True)
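For a quick sanity check of the generator outside Gradio, something like the following sketch could work (hypothetical, not part of the Space; it assumes the file above is saved as app.py next to the female.wav reference clip and that a CUDA GPU is available). Each yielded value is a self-contained WAV payload (header plus samples), which is what gr.Audio(streaming=True) consumes, so the chunks are written to separate files rather than concatenated.

import app  # hypothetical local import of the module above; loads the model on import

for i, wav_bytes in enumerate(app.stream_audio("Hello, streaming world.")):
    # each yielded chunk is a complete little WAV file, so save them individually
    with open(f"chunk_{i:03d}.wav", "wb") as f:
        f.write(wav_bytes)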