reach-vb (HF staff) committed
Commit: d6c1b5c
Parent: e266054

Create app.py

Files changed (1)
  1. app.py +56 -0
app.py ADDED
@@ -0,0 +1,56 @@
+ import io
+ import os
+ import torch
+ import torchaudio
+ from TTS.api import TTS
+ from TTS.tts.configs.xtts_config import XttsConfig
+ from TTS.tts.models.xtts import Xtts
+ from TTS.utils.generic_utils import get_user_data_dir
+
+ import gradio as gr
+ from scipy.io.wavfile import write
+
+ os.environ["COQUI_TOS_AGREED"] = "1"  # accept the Coqui model license non-interactively
+ tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")  # downloads the XTTS v1 weights on first run
+
+ model_path = os.path.join(get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1")
+ config = XttsConfig()
+ config.load_json(os.path.join(model_path, "config.json"))
+ model = Xtts.init_from_config(config)
+ model.load_checkpoint(
+     config,
+     checkpoint_path=os.path.join(model_path, "model.pth"),
+     vocab_path=os.path.join(model_path, "vocab.json"),
+     eval=True,
+     use_deepspeed=True  # requires the deepspeed package
+ )
+ model.cuda()  # a CUDA-capable GPU is required
+
+ def stream_audio(synthesis_text):  # yields WAV-encoded chunks as they are generated
+     gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(audio_path="female.wav")
+     wav_chunks = []
+
+     chunks = model.inference_stream(
+         synthesis_text,
+         "en",
+         gpt_cond_latent,
+         speaker_embedding,
+         stream_chunk_size=10,  # smaller chunks start playback sooner
+         overlap_wav_len=512)
+
+     for i, chunk in enumerate(chunks):
+         print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+         bytes_wav = bytes()
+         byte_io = io.BytesIO(bytes_wav)
+         write(byte_io, 24000, chunk.detach().cpu().numpy().squeeze())  # XTTS outputs 24 kHz audio
+         result_bytes = byte_io.getvalue()  # read() here would return b"", since write() leaves the cursor at EOF
+         yield result_bytes
+
+ demo = gr.Interface(
+     fn=stream_audio,
+     inputs=gr.Textbox(),
+     outputs=gr.Audio(autoplay=True, streaming=True),  # streaming=True plays chunks as they arrive
+ )
+
+ if __name__ == "__main__":
+     demo.queue().launch(debug=True)
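A note on the in-memory WAV encoding inside stream_audio: scipy.io.wavfile.write leaves the BytesIO cursor at the end of the buffer, so a plain byte_io.read() straight after the write returns an empty byte string; the bytes must be fetched with getvalue() (as above) or after an explicit seek(0). Below is a minimal, self-contained sketch of the same pattern at the 24 kHz rate used in app.py; the helper name encode_wav_chunk is ours, for illustration only.

import io

import numpy as np
from scipy.io.wavfile import write

def encode_wav_chunk(chunk, sample_rate=24000):
    # Serialize a mono float32 array as a complete in-memory WAV file.
    byte_io = io.BytesIO()
    write(byte_io, sample_rate, chunk)
    # getvalue() returns the whole buffer regardless of the cursor
    # position; read() here would return b"" without a prior seek(0).
    return byte_io.getvalue()

# One second of silence at the XTTS output rate.
wav_bytes = encode_wav_chunk(np.zeros(24000, dtype=np.float32))
assert wav_bytes[:4] == b"RIFF"  # a valid WAV container starts with "RIFF"

Since the reference clip female.wav never changes, the conditioning latents computed at the top of stream_audio could also be hoisted to module level and computed once at startup, trading a small amount of startup time for lower per-request latency.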