import io
import os

import gradio as gr
from scipy.io.wavfile import write
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir

# Accept the Coqui model license non-interactively.
os.environ["COQUI_TOS_AGREED"] = "1"

# Instantiating TTS downloads the XTTS v1 checkpoint into the user data dir;
# the object itself is not needed afterwards.
TTS("tts_models/multilingual/multi-dataset/xtts_v1")

# Load the downloaded checkpoint directly so the streaming API is available.
model_path = os.path.join(
    get_user_data_dir("tts"), "tts_models--multilingual--multi-dataset--xtts_v1"
)
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=True,  # requires `pip install deepspeed`; set False to run without it
)
model.cuda()


def stream_audio(synthesis_text):
    # Compute speaker conditioning from a reference clip; "female.wav" must
    # exist in the working directory.
    gpt_cond_latent, _, speaker_embedding = model.get_conditioning_latents(
        audio_path="female.wav"
    )
    chunks = model.inference_stream(
        synthesis_text,
        "en",
        gpt_cond_latent,
        speaker_embedding,
        stream_chunk_size=10,
        overlap_wav_len=512,
    )
    for i, chunk in enumerate(chunks):
        print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
        # Serialize each chunk as a standalone WAV blob at XTTS's 24 kHz
        # output rate. Use getvalue() to grab the whole buffer: a plain
        # read() after write() would return empty bytes, since the stream
        # position is left at the end of the buffer.
        byte_io = io.BytesIO()
        write(byte_io, 24000, chunk.detach().cpu().numpy().squeeze())
        yield byte_io.getvalue()


demo = gr.Interface(
    fn=stream_audio,
    inputs=gr.Textbox(),
    outputs=gr.Audio(autoplay=True, streaming=True),
)

if __name__ == "__main__":
    # queue() is required for generator (streaming) outputs in Gradio.
    demo.queue().launch(debug=True)