import os
import logging
from threading import Lock

import gradio as gr
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files
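
# note: this app targets the Gradio 3.x API (gr.Dropdown.update, gr.Audio.update,
# queue(concurrency_count=...)); Gradio 4+ changed these calls, so the Gradio
# version available at runtime is an assumption here.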

# lock that prevents access to the tts object from more than one thread at a time
locker = Lock()
# global tts module, initialized from the selected model
tts = None
# path to the model that is currently loaded into tts
cur_model_path = None
# cache of speakers: maps model name to its speaker list
model_to_speakers = dict()

model_repo_dir = "/data"  # local directory where the model addons are stored
# download every file of the balacoon/tts repo that is not present locally yet
for name in list_repo_files(repo_id="balacoon/tts"):
    if not os.path.isfile(os.path.join(model_repo_dir, name)):
        hf_hub_download(
            repo_id="balacoon/tts",
            filename=name,
            local_dir=model_repo_dir,
        )
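
# The loop above is roughly equivalent to a single `snapshot_download` call from
# huggingface_hub (a sketch, not used here, in case one prefers to mirror the
# whole repo in one go):
#
#   from huggingface_hub import snapshot_download
#   snapshot_download(repo_id="balacoon/tts", local_dir=model_repo_dir)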


def main():
    logging.basicConfig(level=logging.INFO)
    with gr.Blocks() as demo:
        gr.Markdown(
            """
            <h1 align="center">Balacoon🦝 Text-to-Speech</h1>

            1. Write an utterance to generate,
            2. Select the model to synthesize with,
            3. Select the speaker,
            4. Hit "Generate" and listen to the result!

            You can learn more about the available models
            [here](https://huggingface.co/balacoon/tts).
            Visit the [Balacoon website](https://balacoon.com/) for more info.
            """
        )
        with gr.Row(variant="panel"):
            text = gr.Textbox(label="Text", placeholder="Type something here...")

        with gr.Row():
            with gr.Column(variant="panel"):
                # offer only the CPU addons found in the model directory
                repo_files = os.listdir(model_repo_dir)
                model_files = [x for x in repo_files if x.endswith("_cpu.addon")]
                model_name = gr.Dropdown(
                    label="Model",
                    choices=model_files,
                )
            with gr.Column(variant="panel"):
                speaker = gr.Dropdown(label="Speaker", choices=[])

                def set_model(model_name_str: str):
                    """
                    Gets the value from the `model_name` dropdown. Either uses
                    the cached list of speakers for the given model name or
                    loads the addon and queries which speakers it provides.
                    """
                    global model_to_speakers
                    if model_name_str in model_to_speakers:
                        speakers = model_to_speakers[model_name_str]
                    else:
                        global tts, cur_model_path, locker
                        with locker:
                            # the model has to be loaded to learn its list of speakers
                            model_path = os.path.join(model_repo_dir, model_name_str)
                            if tts is not None:
                                del tts
                            tts = TTS(model_path)
                            cur_model_path = model_path
                            speakers = tts.get_speakers()
                            model_to_speakers[model_name_str] = speakers
                    value = speakers[-1]
                    return gr.Dropdown.update(
                        choices=speakers, value=value, visible=True
                    )

                model_name.change(set_model, inputs=model_name, outputs=speaker)

        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")
        with gr.Row(variant="panel"):
            audio = gr.Audio()

        def synthesize_audio(text_str: str, model_name_str: str, speaker_str: str):
            """
            Gets the utterance to synthesize from the `text` Textbox and the
            speaker name from the `speaker` dropdown. The speaker name might be
            empty for single-speaker models. Synthesizes the waveform and
            updates `audio` with it.
            """
            if not text_str or not model_name_str or not speaker_str:
                logging.info("text, model name or speaker is not provided")
                return None
            expected_model_path = os.path.join(model_repo_dir, model_name_str)
            global tts, cur_model_path, locker
            with locker:
                if expected_model_path != cur_model_path:
                    # the user switched models: reload tts with the expected one
                    if tts is not None:
                        del tts
                    tts = TTS(expected_model_path)
                    cur_model_path = expected_model_path
                if len(text_str) > 1024:
                    # truncate overly long input to keep synthesis time bounded
                    text_str = text_str[:1024]
                samples = tts.synthesize(text_str, speaker_str)
            return gr.Audio.update(value=(tts.get_sampling_rate(), samples))

        generate.click(synthesize_audio, inputs=[text, model_name, speaker], outputs=audio)

    # a single-worker queue, together with `locker`, serializes access to the shared tts object
    demo.queue(concurrency_count=1).launch()
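

# A minimal sketch of driving the same balacoon_tts calls outside of Gradio,
# e.g. for batch synthesis. `synthesize_to_file` is a hypothetical helper that is
# not wired into the app; it assumes `TTS.synthesize()` returns 16-bit PCM samples
# as a numpy array, which is what the (sampling_rate, samples) tuple handed to
# `gr.Audio` above implies.
def synthesize_to_file(model_path: str, text_str: str, out_path: str):
    import wave

    tts_local = TTS(model_path)
    # pick the last speaker, mirroring the default chosen in `set_model`
    speaker_name = tts_local.get_speakers()[-1]
    samples = tts_local.synthesize(text_str, speaker_name)
    with wave.open(out_path, "wb") as fp:
        fp.setnchannels(1)  # mono, as suggested by the 1-D samples used above
        fp.setsampwidth(2)  # 16-bit samples (assumption, see note above)
        fp.setframerate(tts_local.get_sampling_rate())
        fp.writeframes(samples.tobytes())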


if __name__ == "__main__":
    main()