Spaces:
Sleeping
Sleeping
import os | |
import subprocess | |
from os.path import exists, join, expanduser | |
from huggingface_hub import snapshot_download | |
# Bootstrap: fetch the upstream pytorch-dc-tts sources and flatten them into
# the working directory, presumably so that the `hparams`, `audio`, `models`
# and `datasets` modules imported later in this file resolve as top-level
# modules (verify against the upstream repository layout).
# NOTE: the notebook this script was ported from also ran
# `pip install -q -r requirements.txt` inside the clone; that step is not
# performed here, so the dependencies must already be installed.
import os
import shutil

project_name = "pytorch-dc-tts"

if not os.path.exists(project_name):
    # check=True turns a failed clone into an immediate CalledProcessError
    # instead of a confusing FileNotFoundError from os.listdir() below.
    subprocess.run(
        ["git", "clone", "--quiet", f"https://github.com/tugstugi/{project_name}"],
        check=True,
    )

# Move every entry of the clone into the current (main) directory.
# Files and folders are handled by the same shutil.move() call.
for item in os.listdir(project_name):
    source = os.path.join(project_name, item)
    destination = os.path.join(".", item)
    if os.path.isdir(destination):
        # shutil.move() would place the source *inside* an existing directory
        # (e.g. ./datasets/datasets) or raise if that nested path exists.
        # Keep what is already there so re-running the script is harmless.
        continue
    shutil.move(source, destination)

# (Optional) Remove the original project directory if it's empty.
if not os.listdir(project_name):
    os.rmdir(project_name)

# Download the model repository from the Hugging Face Hub into the current
# directory (presumably the source of the *.pth checkpoints loaded later).
snapshot_download(repo_id="Dorjzodovsuren/Mongolian_TTS", local_dir="./")
import sys | |
import uuid | |
import torch | |
import gradio as gr | |
#sys.path.append(project_name) | |
import warnings | |
warnings.filterwarnings("ignore") # ignore warnings in this notebook | |
import numpy as np | |
from tqdm import * | |
from hparams import HParams as hp | |
from audio import save_to_wav | |
from models import Text2Mel, SSRN | |
from datasets.mb_speech import vocab, idx2char, get_test_data, number2word | |
# Upper bound on the number of decoding iterations (`range(hp.max_T)`) that
# text_to_speech() runs before giving up on finding the end of the sentence.
hp.max_T = 300

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
# In this file the value is only consumed as `map_location` in load_checkpoint().
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
def load_checkpoint(checkpoint_file_name, model, optimizer):
    """Restore ``model`` (and optionally ``optimizer``) from a checkpoint file.

    Args:
        checkpoint_file_name: Path handed to ``torch.load``.
        model: Module that receives the checkpoint's ``'state_dict'`` entry;
            it is converted with ``model.float()`` afterwards.
        optimizer: Optimizer that receives the ``'optimizer'`` entry, or
            ``None`` to skip optimizer restoration.

    Returns:
        A ``(start_epoch, global_step)`` tuple; each value falls back to 0
        when the checkpoint does not store it.
    """
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source.
    state = torch.load(checkpoint_file_name, map_location=torch.device(device))

    model.load_state_dict(state['state_dict'])
    model.float()
    if optimizer is not None:
        optimizer.load_state_dict(state['optimizer'])

    progress = (state.get('epoch', 0), state.get('global_step', 0))
    # Drop the reference to the loaded checkpoint before returning.
    del state

    print("loaded checkpoint epoch=%d step=%d" % progress)
    return progress
# Build both networks in evaluation mode and restore their weights from the
# checkpoint files expected in the working directory.

# Text2Mel: called by text_to_speech() as text2mel(L, Y, ...) once per step.
text2mel = Text2Mel(vocab).eval()
last_checkpoint_file_name = "text2mel.pth"
if last_checkpoint_file_name:
    print("loading text2mel checkpoint '%s'..." % last_checkpoint_file_name)
    load_checkpoint(last_checkpoint_file_name, text2mel, None)

# SSRN: called by text_to_speech() on the final Text2Mel output.
ssrn = SSRN().eval()
last_checkpoint_file_name = "ssrn.pth"
if last_checkpoint_file_name:
    # The message previously said "text2mel" here (copy-paste slip), which
    # made the startup log misleading.
    print("loading ssrn checkpoint '%s'..." % last_checkpoint_file_name)
    load_checkpoint(last_checkpoint_file_name, ssrn, None)
def generate_random_id():
    """Create a fresh random identifier.

    Returns:
        The string form of a newly generated version-4 UUID.
    """
    new_id = uuid.uuid4()
    return str(new_id)
# Synthesize sentences one by one because there is a batch processing bug!
def text_to_speech(sentence_raw):
    """Synthesize speech for ``sentence_raw`` and return the WAV file path.

    Whitespace-separated tokens made up only of digits are passed through
    ``number2word``; afterwards every character whose lowercase form is not
    in ``vocab`` is dropped. The cleaned sentence is printed, fed step by
    step through ``text2mel`` (at most ``hp.max_T`` steps, stopping early
    once the attention peak lands on the EOS symbol) and the result is passed
    through ``ssrn`` and written with ``save_to_wav``.

    Args:
        sentence_raw: The text to synthesize.

    Returns:
        The relative path ``"<uuid>.wav"`` of the file written to the
        current working directory.
    """
    sentence = ' '.join(number2word(s) if s.isdigit() else s for s in sentence_raw.split())
    normalized_sentence = "".join(c for c in sentence if c.lower() in vocab)
    print(normalized_sentence)

    sentences = [normalized_sentence]
    max_N = len(normalized_sentence)
    # The EOS symbol's vocabulary index never changes, so look it up once.
    eos_index = vocab.index('E')

    # Pure inference: without no_grad() every decoding step would record
    # autograd state that is never used, growing memory with each iteration.
    with torch.no_grad():
        L = torch.from_numpy(get_test_data(sentences, max_N))
        zeros = torch.from_numpy(np.zeros((1, hp.n_mels, 1), np.float32))
        Y = zeros

        for _step in range(hp.max_T):
            _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
            # Feed the frames produced so far back in, prefixed by the zero frame.
            Y = torch.cat((zeros, Y_t), -1)
            # Input position with the highest attention weight at the newest step.
            _, attention = torch.max(A[0, :, -1], 0)
            attention = attention.item()
            if L[0, attention] == eos_index:  # EOS
                break

        _, Z = ssrn(Y)

    Z = Z.cpu().detach().numpy()

    # A random file name keeps concurrent requests from overwriting each other.
    random_id = generate_random_id()
    save_to_wav(Z[0, :, :].T, f'{random_id}.wav')
    return f"{random_id}.wav"
# Gradio interface: a single text box in, synthesized audio out.
demo_description = "Энэхүү Монгол хэлний текстээс яриа үүсгэх системийг Hugging Face дээр ашиглах боломжтой боллоо. Монгол хэлээр бичсэн текстийг оруулснаар яриа хэлбэрээр буцаан авах боломжтой бөгөөд үүнийг та үнэ төлбөргүйгээр хүссэн үедээ ашиглах боломжтой . Одоогоор зөвхөн CPU дээр ажиллаж байгаа тул боловсруулалт удаан байж магадгүй. Enjoy ;D"

# Sample sentences offered in the UI; each inner list is one set of inputs.
demo_examples = [
    ["Сайн байна уу, найзууд аа."],
    ["Монгол улсын нийслэл Улаанбаатараас ярьж байна."],
    ["Чихинү чимэг болсон аялгуу сайхан монгол хэл, Чин зоригт өвгөд дээдсийн минь өв их эрдэнэ."],
]

iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Mongolian Text to Speech",
    description=demo_description,
    examples=demo_examples,
    cache_examples=True,
)

# Enable request queuing, then start serving the app.
queued_iface = iface.queue()
queued_iface.launch()