Spaces:

Dorjzodovsuren
/

Mongolian_TTS

Sleeping

App Files Files Community

Mongolian_TTS / app.py

Dorjzodovsuren

Update app.py

140fa82 verified 2 months ago

raw

history blame contribute delete

5.08 kB

	import os
	import subprocess
	from os.path import exists, join, expanduser
	from huggingface_hub import snapshot_download

	# project_name = "pytorch-dc-tts"
	# if not exists(project_name):
	# ! git clone --quiet https://github.com/tugstugi/{project_name}
	# ! cd {project_name} && pip install -q -r requirements.txt

	# Clone the pytorch-dc-tts sources on first start. The directory check makes
	# the clone a no-op when a checkout is already present.
	project_name = "pytorch-dc-tts"
	if not os.path.exists(project_name):
	    # check=True turns a failed clone into CalledProcessError right here,
	    # instead of a less obvious FileNotFoundError when the directory is read later.
	    subprocess.run(
	        ["git", "clone", "--quiet", f"https://github.com/tugstugi/{project_name}"],
	        check=True,
	    )

	import os
	import shutil

	project_name = "pytorch-dc-tts"

	# Relocate every entry of the checkout into the current working directory.
	for item in os.listdir(project_name):
	    source = os.path.join(project_name, item)
	    destination = os.path.join(".", item)
	    # Only regular files and directories are moved; anything else stays put.
	    if not (os.path.isfile(source) or os.path.isdir(source)):
	        continue
	    shutil.move(source, destination)

	# Remove the checkout directory once nothing is left inside it.
	if len(os.listdir(project_name)) == 0:
	    os.rmdir(project_name)

	# Download the contents of the Hugging Face repository into the working directory.
	snapshot_download(
	    repo_id="Dorjzodovsuren/Mongolian_TTS",
	    local_dir="./",
	)

	import sys
	import uuid
	import torch
	import gradio as gr
	#sys.path.append(project_name)

	import warnings
	warnings.filterwarnings("ignore") # silence all Python warnings emitted while the app runs

	import numpy as np
	from tqdm import *

	from hparams import HParams as hp
	from audio import save_to_wav
	from models import Text2Mel, SSRN
	from datasets.mb_speech import vocab, idx2char, get_test_data, number2word

	# Upper bound on the number of decoding steps (see the loop over hp.max_T).
	hp.max_T = 300

	# Pick the GPU when PyTorch reports CUDA as available, otherwise the CPU.
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")


	def load_checkpoint(checkpoint_file_name, model, optimizer):
	    """Restore model (and optionally optimizer) state from a checkpoint file.

	    Args:
	        checkpoint_file_name: Path handed to ``torch.load``.
	        model: Module that receives the checkpoint's ``'state_dict'`` entry;
	            ``model.float()`` is called on it afterwards.
	        optimizer: Object that receives the ``'optimizer'`` entry, or ``None``
	            to skip optimizer restoration.

	    Returns:
	        Tuple ``(start_epoch, global_step)`` read from the checkpoint; each
	        value falls back to 0 when its key is missing.
	    """
	    state = torch.load(checkpoint_file_name, map_location=torch.device(device))
	    model.load_state_dict(state['state_dict'])
	    model.float()
	    if optimizer is not None:
	        optimizer.load_state_dict(state['optimizer'])
	    progress = (state.get('epoch', 0), state.get('global_step', 0))
	    # Drop the reference to the loaded checkpoint before returning.
	    del state
	    print("loaded checkpoint epoch=%d step=%d" % progress)
	    return progress


	# Build the Text2Mel model in eval mode and restore its weights from disk.
	last_checkpoint_file_name = "text2mel.pth"
	text2mel = Text2Mel(vocab).eval()
	if last_checkpoint_file_name:
	    print("loading text2mel checkpoint '%s'..." % (last_checkpoint_file_name,))
	    load_checkpoint(last_checkpoint_file_name, text2mel, optimizer=None)


	# Build the SSRN model in eval mode and restore its weights from disk.
	ssrn = SSRN().eval()
	last_checkpoint_file_name = "ssrn.pth"
	if last_checkpoint_file_name:
	    # The log line names the model that is actually being loaded here.
	    print("loading ssrn checkpoint '%s'..." % last_checkpoint_file_name)
	    load_checkpoint(last_checkpoint_file_name, ssrn, None)

	def generate_random_id():
	    """Return a freshly generated version-4 UUID in its string form."""
	    fresh_uuid = uuid.uuid4()
	    return str(fresh_uuid)

	# Sentences are synthesized one at a time because batch processing is buggy.
	def text_to_speech(sentence_raw):
	    """Synthesize speech for ``sentence_raw`` and return the WAV file name.

	    Whitespace-separated tokens consisting only of digits are expanded with
	    ``number2word``; characters whose lowercase form is not in ``vocab`` are
	    dropped. The result is decoded step by step with ``text2mel`` for at most
	    ``hp.max_T`` steps, stopping early when the text position with the largest
	    value in ``A[0, :, -1]`` holds the ``'E'`` (EOS) symbol. The decoder output
	    is then passed through ``ssrn`` and handed to ``save_to_wav``.

	    Args:
	        sentence_raw: Input text as a string.

	    Returns:
	        The file name ``"<uuid>.wav"`` that was passed to ``save_to_wav``.
	    """
	    sentence = ' '.join(number2word(s) if s.isdigit() else s for s in sentence_raw.split())
	    normalized_sentence = "".join(c for c in sentence if c.lower() in vocab)
	    print(normalized_sentence)

	    sentences = [normalized_sentence]
	    max_N = len(normalized_sentence)
	    L = torch.from_numpy(get_test_data(sentences, max_N))
	    zeros = torch.from_numpy(np.zeros((1, hp.n_mels, 1), np.float32))
	    Y = zeros
	    # The EOS index never changes, so look it up once outside the loop.
	    eos_index = vocab.index('E')

	    # Pure inference: without no_grad every decoding step would extend an
	    # autograd graph that is never used, costing memory and time.
	    with torch.no_grad():
	        for _step in range(hp.max_T):
	            _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
	            # Prepend the initial zero frame to the frames produced so far.
	            Y = torch.cat((zeros, Y_t), -1)
	            _, attention = torch.max(A[0, :, -1], 0)
	            attention = attention.item()
	            if L[0, attention] == eos_index:  # EOS
	                break

	        _, Z = ssrn(Y)

	    Z = Z.cpu().detach().numpy()
	    random_id = generate_random_id()
	    # A random file name keeps separate requests from writing to the same file.
	    save_to_wav(Z[0, :, :].T, f'{random_id}.wav')

	    return f"{random_id}.wav"

	# Gradio UI: text input, audio output, backed by text_to_speech.
	iface = gr.Interface(
	    title="Mongolian Text to Speech",
	    description="Энэхүү Монгол хэлний текстээс яриа үүсгэх системийг Hugging Face дээр ашиглах боломжтой боллоо. Монгол хэлээр бичсэн текстийг оруулснаар яриа хэлбэрээр буцаан авах боломжтой бөгөөд үүнийг та үнэ төлбөргүйгээр хүссэн үедээ ашиглах боломжтой . Одоогоор зөвхөн CPU дээр ажиллаж байгаа тул боловсруулалт удаан байж магадгүй. Enjoy ;D",
	    fn=text_to_speech,
	    inputs="text",
	    outputs="audio",
	    examples=[
	        ["Сайн байна уу, найзууд аа."],
	        ["Монгол улсын нийслэл Улаанбаатараас ярьж байна."],
	        ["Чихинү чимэг болсон аялгуу сайхан монгол хэл, Чин зоригт өвгөд дээдсийн минь өв их эрдэнэ."],
	    ],
	    cache_examples=True,
	)

	# Turn on request queuing, then start serving the interface.
	iface.queue().launch()