import os
import shutil
import subprocess

from huggingface_hub import snapshot_download

# Clone the DC-TTS reference implementation if it is not already present.
# (In the original notebook this was a shell cell:
#   !git clone --quiet https://github.com/tugstugi/pytorch-dc-tts
#   !cd pytorch-dc-tts && pip install -q -r requirements.txt)
project_name = "pytorch-dc-tts"
if not os.path.exists(project_name):
    subprocess.run(
        ["git", "clone", "--quiet", f"https://github.com/tugstugi/{project_name}"],
        check=True,
    )

# Flatten the cloned repo: move everything from the project directory into the
# current (main) directory so its modules can be imported directly.
for item in os.listdir(project_name):
    source = os.path.join(project_name, item)
    destination = os.path.join(".", item)
    shutil.move(source, destination)  # works for both files and directories

# Remove the original project directory once it is empty.
if not os.listdir(project_name):
    os.rmdir(project_name)

# Download the pretrained checkpoints (expected to include text2mel.pth and
# ssrn.pth, which are loaded below) from the Hugging Face Hub into the current directory.
snapshot_download(repo_id="Dorjzodovsuren/Mongolian_TTS", local_dir="./")

import uuid
import warnings

import numpy as np
import torch
import gradio as gr

warnings.filterwarnings("ignore")  # silence library warnings in the app output

# Local modules from the cloned pytorch-dc-tts repo (now in the working directory).
from hparams import HParams as hp
from audio import save_to_wav
from models import Text2Mel, SSRN
from datasets.mb_speech import vocab, idx2char, get_test_data, number2word

hp.max_T = 300

# Check for CUDA availability and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def load_checkpoint(checkpoint_file_name, model, optimizer):
    """Loads a checkpoint (keys: 'state_dict', 'optimizer', 'epoch', 'global_step')
    into the given model and, optionally, optimizer."""
    checkpoint = torch.load(checkpoint_file_name, map_location=device)
    model.load_state_dict(checkpoint['state_dict'])
    model.float()
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer'])
    start_epoch = checkpoint.get('epoch', 0)
    global_step = checkpoint.get('global_step', 0)
    del checkpoint
    print("loaded checkpoint epoch=%d step=%d" % (start_epoch, global_step))
    return start_epoch, global_step
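
# A hedged usage sketch (not executed here): the same helper can also resume
# training by passing an optimizer; inference below only needs the model weights.
# The names in this snippet are illustrative only:
#
#   model = Text2Mel(vocab)
#   optimizer = torch.optim.Adam(model.parameters())
#   start_epoch, global_step = load_checkpoint("text2mel.pth", model, optimizer)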


# Load the Text2Mel network (text -> coarse mel spectrogram).
text2mel = Text2Mel(vocab).eval()
last_checkpoint_file_name = "text2mel.pth"
if last_checkpoint_file_name:
    print("loading text2mel checkpoint '%s'..." % last_checkpoint_file_name)
    load_checkpoint(last_checkpoint_file_name, text2mel, None)


# Load the SSRN network (mel spectrogram -> full linear spectrogram).
ssrn = SSRN().eval()
last_checkpoint_file_name = "ssrn.pth"
if last_checkpoint_file_name:
    print("loading ssrn checkpoint '%s'..." % last_checkpoint_file_name)
    load_checkpoint(last_checkpoint_file_name, ssrn, None)

def generate_random_id():
    """Generates a random UUID string, used to name the output WAV file."""
    return str(uuid.uuid4())

# Synthesize sentences one at a time; batch processing has a known bug.
def text_to_speech(sentence_raw):
    # Spell out digit-only tokens as words and drop characters that are not in
    # the model's vocabulary.
    sentence = ' '.join([number2word(s) if s.isdigit() else s for s in sentence_raw.split()])
    normalized_sentence = "".join([c if c.lower() in vocab else '' for c in sentence])
    print(normalized_sentence)

    sentences = [normalized_sentence]
    max_N = len(normalized_sentence)
    L = torch.from_numpy(get_test_data(sentences, max_N))
    zeros = torch.from_numpy(np.zeros((1, hp.n_mels, 1), np.float32))
    Y = zeros
    A = None

    # Autoregressively generate mel frames until attention reaches the EOS symbol.
    for t in range(hp.max_T):
        _, Y_t, A = text2mel(L, Y, monotonic_attention=True)
        Y = torch.cat((zeros, Y_t), -1)
        _, attention = torch.max(A[0, :, -1], 0)
        attention = attention.item()
        if L[0, attention] == vocab.index('E'):  # EOS
            break

    # SSRN upsamples the mel spectrogram to a full linear spectrogram.
    _, Z = ssrn(Y)

    Z = Z.cpu().detach().numpy()
    random_id = generate_random_id()
    save_to_wav(Z[0, :, :].T, f'{random_id}.wav')

    return f"{random_id}.wav"
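
# A minimal usage sketch (kept commented out): text_to_speech() can be
# smoke-tested directly, without the Gradio UI. The sample sentence is only
# illustrative.
#
#   wav_path = text_to_speech("Сайн байна уу, найзууд аа.")
#   print(wav_path)  # e.g. "<random-uuid>.wav" written to the current directory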

# Gradio interface. The description and example sentences below are in Mongolian,
# the app's target language; the description notes that the demo currently runs
# on CPU, so synthesis may be slow.
iface = gr.Interface(
    fn=text_to_speech,
    inputs="text",
    outputs="audio",
    title="Mongolian Text to Speech",
    description="Энэхүү Монгол хэлний текстээс яриа үүсгэх системийг Hugging Face дээр ашиглах боломжтой боллоо. Монгол хэлээр бичсэн текстийг оруулснаар яриа хэлбэрээр буцаан авах боломжтой бөгөөд үүнийг та үнэ төлбөргүйгээр хүссэн үедээ ашиглах боломжтой . Одоогоор зөвхөн CPU дээр ажиллаж байгаа тул боловсруулалт удаан байж магадгүй. Enjoy ;D",
    examples=[["Сайн байна уу, найзууд аа."], ["Монгол улсын нийслэл Улаанбаатараас ярьж байна."], ["Чихинү чимэг болсон аялгуу сайхан монгол хэл, Чин зоригт өвгөд дээдсийн минь өв их эрдэнэ."]],
    cache_examples=True
)

iface.queue().launch()
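
# A hedged sketch of calling the running app programmatically with gradio_client
# (assumes the package is installed and that `url` points at this app; for a
# single-function Interface, Gradio's default endpoint name is "/predict"):
#
#   from gradio_client import Client
#   client = Client(url)
#   wav_file = client.predict("Сайн байна уу", api_name="/predict")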