JotunnBurton commited on
Commit
20887d2
·
verified ·
1 Parent(s): ae73f54

Upload 21 files

Browse files
text/__init__.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from text.symbols import *
2
+
3
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4
+
5
+
6
+ def cleaned_text_to_sequence(cleaned_text, tones, language):
7
+ """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8
+ Args:
9
+ text: string to convert to a sequence
10
+ Returns:
11
+ List of integers corresponding to the symbols in the text
12
+ """
13
+ phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14
+ tone_start = language_tone_start_map[language]
15
+ tones = [i + tone_start for i in tones]
16
+ lang_id = language_id_map[language]
17
+ lang_ids = [lang_id for i in phones]
18
+ return phones, tones, lang_ids
19
+
20
+
21
+ def get_bert(norm_text, word2ph, language, device, style_text=None, style_weight=0.7):
22
+ #from .chinese_bert import get_bert_feature as zh_bert
23
+ # from .english_bert_mock import get_bert_feature as en_bert
24
+ from .japanese_bert import get_bert_feature as jp_bert
25
+
26
+ lang_bert_func_map = {"JP": jp_bert}
27
+ bert = lang_bert_func_map[language](
28
+ norm_text, word2ph, device, style_text, style_weight
29
+ )
30
+ return bert
31
+
32
+
33
+ def init_openjtalk():
34
+ import platform
35
+
36
+ if platform.platform() == "Linux":
37
+ import pyopenjtalk
38
+
39
+ pyopenjtalk.g2p("こんにちは,世界。")
40
+
41
+
42
+ init_openjtalk()
text/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (1.8 kB). View file
 
text/__pycache__/cleaner.cpython-39.pyc ADDED
Binary file (963 Bytes). View file
 
text/__pycache__/japanese.cpython-39.pyc ADDED
Binary file (14.9 kB). View file
 
text/__pycache__/japanese_bert.cpython-39.pyc ADDED
Binary file (1.79 kB). View file
 
text/__pycache__/japanese_mora_list.cpython-39.pyc ADDED
Binary file (4.35 kB). View file
 
text/__pycache__/symbols.cpython-39.pyc ADDED
Binary file (1.51 kB). View file
 
text/bert_utils.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from huggingface_hub import hf_hub_download
4
+
5
+ from config import config
6
+
7
+
8
+ MIRROR: str = config.mirror
9
+
10
+
11
+ def _check_bert(repo_id, files, local_path):
12
+ for file in files:
13
+ if not Path(local_path).joinpath(file).exists():
14
+ if MIRROR.lower() == "openi":
15
+ import openi
16
+
17
+ openi.model.download_model(
18
+ "Stardust_minus/Bert-VITS2", repo_id.split("/")[-1], "./bert"
19
+ )
20
+ else:
21
+ hf_hub_download(
22
+ repo_id, file, local_dir=local_path, local_dir_use_symlinks=False
23
+ )
text/chinese.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+
4
+ from pypinyin import Style
5
+ from g2pW.pypinyin_G2pW_bv2 import G2PWPinyin
6
+ from text.symbols import punctuation
7
+ from text.tone_sandhi import ToneSandhi
8
+
9
+ try:
10
+ from tn.chinese.normalizer import Normalizer
11
+
12
+ normalizer = Normalizer().normalize
13
+ except ImportError:
14
+ import cn2an
15
+
16
+ print("tn.chinese.normalizer not found, use cn2an normalizer")
17
+ normalizer = lambda x: cn2an.transform(x, "an2cn")
18
+
19
+ current_file_path = os.path.dirname(__file__)
20
+ pinyin_to_symbol_map = {
21
+ line.split("\t")[0]: line.strip().split("\t")[1]
22
+ for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
23
+ }
24
+
25
+ try:
26
+ import jieba_fast.posseg as psg
27
+ except:
28
+ import jieba.posseg as psg
29
+
30
+
31
+ rep_map = {
32
+ ":": ",",
33
+ ";": ",",
34
+ ",": ",",
35
+ "。": ".",
36
+ "!": "!",
37
+ "?": "?",
38
+ "\n": ".",
39
+ "·": ",",
40
+ "、": ",",
41
+ "...": "…",
42
+ "$": ".",
43
+ "“": "'",
44
+ "”": "'",
45
+ '"': "'",
46
+ "‘": "'",
47
+ "’": "'",
48
+ "(": "'",
49
+ ")": "'",
50
+ "(": "'",
51
+ ")": "'",
52
+ "《": "'",
53
+ "》": "'",
54
+ "【": "'",
55
+ "】": "'",
56
+ "[": "'",
57
+ "]": "'",
58
+ "—": "-",
59
+ "~": "-",
60
+ "~": "-",
61
+ "「": "'",
62
+ "」": "'",
63
+ }
64
+
65
+ tone_modifier = ToneSandhi()
66
+
67
+ pinyinPlus = G2PWPinyin(
68
+ model_dir="g2pW/",
69
+ model_source="bert/Erlangshen-MegatronBert-1.3B-Chinese/",
70
+ v_to_u=False,
71
+ neutral_tone_with_five=True,
72
+ )
73
+
74
+
75
+ def replace_punctuation(text):
76
+ text = text.replace("嗯", "恩").replace("呣", "母")
77
+ pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
78
+
79
+ replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
80
+
81
+ replaced_text = re.sub(
82
+ r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
83
+ )
84
+
85
+ return replaced_text
86
+
87
+
88
+ def g2p(text):
89
+ pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
90
+ sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
91
+ phones, tones, word2ph = _g2p(sentences)
92
+ assert sum(word2ph) == len(phones)
93
+ assert len(word2ph) == len(text) # Sometimes it will crash,you can add a try-catch.
94
+ phones = ["_"] + phones + ["_"]
95
+ tones = [0] + tones + [0]
96
+ word2ph = [1] + word2ph + [1]
97
+ return phones, tones, word2ph
98
+
99
+
100
+ def _get_initials_finals(word):
101
+ initials = []
102
+ finals = []
103
+ orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
104
+ orig_finals = lazy_pinyin(
105
+ word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
106
+ )
107
+ for c, v in zip(orig_initials, orig_finals):
108
+ initials.append(c)
109
+ finals.append(v)
110
+ return initials, finals
111
+
112
+
113
+ def _get_initials_finalsV2(word, orig_initials, orig_finals):
114
+ initials = []
115
+ finals = []
116
+ for c, v in zip(orig_initials, orig_finals):
117
+ initials.append(c)
118
+ finals.append(v)
119
+ return initials, finals
120
+
121
+
122
+ def _g2p(segments):
123
+ phones_list = []
124
+ tones_list = []
125
+ word2ph = []
126
+ for seg in segments:
127
+ # Replace all English words in the sentence
128
+
129
+ seg = re.sub("[a-zA-Z]+", "", seg)
130
+
131
+ seg_cut = psg.lcut(seg)
132
+ initials = []
133
+ finals = []
134
+ seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
135
+ allWords = ""
136
+ for word, pos in seg_cut:
137
+ allWords = allWords + word
138
+
139
+ orig_initials = pinyinPlus.lazy_pinyin(
140
+ allWords, neutral_tone_with_five=True, style=Style.INITIALS
141
+ )
142
+ orig_finals = pinyinPlus.lazy_pinyin(
143
+ allWords, neutral_tone_with_five=True, style=Style.FINALS_TONE3
144
+ )
145
+ currentIndex = 0
146
+ for word, pos in seg_cut:
147
+ curr_orig_initials = orig_initials[currentIndex : currentIndex + len(word)]
148
+ curr_orig_finalss = orig_finals[currentIndex : currentIndex + len(word)]
149
+ currentIndex = currentIndex + len(word)
150
+ if pos == "eng":
151
+ continue
152
+ sub_initials, sub_finals = _get_initials_finalsV2(
153
+ word, curr_orig_initials, curr_orig_finalss
154
+ )
155
+ sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
156
+ initials.append(sub_initials)
157
+ finals.append(sub_finals)
158
+
159
+ # assert len(sub_initials) == len(sub_finals) == len(word)
160
+ initials = sum(initials, [])
161
+ finals = sum(finals, [])
162
+ #
163
+ for c, v in zip(initials, finals):
164
+ raw_pinyin = c + v
165
+ # NOTE: post process for pypinyin outputs
166
+ # we discriminate i, ii and iii
167
+ if c == v:
168
+ assert c in punctuation
169
+ phone = [c]
170
+ tone = "0"
171
+ word2ph.append(1)
172
+ else:
173
+ v_without_tone = v[:-1]
174
+ tone = v[-1]
175
+
176
+ pinyin = c + v_without_tone
177
+ assert tone in "12345"
178
+
179
+ if c:
180
+ # 多音节
181
+ v_rep_map = {
182
+ "uei": "ui",
183
+ "iou": "iu",
184
+ "uen": "un",
185
+ }
186
+ if v_without_tone in v_rep_map.keys():
187
+ pinyin = c + v_rep_map[v_without_tone]
188
+ else:
189
+ # 单音节
190
+ pinyin_rep_map = {
191
+ "ing": "ying",
192
+ "i": "yi",
193
+ "in": "yin",
194
+ "u": "wu",
195
+ }
196
+ if pinyin in pinyin_rep_map.keys():
197
+ pinyin = pinyin_rep_map[pinyin]
198
+ else:
199
+ single_rep_map = {
200
+ "v": "yu",
201
+ "e": "e",
202
+ "i": "y",
203
+ "u": "w",
204
+ }
205
+ if pinyin[0] in single_rep_map.keys():
206
+ pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
207
+
208
+ assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
209
+ phone = pinyin_to_symbol_map[pinyin].split(" ")
210
+ word2ph.append(len(phone))
211
+
212
+ phones_list += phone
213
+ tones_list += [int(tone)] * len(phone)
214
+ return phones_list, tones_list, word2ph
215
+
216
+
217
+ def text_normalize(text):
218
+ text = normalizer(text)
219
+ text = replace_punctuation(text)
220
+ return text
221
+
222
+
223
+ def get_bert_feature(text, word2ph):
224
+ from text import chinese_bert
225
+
226
+ return chinese_bert.get_bert_feature(text, word2ph)
227
+
228
+
229
+ if __name__ == "__main__":
230
+ from text.chinese_bert import get_bert_feature
231
+
232
+ text = "欸,这个「勾玉」的形状,是不是和那边门上的凹槽很像?"
233
+ text = text_normalize(text)
234
+ print(text)
235
+ phones, tones, word2ph = g2p(text)
236
+ bert = get_bert_feature(text, word2ph)
237
+
238
+ print(phones, tones, word2ph, bert.shape)
239
+
240
+
241
+ # # 示例用法
242
+ # text = "这是一个示例文本:,你好!这是一个测试...."
243
+ # print(g2p_paddle(text)) # 输出: 这是一个示例文本你好这是一个测试
text/chinese_bert.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import torch
3
+ from config import config
4
+ from transformers import MegatronBertModel, BertTokenizer
5
+
6
+ LOCAL_PATH = "./bert/Erlangshen-MegatronBert-1.3B-Chinese"
7
+ tokenizer = BertTokenizer.from_pretrained(LOCAL_PATH)
8
+
9
+ models = dict()
10
+
11
+
12
+ def get_bert_feature(
13
+ text,
14
+ word2ph,
15
+ device=config.bert_gen_config.device,
16
+ style_text=None,
17
+ style_weight=0.7,
18
+ ):
19
+ if (
20
+ sys.platform == "darwin"
21
+ and torch.backends.mps.is_available()
22
+ and device == "cpu"
23
+ ):
24
+ device = "mps"
25
+ if not device:
26
+ device = "cuda"
27
+ if device not in models.keys():
28
+ if config.webui_config.fp16_run:
29
+ models[device] = MegatronBertModel.from_pretrained(
30
+ LOCAL_PATH, torch_dtype=torch.float16
31
+ ).to(device)
32
+ else:
33
+ models[device] = MegatronBertModel.from_pretrained(LOCAL_PATH).to(device)
34
+ with torch.no_grad():
35
+ inputs = tokenizer(text, return_tensors="pt")
36
+ for i in inputs:
37
+ inputs[i] = inputs[i].to(device)
38
+ res = models[device](**inputs, output_hidden_states=True)
39
+ res = (
40
+ torch.nn.functional.normalize(
41
+ torch.cat(res["hidden_states"][-3:-2], -1)[0], dim=0
42
+ )
43
+ .float()
44
+ .cpu()
45
+ )
46
+ if style_text:
47
+ style_inputs = tokenizer(style_text, return_tensors="pt")
48
+ for i in style_inputs:
49
+ style_inputs[i] = style_inputs[i].to(device)
50
+ style_res = models[device](**style_inputs, output_hidden_states=True)
51
+ style_res = (
52
+ torch.nn.functional.normalize(
53
+ torch.cat(style_res["hidden_states"][-3:-2], -1)[0], dim=0
54
+ )
55
+ .float()
56
+ .cpu()
57
+ )
58
+ style_res_mean = style_res.mean(0)
59
+ assert len(word2ph) == len(text) + 2
60
+ word2phone = word2ph
61
+ phone_level_feature = []
62
+ for i in range(len(word2phone)):
63
+ if style_text:
64
+ repeat_feature = (
65
+ res[i].repeat(word2phone[i], 1) * (1 - style_weight)
66
+ + style_res_mean.repeat(word2phone[i], 1) * style_weight
67
+ )
68
+ else:
69
+ repeat_feature = res[i].repeat(word2phone[i], 1)
70
+ phone_level_feature.append(repeat_feature)
71
+
72
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
73
+
74
+ return phone_level_feature.T
75
+
76
+
77
+ if __name__ == "__main__":
78
+ word_level_feature = torch.rand(38, 2048) # 12个词,每个词2048维特征
79
+ word2phone = [
80
+ 1,
81
+ 2,
82
+ 1,
83
+ 2,
84
+ 2,
85
+ 1,
86
+ 2,
87
+ 2,
88
+ 1,
89
+ 2,
90
+ 2,
91
+ 1,
92
+ 2,
93
+ 2,
94
+ 2,
95
+ 2,
96
+ 2,
97
+ 1,
98
+ 1,
99
+ 2,
100
+ 2,
101
+ 1,
102
+ 2,
103
+ 2,
104
+ 2,
105
+ 2,
106
+ 1,
107
+ 2,
108
+ 2,
109
+ 2,
110
+ 2,
111
+ 2,
112
+ 1,
113
+ 2,
114
+ 2,
115
+ 2,
116
+ 2,
117
+ 1,
118
+ ]
119
+
120
+ # 计算总帧数
121
+ total_frames = sum(word2phone)
122
+ print(word_level_feature.shape)
123
+ print(word2phone)
124
+ phone_level_feature = []
125
+ for i in range(len(word2phone)):
126
+ print(word_level_feature[i].shape)
127
+
128
+ # 对每个词重复word2phone[i]次
129
+ repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
130
+ phone_level_feature.append(repeat_feature)
131
+
132
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
133
+ print(phone_level_feature.shape) # torch.Size([36, 2048])
text/cleaner.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from text import japanese, cleaned_text_to_sequence
2
+
3
+
4
+ language_module_map = {"JP": japanese}
5
+
6
+
7
+ def clean_text(text, language):
8
+ language_module = language_module_map[language]
9
+ norm_text = language_module.text_normalize(text)
10
+ phones, tones, word2ph = language_module.g2p(norm_text)
11
+ return norm_text, phones, tones, word2ph
12
+
13
+
14
+ def clean_text_bert(text, language):
15
+ language_module = language_module_map[language]
16
+ norm_text = language_module.text_normalize(text)
17
+ phones, tones, word2ph = language_module.g2p(norm_text)
18
+ bert = language_module.get_bert_feature(norm_text, word2ph)
19
+ return phones, tones, bert
20
+
21
+
22
+ def text_to_sequence(text, language):
23
+ norm_text, phones, tones, word2ph = clean_text(text, language)
24
+ return cleaned_text_to_sequence(phones, tones, language)
25
+
26
+
27
+ if __name__ == "__main__":
28
+ pass
text/cmudict.rep ADDED
The diff for this file is too large to render. See raw diff
 
text/cmudict_cache.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
3
+ size 6212655
text/english.py ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+ import os
3
+ import re
4
+ from g2p_en import G2p
5
+ from transformers import DebertaV2Tokenizer
6
+
7
+ from text import symbols
8
+ from text.symbols import punctuation
9
+
10
+ current_file_path = os.path.dirname(__file__)
11
+ CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
12
+ CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
13
+ _g2p = G2p()
14
+ LOCAL_PATH = "./bert/deberta-v3-large"
15
+ tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
16
+
17
+ arpa = {
18
+ "AH0",
19
+ "S",
20
+ "AH1",
21
+ "EY2",
22
+ "AE2",
23
+ "EH0",
24
+ "OW2",
25
+ "UH0",
26
+ "NG",
27
+ "B",
28
+ "G",
29
+ "AY0",
30
+ "M",
31
+ "AA0",
32
+ "F",
33
+ "AO0",
34
+ "ER2",
35
+ "UH1",
36
+ "IY1",
37
+ "AH2",
38
+ "DH",
39
+ "IY0",
40
+ "EY1",
41
+ "IH0",
42
+ "K",
43
+ "N",
44
+ "W",
45
+ "IY2",
46
+ "T",
47
+ "AA1",
48
+ "ER1",
49
+ "EH2",
50
+ "OY0",
51
+ "UH2",
52
+ "UW1",
53
+ "Z",
54
+ "AW2",
55
+ "AW1",
56
+ "V",
57
+ "UW2",
58
+ "AA2",
59
+ "ER",
60
+ "AW0",
61
+ "UW0",
62
+ "R",
63
+ "OW1",
64
+ "EH1",
65
+ "ZH",
66
+ "AE0",
67
+ "IH2",
68
+ "IH",
69
+ "Y",
70
+ "JH",
71
+ "P",
72
+ "AY1",
73
+ "EY0",
74
+ "OY2",
75
+ "TH",
76
+ "HH",
77
+ "D",
78
+ "ER0",
79
+ "CH",
80
+ "AO1",
81
+ "AE1",
82
+ "AO2",
83
+ "OY1",
84
+ "AY2",
85
+ "IH1",
86
+ "OW0",
87
+ "L",
88
+ "SH",
89
+ }
90
+
91
+
92
+ def post_replace_ph(ph):
93
+ rep_map = {
94
+ ":": ",",
95
+ ";": ",",
96
+ ",": ",",
97
+ "。": ".",
98
+ "!": "!",
99
+ "?": "?",
100
+ "\n": ".",
101
+ "·": ",",
102
+ "、": ",",
103
+ "…": "...",
104
+ "···": "...",
105
+ "・・・": "...",
106
+ "v": "V",
107
+ }
108
+ if ph in rep_map.keys():
109
+ ph = rep_map[ph]
110
+ if ph in symbols:
111
+ return ph
112
+ if ph not in symbols:
113
+ ph = "UNK"
114
+ return ph
115
+
116
+
117
+ rep_map = {
118
+ ":": ",",
119
+ ";": ",",
120
+ ",": ",",
121
+ "。": ".",
122
+ "!": "!",
123
+ "?": "?",
124
+ "\n": ".",
125
+ ".": ".",
126
+ "…": "...",
127
+ "···": "...",
128
+ "・・・": "...",
129
+ "·": ",",
130
+ "・": ",",
131
+ "、": ",",
132
+ "$": ".",
133
+ "“": "'",
134
+ "”": "'",
135
+ '"': "'",
136
+ "‘": "'",
137
+ "’": "'",
138
+ "(": "'",
139
+ ")": "'",
140
+ "(": "'",
141
+ ")": "'",
142
+ "《": "'",
143
+ "》": "'",
144
+ "【": "'",
145
+ "】": "'",
146
+ "[": "'",
147
+ "]": "'",
148
+ "—": "-",
149
+ "−": "-",
150
+ "~": "-",
151
+ "~": "-",
152
+ "「": "'",
153
+ "」": "'",
154
+ }
155
+
156
+
157
+ def replace_punctuation(text):
158
+ pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
159
+
160
+ replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
161
+
162
+ # replaced_text = re.sub(
163
+ # r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
164
+ # + "".join(punctuation)
165
+ # + r"]+",
166
+ # "",
167
+ # replaced_text,
168
+ # )
169
+
170
+ return replaced_text
171
+
172
+
173
+ def read_dict():
174
+ g2p_dict = {}
175
+ start_line = 49
176
+ with open(CMU_DICT_PATH) as f:
177
+ line = f.readline()
178
+ line_index = 1
179
+ while line:
180
+ if line_index >= start_line:
181
+ line = line.strip()
182
+ word_split = line.split(" ")
183
+ word = word_split[0]
184
+
185
+ syllable_split = word_split[1].split(" - ")
186
+ g2p_dict[word] = []
187
+ for syllable in syllable_split:
188
+ phone_split = syllable.split(" ")
189
+ g2p_dict[word].append(phone_split)
190
+
191
+ line_index = line_index + 1
192
+ line = f.readline()
193
+
194
+ return g2p_dict
195
+
196
+
197
+ def cache_dict(g2p_dict, file_path):
198
+ with open(file_path, "wb") as pickle_file:
199
+ pickle.dump(g2p_dict, pickle_file)
200
+
201
+
202
+ def get_dict():
203
+ if os.path.exists(CACHE_PATH):
204
+ with open(CACHE_PATH, "rb") as pickle_file:
205
+ g2p_dict = pickle.load(pickle_file)
206
+ else:
207
+ g2p_dict = read_dict()
208
+ cache_dict(g2p_dict, CACHE_PATH)
209
+
210
+ return g2p_dict
211
+
212
+
213
+ eng_dict = get_dict()
214
+
215
+
216
+ def refine_ph(phn):
217
+ tone = 0
218
+ if re.search(r"\d$", phn):
219
+ tone = int(phn[-1]) + 1
220
+ phn = phn[:-1]
221
+ else:
222
+ tone = 3
223
+ return phn.lower(), tone
224
+
225
+
226
+ def refine_syllables(syllables):
227
+ tones = []
228
+ phonemes = []
229
+ for phn_list in syllables:
230
+ for i in range(len(phn_list)):
231
+ phn = phn_list[i]
232
+ phn, tone = refine_ph(phn)
233
+ phonemes.append(phn)
234
+ tones.append(tone)
235
+ return phonemes, tones
236
+
237
+
238
+ import re
239
+ import inflect
240
+
241
+ _inflect = inflect.engine()
242
+ _comma_number_re = re.compile(r"([0-9][0-9\,]+[0-9])")
243
+ _decimal_number_re = re.compile(r"([0-9]+\.[0-9]+)")
244
+ _pounds_re = re.compile(r"£([0-9\,]*[0-9]+)")
245
+ _dollars_re = re.compile(r"\$([0-9\.\,]*[0-9]+)")
246
+ _ordinal_re = re.compile(r"[0-9]+(st|nd|rd|th)")
247
+ _number_re = re.compile(r"[0-9]+")
248
+
249
+ # List of (regular expression, replacement) pairs for abbreviations:
250
+ _abbreviations = [
251
+ (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
252
+ for x in [
253
+ ("mrs", "misess"),
254
+ ("mr", "mister"),
255
+ ("dr", "doctor"),
256
+ ("st", "saint"),
257
+ ("co", "company"),
258
+ ("jr", "junior"),
259
+ ("maj", "major"),
260
+ ("gen", "general"),
261
+ ("drs", "doctors"),
262
+ ("rev", "reverend"),
263
+ ("lt", "lieutenant"),
264
+ ("hon", "honorable"),
265
+ ("sgt", "sergeant"),
266
+ ("capt", "captain"),
267
+ ("esq", "esquire"),
268
+ ("ltd", "limited"),
269
+ ("col", "colonel"),
270
+ ("ft", "fort"),
271
+ ]
272
+ ]
273
+
274
+
275
+ # List of (ipa, lazy ipa) pairs:
276
+ _lazy_ipa = [
277
+ (re.compile("%s" % x[0]), x[1])
278
+ for x in [
279
+ ("r", "ɹ"),
280
+ ("æ", "e"),
281
+ ("ɑ", "a"),
282
+ ("ɔ", "o"),
283
+ ("ð", "z"),
284
+ ("θ", "s"),
285
+ ("ɛ", "e"),
286
+ ("ɪ", "i"),
287
+ ("ʊ", "u"),
288
+ ("ʒ", "ʥ"),
289
+ ("ʤ", "ʥ"),
290
+ ("ˈ", "↓"),
291
+ ]
292
+ ]
293
+
294
+ # List of (ipa, lazy ipa2) pairs:
295
+ _lazy_ipa2 = [
296
+ (re.compile("%s" % x[0]), x[1])
297
+ for x in [
298
+ ("r", "ɹ"),
299
+ ("ð", "z"),
300
+ ("θ", "s"),
301
+ ("ʒ", "ʑ"),
302
+ ("ʤ", "dʑ"),
303
+ ("ˈ", "↓"),
304
+ ]
305
+ ]
306
+
307
+ # List of (ipa, ipa2) pairs
308
+ _ipa_to_ipa2 = [
309
+ (re.compile("%s" % x[0]), x[1]) for x in [("r", "ɹ"), ("ʤ", "dʒ"), ("ʧ", "tʃ")]
310
+ ]
311
+
312
+
313
+ def _expand_dollars(m):
314
+ match = m.group(1)
315
+ parts = match.split(".")
316
+ if len(parts) > 2:
317
+ return match + " dollars" # Unexpected format
318
+ dollars = int(parts[0]) if parts[0] else 0
319
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
320
+ if dollars and cents:
321
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
322
+ cent_unit = "cent" if cents == 1 else "cents"
323
+ return "%s %s, %s %s" % (dollars, dollar_unit, cents, cent_unit)
324
+ elif dollars:
325
+ dollar_unit = "dollar" if dollars == 1 else "dollars"
326
+ return "%s %s" % (dollars, dollar_unit)
327
+ elif cents:
328
+ cent_unit = "cent" if cents == 1 else "cents"
329
+ return "%s %s" % (cents, cent_unit)
330
+ else:
331
+ return "zero dollars"
332
+
333
+
334
+ def _remove_commas(m):
335
+ return m.group(1).replace(",", "")
336
+
337
+
338
+ def _expand_ordinal(m):
339
+ return _inflect.number_to_words(m.group(0))
340
+
341
+
342
+ def _expand_number(m):
343
+ num = int(m.group(0))
344
+ if num > 1000 and num < 3000:
345
+ if num == 2000:
346
+ return "two thousand"
347
+ elif num > 2000 and num < 2010:
348
+ return "two thousand " + _inflect.number_to_words(num % 100)
349
+ elif num % 100 == 0:
350
+ return _inflect.number_to_words(num // 100) + " hundred"
351
+ else:
352
+ return _inflect.number_to_words(
353
+ num, andword="", zero="oh", group=2
354
+ ).replace(", ", " ")
355
+ else:
356
+ return _inflect.number_to_words(num, andword="")
357
+
358
+
359
+ def _expand_decimal_point(m):
360
+ return m.group(1).replace(".", " point ")
361
+
362
+
363
+ def normalize_numbers(text):
364
+ text = re.sub(_comma_number_re, _remove_commas, text)
365
+ text = re.sub(_pounds_re, r"\1 pounds", text)
366
+ text = re.sub(_dollars_re, _expand_dollars, text)
367
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
368
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
369
+ text = re.sub(_number_re, _expand_number, text)
370
+ return text
371
+
372
+
373
+ def text_normalize(text):
374
+ text = normalize_numbers(text)
375
+ text = replace_punctuation(text)
376
+ text = re.sub(r"([,;.\?\!])([\w])", r"\1 \2", text)
377
+ return text
378
+
379
+
380
+ def distribute_phone(n_phone, n_word):
381
+ phones_per_word = [0] * n_word
382
+ for task in range(n_phone):
383
+ min_tasks = min(phones_per_word)
384
+ min_index = phones_per_word.index(min_tasks)
385
+ phones_per_word[min_index] += 1
386
+ return phones_per_word
387
+
388
+
389
+ def sep_text(text):
390
+ words = re.split(r"([,;.\?\!\s+])", text)
391
+ words = [word for word in words if word.strip() != ""]
392
+ return words
393
+
394
+
395
+ def text_to_words(text):
396
+ tokens = tokenizer.tokenize(text)
397
+ words = []
398
+ for idx, t in enumerate(tokens):
399
+ if t.startswith("▁"):
400
+ words.append([t[1:]])
401
+ else:
402
+ if t in punctuation:
403
+ if idx == len(tokens) - 1:
404
+ words.append([f"{t}"])
405
+ else:
406
+ if (
407
+ not tokens[idx + 1].startswith("▁")
408
+ and tokens[idx + 1] not in punctuation
409
+ ):
410
+ if idx == 0:
411
+ words.append([])
412
+ words[-1].append(f"{t}")
413
+ else:
414
+ words.append([f"{t}"])
415
+ else:
416
+ if idx == 0:
417
+ words.append([])
418
+ words[-1].append(f"{t}")
419
+ return words
420
+
421
+
422
+ def g2p(text):
423
+ phones = []
424
+ tones = []
425
+ phone_len = []
426
+ # words = sep_text(text)
427
+ # tokens = [tokenizer.tokenize(i) for i in words]
428
+ words = text_to_words(text)
429
+
430
+ for word in words:
431
+ temp_phones, temp_tones = [], []
432
+ if len(word) > 1:
433
+ if "'" in word:
434
+ word = ["".join(word)]
435
+ for w in word:
436
+ if w in punctuation:
437
+ temp_phones.append(w)
438
+ temp_tones.append(0)
439
+ continue
440
+ if w.upper() in eng_dict:
441
+ phns, tns = refine_syllables(eng_dict[w.upper()])
442
+ temp_phones += [post_replace_ph(i) for i in phns]
443
+ temp_tones += tns
444
+ # w2ph.append(len(phns))
445
+ else:
446
+ phone_list = list(filter(lambda p: p != " ", _g2p(w)))
447
+ phns = []
448
+ tns = []
449
+ for ph in phone_list:
450
+ if ph in arpa:
451
+ ph, tn = refine_ph(ph)
452
+ phns.append(ph)
453
+ tns.append(tn)
454
+ else:
455
+ phns.append(ph)
456
+ tns.append(0)
457
+ temp_phones += [post_replace_ph(i) for i in phns]
458
+ temp_tones += tns
459
+ phones += temp_phones
460
+ tones += temp_tones
461
+ phone_len.append(len(temp_phones))
462
+ # phones = [post_replace_ph(i) for i in phones]
463
+
464
+ word2ph = []
465
+ for token, pl in zip(words, phone_len):
466
+ word_len = len(token)
467
+
468
+ aaa = distribute_phone(pl, word_len)
469
+ word2ph += aaa
470
+
471
+ phones = ["_"] + phones + ["_"]
472
+ tones = [0] + tones + [0]
473
+ word2ph = [1] + word2ph + [1]
474
+ assert len(phones) == len(tones), text
475
+ assert len(phones) == sum(word2ph), text
476
+
477
+ return phones, tones, word2ph
478
+
479
+
480
+ def get_bert_feature(text, word2ph):
481
+ from text import english_bert_mock
482
+
483
+ return english_bert_mock.get_bert_feature(text, word2ph)
484
+
485
+
486
+ if __name__ == "__main__":
487
+ # print(get_dict())
488
+ # print(eng_word_to_phoneme("hello"))
489
+ print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
490
+ # all_phones = set()
491
+ # for k, syllables in eng_dict.items():
492
+ # for group in syllables:
493
+ # for ph in group:
494
+ # all_phones.add(ph)
495
+ # print(all_phones)
text/english_bert_mock.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import torch
4
+ from transformers import DebertaV2Model, DebertaV2Tokenizer
5
+
6
+ from config import config
7
+
8
+
9
+ LOCAL_PATH = "./bert/deberta-v3-large"
10
+
11
+ tokenizer = DebertaV2Tokenizer.from_pretrained(LOCAL_PATH)
12
+
13
+ models = dict()
14
+
15
+
16
+ def get_bert_feature(
17
+ text,
18
+ word2ph,
19
+ device=config.bert_gen_config.device,
20
+ style_text=None,
21
+ style_weight=0.7,
22
+ ):
23
+ if (
24
+ sys.platform == "darwin"
25
+ and torch.backends.mps.is_available()
26
+ and device == "cpu"
27
+ ):
28
+ device = "mps"
29
+ if not device:
30
+ device = "cuda"
31
+ if device not in models.keys():
32
+ if config.webui_config.fp16_run:
33
+ models[device] = DebertaV2Model.from_pretrained(
34
+ LOCAL_PATH, torch_dtype=torch.float16
35
+ ).to(device)
36
+ else:
37
+ models[device] = DebertaV2Model.from_pretrained(LOCAL_PATH).to(device)
38
+ with torch.no_grad():
39
+ inputs = tokenizer(text, return_tensors="pt")
40
+ for i in inputs:
41
+ inputs[i] = inputs[i].to(device)
42
+ res = models[device](**inputs, output_hidden_states=True)
43
+ res = torch.cat(res["hidden_states"][-3:-2], -1)[0].float().cpu()
44
+ if style_text:
45
+ style_inputs = tokenizer(style_text, return_tensors="pt")
46
+ for i in style_inputs:
47
+ style_inputs[i] = style_inputs[i].to(device)
48
+ style_res = models[device](**style_inputs, output_hidden_states=True)
49
+ style_res = (
50
+ torch.cat(style_res["hidden_states"][-3:-2], -1)[0].float().cpu()
51
+ )
52
+ style_res_mean = style_res.mean(0)
53
+ assert len(word2ph) == res.shape[0], (text, res.shape[0], len(word2ph))
54
+ word2phone = word2ph
55
+ phone_level_feature = []
56
+ for i in range(len(word2phone)):
57
+ if style_text:
58
+ repeat_feature = (
59
+ res[i].repeat(word2phone[i], 1) * (1 - style_weight)
60
+ + style_res_mean.repeat(word2phone[i], 1) * style_weight
61
+ )
62
+ else:
63
+ repeat_feature = res[i].repeat(word2phone[i], 1)
64
+ phone_level_feature.append(repeat_feature)
65
+
66
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
67
+
68
+ return phone_level_feature.T
text/japanese.py ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Convert Japanese text to phonemes which is
2
+ # compatible with Julius https://github.com/julius-speech/segmentation-kit
3
+ import re
4
+ import unicodedata
5
+
6
+ import pyopenjtalk
7
+ from num2words import num2words
8
+ from transformers import AutoTokenizer
9
+
10
+ from text import punctuation
11
+ from text.japanese_mora_list import (
12
+ mora_kata_to_mora_phonemes,
13
+ )
14
+
15
+ # 子音の集合
16
+ COSONANTS = set(
17
+ [
18
+ cosonant
19
+ for cosonant, _ in mora_kata_to_mora_phonemes.values()
20
+ if cosonant is not None
21
+ ]
22
+ )
23
+
24
+ # 母音の集合
25
+ VOWELS = {"a", "i", "u", "e", "o"}
26
+
27
+
28
+ # 正規化で記号を変換するための辞書
29
+ rep_map = {
30
+ ":": ",",
31
+ ";": ",",
32
+ ",": ",",
33
+ "。": ".",
34
+ "!": "!",
35
+ "?": "?",
36
+ "\n": ".",
37
+ ".": ".",
38
+ "…": "...",
39
+ "···": "...",
40
+ "・・・": "...",
41
+ "·": ",",
42
+ "・": ",",
43
+ "、": ",",
44
+ "$": ".",
45
+ "“": "'",
46
+ "”": "'",
47
+ '"': "'",
48
+ "‘": "'",
49
+ "’": "'",
50
+ "(": "'",
51
+ ")": "'",
52
+ "(": "'",
53
+ ")": "'",
54
+ "《": "'",
55
+ "》": "'",
56
+ "【": "'",
57
+ "】": "'",
58
+ "[": "'",
59
+ "]": "'",
60
+ "—": "-",
61
+ "−": "-",
62
+ # "~": "-", # これは長音記号「ー」として扱うよう変更
63
+ # "~": "-", # これは長音記号「ー」として扱うよう変更
64
+ "「": "'",
65
+ "」": "'",
66
+ }
67
+
68
+
69
+ def text_normalize(text):
70
+ """
71
+ 日本語のテキストを正規化する。
72
+ 結果は、ちょうど次の文字のみからなる:
73
+ - ひらがな
74
+ - カタカナ(全角長音記号「ー」が入る!)
75
+ - 漢字
76
+ - 半角アルファベット(大文字と小文字)
77
+ - ギリシャ文字
78
+ - `.` (句点`。`や`…`の一部や改行等)
79
+ - `,` (読点`、`や`:`等)
80
+ - `?` (疑問符`?`)
81
+ - `!` (感嘆符`!`)
82
+ - `'` (`「`や`」`等)
83
+ - `-` (`―`(ダッシュ、長音記号ではない)や`-`等)
84
+
85
+ 注意点:
86
+ - 三点リーダー`…`は`...`に変換される(`なるほど…。` → `なるほど....`)
87
+ - 数字は漢字に変換される(`1,100円` → `千百円`、`52.34` → `五十二点三四`)
88
+ - 読点や疑問符等の位置・個数等は保持される(`??あ、、!!!` → `??あ,,!!!`)
89
+ """
90
+ #print(f"Before normalization: {text}")
91
+ # ここでアルファベットは半角になり、三点リーダは`...`になる
92
+ res = unicodedata.normalize("NFKC", text)
93
+
94
+ res = japanese_convert_numbers_to_words(res) # 「100円」→「百円」等
95
+
96
+ # 「~」と「~」も長音記号として扱う
97
+ res = res.replace("~", "ー")
98
+ res = res.replace("~", "ー")
99
+
100
+ res = replace_punctuation(res) # 句読点等正規化、読めない文字を削除
101
+
102
+ # 結合文字の濁点・半濁点を削除
103
+ # 通常の「ば」等はそのままのこされる、「あ゛」は上で「あ゙」になりここで「あ」になる
104
+ res = res.replace("\u3099", "") # 結合文字の濁点を削除、る゙ → る
105
+ res = res.replace("\u309A", "") # 結合文字の半濁点を削除、な゚ → な
106
+ return res
107
+
108
+
109
+ def replace_punctuation(text: str) -> str:
110
+ """句読点等を「.」「,」「!」「?」「'」「-」に正規化し、OpenJTalkで読みが取得できるもののみ残す:
111
+ 漢字・平仮名・カタカナ、アルファベット、ギリシャ文字
112
+ """
113
+ pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
114
+
115
+ # 句読点を辞書で置換
116
+ replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
117
+
118
+ replaced_text = re.sub(
119
+ # ↓ ひらがな、カタカナ、漢字
120
+ r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF\u3005"
121
+ # ↓ 半角アルファベット(大文字と小文字)
122
+ + r"\u0041-\u005A\u0061-\u007A"
123
+ # ↓ 全角アルファベット(大文字と小文字)
124
+ + r"\uFF21-\uFF3A\uFF41-\uFF5A"
125
+ # ↓ ギリシャ文字
126
+ + r"\u0370-\u03FF\u1F00-\u1FFF"
127
+ # ↓ "!", "?", "…", ",", ".", "'", "-", 但し`…`はすでに`...`に変換されている
128
+ + "".join(punctuation) + r"]+",
129
+ # 上述以外の文字を削除
130
+ "",
131
+ replaced_text,
132
+ )
133
+
134
+ return replaced_text
135
+
136
+
137
+ _NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
138
+ _CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
139
+ _CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
140
+ _NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
141
+
142
+
143
+ def japanese_convert_numbers_to_words(text: str) -> str:
144
+ res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
145
+ res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
146
+ res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
147
+ return res
148
+
149
+
150
+ def g2p(norm_text: str) -> tuple[list[str], list[int], list[int]]:
151
+ """
152
+ 他で使われるメインの関数。`text_normalize()`で正規化された`norm_text`を受け取り、
153
+ - phones: 音素のリスト(ただし`!`や`,`や`.`等punctuationが含まれうる)
154
+ - tones: アクセントのリスト、0(低)と1(高)からなり、phonesと同じ長さ
155
+ - word2ph: 元のテキストの各文字に音素が何個割り当てられるかを表すリスト
156
+ のタプルを返す。
157
+ ただし`phones`と`tones`の最初と終わりに`_`が入り、応じて`word2ph`の最初と最後に1が追加される。
158
+ """
159
+ # pyopenjtalkのフルコンテキストラベルを使ってアクセントを取り出すと、punctuationの位置が消えてしまい情報が失われてしまう:
160
+ # 「こんにちは、世界。」と「こんにちは!世界。」と「こんにちは!!!???世界……。」は全て同じになる。
161
+ # よって、まずpunctuation無しの音素とアクセントのリストを作り、
162
+ # それとは別にpyopenjtalk.run_frontend()で得られる音素リスト(こちらはpunctuationが保持される)を使い、
163
+ # アクセント割当をしなおすことによってpunctuationを含めた音素とアクセントのリストを作る。
164
+
165
+ # punctuationがすべて消えた、音素とアクセントのタプルのリスト
166
+ phone_tone_list_wo_punct = g2phone_tone_wo_punct(norm_text)
167
+
168
+ # sep_text: 単語単位の単語のリスト
169
+ # sep_kata: 単語単位の単語のカタカナ読みのリスト
170
+ sep_text, sep_kata = text2sep_kata(norm_text)
171
+
172
+ # sep_phonemes: 各単語ごとの音素のリストのリスト
173
+ sep_phonemes = handle_long([kata2phoneme_list(i) for i in sep_kata])
174
+
175
+ # phone_w_punct: sep_phonemesを結合した、punctuationを元のまま保持した音素列
176
+ phone_w_punct: list[str] = []
177
+ for i in sep_phonemes:
178
+ phone_w_punct += i
179
+
180
+ # punctuation無しのアクセント情報を使って、punctuationを含めたアクセント情報を作る
181
+ phone_tone_list = align_tones(phone_w_punct, phone_tone_list_wo_punct)
182
+ # word2phは厳密な解答は不可能なので(「今日」「眼鏡」等の熟字訓が存在)、
183
+ # Bert-VITS2では、単語単位の分割を使って、単語の文字ごとにだいたい均等に音素を分配する
184
+
185
+ # sep_textから、各単語を1文字1文字分割して、文字のリスト(のリスト)を作る
186
+ sep_tokenized: list[list[str]] = []
187
+ for i in sep_text:
188
+ if i not in punctuation:
189
+ sep_tokenized.append(tokenizer.tokenize(i)) # ここでおそらく`i`が文字単位に分割される
190
+ else:
191
+ sep_tokenized.append([i])
192
+
193
+ # 各単語について、音素の数と文字の数を比較して、均等っぽく分配する
194
+ word2ph = []
195
+ for token, phoneme in zip(sep_tokenized, sep_phonemes):
196
+ phone_len = len(phoneme)
197
+ word_len = len(token)
198
+ word2ph += distribute_phone(phone_len, word_len)
199
+
200
+ # 最初と最後に`_`記号を追加、アクセントは0(低)、word2phもそれに合わせて追加
201
+ phone_tone_list = [("_", 0)] + phone_tone_list + [("_", 0)]
202
+ word2ph = [1] + word2ph + [1]
203
+
204
+ phones = [phone for phone, _ in phone_tone_list]
205
+ tones = [tone for _, tone in phone_tone_list]
206
+
207
+ assert len(phones) == sum(word2ph), f"{len(phones)} != {sum(word2ph)}"
208
+
209
+ return phones, tones, word2ph
210
+
211
+
212
+ def g2phone_tone_wo_punct(text: str) -> list[tuple[str, int]]:
213
+ """
214
+ テキストに対して、音素とアクセント(0か1)のペアのリストを返す。
215
+ ただし「!」「.」「?」等の非音素記号(punctuation)は全て消える(ポーズ記号も残さない)。
216
+ 非音素記号を含める処理は`align_tones()`で行われる。
217
+ また「っ」は「cl」でなく「q」に変換される(「ん」は「N」のまま)。
218
+ 例: "こんにちは、世界ー。。元気?!" →
219
+ [('k', 0), ('o', 0), ('N', 1), ('n', 1), ('i', 1), ('ch', 1), ('i', 1), ('w', 1), ('a', 1), ('s', 1), ('e', 1), ('k', 0), ('a', 0), ('i', 0), ('i', 0), ('g', 1), ('e', 1), ('N', 0), ('k', 0), ('i', 0)]
220
+ """
221
+ prosodies = pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True)
222
+ result: list[tuple[str, int]] = []
223
+ current_phrase: list[tuple[str, int]] = []
224
+ current_tone = 0
225
+ for i, letter in enumerate(prosodies):
226
+ # 特殊記号の処理
227
+
228
+ # 文頭記号、無視する
229
+ if letter == "^":
230
+ assert i == 0, "Unexpected ^"
231
+ # アクセント句の終わりに来る記号
232
+ elif letter in ("$", "?", "_", "#"):
233
+ # 保持しているフレーズを、アクセント数値を0-1に修正し結果に追加
234
+ result.extend(fix_phone_tone(current_phrase))
235
+ # 末尾に来る終了記号、無視(文中の疑問文は`_`になる)
236
+ if letter in ("$", "?"):
237
+ assert i == len(prosodies) - 1, f"Unexpected {letter}"
238
+ # あとは"_"(ポーズ)と"#"(アクセント句の境界)のみ
239
+ # これらは残さず、次のアクセント句に備える。
240
+ current_phrase = []
241
+ # 0を基準点にしてそこから上昇・下降する(負の場合は上の`fix_phone_tone`で直る)
242
+ current_tone = 0
243
+ # アクセント上昇記号
244
+ elif letter == "[":
245
+ current_tone = current_tone + 1
246
+ # アクセント下降記号
247
+ elif letter == "]":
248
+ current_tone = current_tone - 1
249
+ # それ以外は通常の音素
250
+ else:
251
+ if letter == "cl": # 「っ」の処理
252
+ letter = "q"
253
+ current_phrase.append((letter, current_tone))
254
+ return result
255
+
256
+
257
+ def text2sep_kata(norm_text: str) -> tuple[list[str], list[str]]:
258
+ """
259
+ `text_normalize`で正規化済みの`norm_text`を受け取り、それを単語分割し、
260
+ 分割された単語リストとその読み(カタカナor記号1文字)のリストのタプルを返す。
261
+ 単語分割結果は、`g2p()`の`word2ph`で1文字あたりに割り振る音素記号の数を決めるために使う。
262
+ 例:
263
+ `私はそう思う!って感じ?` →
264
+ ["私", "は", "そう", "思う", "!", "って", "感じ", "?"], ["ワタシ", "ワ", "ソー", "オモウ", "!", "ッテ", "カンジ", "?"]
265
+ """
266
+ # parsed: OpenJTalkの解析結果
267
+ parsed = pyopenjtalk.run_frontend(norm_text)
268
+ sep_text: list[str] = []
269
+ sep_kata: list[str] = []
270
+ for parts in parsed:
271
+ # word: 実際の単語の文字列
272
+ # yomi: その読み、但し無声化サインの`’`は除去
273
+ word, yomi = replace_punctuation(parts["string"]), parts["pron"].replace(
274
+ "’", ""
275
+ )
276
+ """
277
+ ここで`yomi`の取りうる値は以下の通りのはず。
278
+ - `word`が通常単語 → 通常の読み(カタカナ)
279
+ (カタカナからなり、長音記号も含みうる、`アー` 等)
280
+ - `word`が`ー` から始まる → `ーラー` や `ーーー` など
281
+ - `word`が句読点や空白等 → `、`
282
+ - `word`が`?` → `?`(全角になる)
283
+ 他にも`word`が読めないキリル文字アラビア文字等が来ると`、`になるが、正規化でこの場合は起きないはず。
284
+ また元のコードでは`yomi`が空白の場合の処理があったが、これは起きないはず。
285
+ 処理すべきは`yomi`が`、`の場合のみのはず。
286
+ """
287
+ assert yomi != "", f"Empty yomi: {word}"
288
+ if yomi == "、":
289
+ # wordは正規化されているので、`.`, `,`, `!`, `'`, `-`のいずれか
290
+ if word not in (
291
+ ".",
292
+ ",",
293
+ "!",
294
+ "'",
295
+ "-",
296
+ ):
297
+ # ここはpyopenjtalkが読めない文字等のときに起こる
298
+ raise ValueError(f"Cannot read: {word} in:\n{norm_text}")
299
+ # yomiは元の記号のままに変更
300
+ yomi = word
301
+ elif yomi == "?":
302
+ assert word == "?", f"yomi `?` comes from: {word}"
303
+ yomi = "?"
304
+ sep_text.append(word)
305
+ sep_kata.append(yomi)
306
+ return sep_text, sep_kata
307
+
308
+
309
+ # ESPnetの実装から引用、変更点無し
310
+ # https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
311
+ def pyopenjtalk_g2p_prosody(text: str, drop_unvoiced_vowels: bool = True) -> list[str]:
312
+ """Extract phoneme + prosoody symbol sequence from input full-context labels.
313
+
314
+ The algorithm is based on `Prosodic features control by symbols as input of
315
+ sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.
316
+
317
+ Args:
318
+ text (str): Input text.
319
+ drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.
320
+
321
+ Returns:
322
+ List[str]: List of phoneme + prosody symbols.
323
+
324
+ Examples:
325
+ >>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
326
+ >>> pyopenjtalk_g2p_prosody("こんにちは。")
327
+ ['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']
328
+
329
+ .. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
330
+ modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104
331
+
332
+ """
333
+ labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
334
+ N = len(labels)
335
+
336
+ phones = []
337
+ for n in range(N):
338
+ lab_curr = labels[n]
339
+
340
+ # current phoneme
341
+ p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
342
+ # deal unvoiced vowels as normal vowels
343
+ if drop_unvoiced_vowels and p3 in "AEIOU":
344
+ p3 = p3.lower()
345
+
346
+ # deal with sil at the beginning and the end of text
347
+ if p3 == "sil":
348
+ assert n == 0 or n == N - 1
349
+ if n == 0:
350
+ phones.append("^")
351
+ elif n == N - 1:
352
+ # check question form or not
353
+ e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
354
+ if e3 == 0:
355
+ phones.append("$")
356
+ elif e3 == 1:
357
+ phones.append("?")
358
+ continue
359
+ elif p3 == "pau":
360
+ phones.append("_")
361
+ continue
362
+ else:
363
+ phones.append(p3)
364
+
365
+ # accent type and position info (forward or backward)
366
+ a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
367
+ a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
368
+ a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)
369
+
370
+ # number of mora in accent phrase
371
+ f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)
372
+
373
+ a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
374
+ # accent phrase border
375
+ if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
376
+ phones.append("#")
377
+ # pitch falling
378
+ elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
379
+ phones.append("]")
380
+ # pitch rising
381
+ elif a2 == 1 and a2_next == 2:
382
+ phones.append("[")
383
+
384
+ return phones
385
+
386
+
387
+ def _numeric_feature_by_regex(regex, s):
388
+ match = re.search(regex, s)
389
+ if match is None:
390
+ return -50
391
+ return int(match.group(1))
392
+
393
+
394
+ def fix_phone_tone(phone_tone_list: list[tuple[str, int]]) -> list[tuple[str, int]]:
395
+ """
396
+ `phone_tone_list`のtone(アクセントの値)を0か1の範囲に修正する。
397
+ 例: [(a, 0), (i, -1), (u, -1)] → [(a, 1), (i, 0), (u, 0)]
398
+ """
399
+ tone_values = set(tone for _, tone in phone_tone_list)
400
+ if len(tone_values) == 1:
401
+ assert tone_values == {0}, tone_values
402
+ return phone_tone_list
403
+ elif len(tone_values) == 2:
404
+ if tone_values == {0, 1}:
405
+ return phone_tone_list
406
+ elif tone_values == {-1, 0}:
407
+ return [
408
+ (letter, 0 if tone == -1 else 1) for letter, tone in phone_tone_list
409
+ ]
410
+ else:
411
+ raise ValueError(f"Unexpected tone values: {tone_values}")
412
+ else:
413
+ raise ValueError(f"Unexpected tone values: {tone_values}")
414
+
415
+
416
+ def distribute_phone(n_phone: int, n_word: int) -> list[int]:
417
+ """
418
+ 左から右に1ずつ振り分け、次にまた左から右に1ずつ増やし、というふうに、
419
+ 音素の数`n_phone`を単語の数`n_word`に分配する。
420
+ """
421
+ phones_per_word = [0] * n_word
422
+ for _ in range(n_phone):
423
+ min_tasks = min(phones_per_word)
424
+ min_index = phones_per_word.index(min_tasks)
425
+ phones_per_word[min_index] += 1
426
+ return phones_per_word
427
+
428
+
429
+ def handle_long(sep_phonemes: list[list[str]]) -> list[list[str]]:
430
+ for i in range(len(sep_phonemes)):
431
+ if sep_phonemes[i][0] == "ー":
432
+ sep_phonemes[i][0] = sep_phonemes[i - 1][-1]
433
+ if "ー" in sep_phonemes[i]:
434
+ for j in range(len(sep_phonemes[i])):
435
+ if sep_phonemes[i][j] == "ー":
436
+ sep_phonemes[i][j] = sep_phonemes[i][j - 1][-1]
437
+ return sep_phonemes
438
+
439
+
440
+ tokenizer = AutoTokenizer.from_pretrained("./bert/deberta-v2-large-japanese-char-wwm")
441
+
442
+
443
+ def align_tones(
444
+ phones_with_punct: list[str], phone_tone_list: list[tuple[str, int]]
445
+ ) -> list[tuple[str, int]]:
446
+ """
447
+ 例:
448
+ …私は、、そう思う。
449
+ phones_with_punct:
450
+ [".", ".", ".", "w", "a", "t", "a", "sh", "i", "w", "a", ",", ",", "s", "o", "o", "o", "m", "o", "u", "."]
451
+ phone_tone_list:
452
+ [("w", 0), ("a", 0), ("t", 1), ("a", 1), ("sh", 1), ("i", 1), ("w", 1), ("a", 1), ("s", 0), ("o", 0), ("o", 1), ("o", 1), ("m", 1), ("o", 1), ("u", 0))]
453
+ Return:
454
+ [(".", 0), (".", 0), (".", 0), ("w", 0), ("a", 0), ("t", 1), ("a", 1), ("sh", 1), ("i", 1), ("w", 1), ("a", 1), (",", 0), (",", 0), ("s", 0), ("o", 0), ("o", 1), ("o", 1), ("m", 1), ("o", 1), ("u", 0), (".", 0)]
455
+ """
456
+ result: list[tuple[str, int]] = []
457
+ tone_index = 0
458
+ for phone in phones_with_punct:
459
+ if tone_index >= len(phone_tone_list):
460
+ # 余ったpunctuationがある場合 → (punctuation, 0)を追加
461
+ result.append((phone, 0))
462
+ elif phone == phone_tone_list[tone_index][0]:
463
+ # phone_tone_listの現在の音素と一致する場合 → toneをそこから取得、(phone, tone)を追加
464
+ result.append((phone, phone_tone_list[tone_index][1]))
465
+ # 探すindexを1つ進める
466
+ tone_index += 1
467
+ elif phone in punctuation:
468
+ # phoneがpunctuationの場合 → (phone, 0)を追加
469
+ result.append((phone, 0))
470
+ else:
471
+ print(f"phones: {phones_with_punct}")
472
+ print(f"phone_tone_list: {phone_tone_list}")
473
+ print(f"result: {result}")
474
+ print(f"tone_index: {tone_index}")
475
+ print(f"phone: {phone}")
476
+ raise ValueError(f"Unexpected phone: {phone}")
477
+ return result
478
+
479
+
480
+ def kata2phoneme_list(text: str) -> list[str]:
481
+ """
482
+ 原則カタカナの`text`を受け取り、それをそ��ままいじらずに音素記号のリストに変換。
483
+ 注意点:
484
+ - punctuationが来た場合(punctuationが1文字の場合がありうる)、処理せず1文字のリストを返す
485
+ - 冒頭に続く「ー」はそのまま「ー」のままにする(`handle_long()`で処理される)
486
+ - 文中の「ー」は前の音素記号の最後の音素記号に変換される。
487
+ 例:
488
+ `ーーソーナノカーー` → ["ー", "ー", "s", "o", "o", "n", "a", "n", "o", "k", "a", "a", "a"]
489
+ `?` → ["?"]
490
+ """
491
+ if text in punctuation:
492
+ return [text]
493
+ # `text`がカタカナ(`ー`含む)のみからなるかどうかをチェック
494
+ if re.fullmatch(r"[\u30A0-\u30FF]+", text) is None:
495
+ raise ValueError(f"Input must be katakana only: {text}")
496
+ sorted_keys = sorted(mora_kata_to_mora_phonemes.keys(), key=len, reverse=True)
497
+ pattern = "|".join(map(re.escape, sorted_keys))
498
+
499
+ def mora2phonemes(mora: str) -> str:
500
+ cosonant, vowel = mora_kata_to_mora_phonemes[mora]
501
+ if cosonant is None:
502
+ return f" {vowel}"
503
+ return f" {cosonant} {vowel}"
504
+
505
+ spaced_phonemes = re.sub(pattern, lambda m: mora2phonemes(m.group()), text)
506
+
507
+ # 長音記号「ー」の処理
508
+ long_pattern = r"(\w)(ー*)"
509
+ long_replacement = lambda m: m.group(1) + (" " + m.group(1)) * len(m.group(2))
510
+ spaced_phonemes = re.sub(long_pattern, long_replacement, spaced_phonemes)
511
+ return spaced_phonemes.strip().split(" ")
512
+
513
+
514
+ if __name__ == "__main__":
515
+ tokenizer = AutoTokenizer.from_pretrained(
516
+ "./bert/deberta-v2-large-japanese-char-wwm"
517
+ )
518
+ text = "hello,こんにちは、世界ー~!……"
519
+
520
+ from text.japanese_bert import get_bert_feature
521
+
522
+ text = text_normalize(text)
523
+ print(text)
524
+
525
+ phones, tones, word2ph = g2p(text)
526
+ print(phones, tones, word2ph)
527
+ bert = get_bert_feature(text, word2ph)
528
+
529
+ print(phones, tones, word2ph, bert.shape)
text/japanese_bert.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+
3
+ import torch
4
+ from transformers import AutoModelForMaskedLM, AutoTokenizer
5
+
6
+ from config import config
7
+ from text.japanese import text2sep_kata
8
+
9
+ LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
10
+
11
+ tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12
+
13
+ models = dict()
14
+
15
+
16
+ def get_bert_feature(
17
+ text,
18
+ word2ph,
19
+ device=config.bert_gen_config.device,
20
+ style_text=None,
21
+ style_weight=0.7,
22
+ ):
23
+ text = "".join(text2sep_kata(text)[0])
24
+ if style_text:
25
+ style_text = "".join(text2sep_kata(style_text)[0])
26
+ if (
27
+ sys.platform == "darwin"
28
+ and torch.backends.mps.is_available()
29
+ and device == "cpu"
30
+ ):
31
+ device = "mps"
32
+ if not device:
33
+ device = "cuda"
34
+ if device not in models.keys():
35
+ if config.webui_config.fp16_run:
36
+ models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH, torch_dtype=torch.float16).to(device)
37
+ else:
38
+ models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
39
+ with torch.no_grad():
40
+ inputs = tokenizer(text, return_tensors="pt")
41
+ for i in inputs:
42
+ inputs[i] = inputs[i].to(device)
43
+ res = models[device](**inputs, output_hidden_states=True)
44
+ res = torch.cat(res["hidden_states"][-3:-2], -1)[0].float().cpu()
45
+ if style_text:
46
+ style_inputs = tokenizer(style_text, return_tensors="pt")
47
+ for i in style_inputs:
48
+ style_inputs[i] = style_inputs[i].to(device)
49
+ style_res = models[device](**style_inputs, output_hidden_states=True)
50
+ style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].float().cpu()
51
+ style_res_mean = style_res.mean(0)
52
+
53
+ assert len(word2ph) == len(text) + 2
54
+ word2phone = word2ph
55
+ phone_level_feature = []
56
+ for i in range(len(word2phone)):
57
+ if style_text:
58
+ repeat_feature = (
59
+ res[i].repeat(word2phone[i], 1) * (1 - style_weight)
60
+ + style_res_mean.repeat(word2phone[i], 1) * style_weight
61
+ )
62
+ else:
63
+ repeat_feature = res[i].repeat(word2phone[i], 1)
64
+ phone_level_feature.append(repeat_feature)
65
+
66
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
67
+
68
+ return phone_level_feature.T
text/japanese_mora_list.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ VOICEVOXのソースコードからお借りして最低限に改造したコード。
3
+ https://github.com/VOICEVOX/voicevox_engine/blob/master/voicevox_engine/tts_pipeline/mora_list.py
4
+ """
5
+ """
6
+ 以下のモーラ対応表はOpenJTalkのソースコードから取得し、
7
+ カタカナ表記とモーラが一対一対応するように改造した。
8
+ ライセンス表記:
9
+ -----------------------------------------------------------------
10
+ The Japanese TTS System "Open JTalk"
11
+ developed by HTS Working Group
12
+ http://open-jtalk.sourceforge.net/
13
+ -----------------------------------------------------------------
14
+
15
+ Copyright (c) 2008-2014 Nagoya Institute of Technology
16
+ Department of Computer Science
17
+
18
+ All rights reserved.
19
+
20
+ Redistribution and use in source and binary forms, with or
21
+ without modification, are permitted provided that the following
22
+ conditions are met:
23
+
24
+ - Redistributions of source code must retain the above copyright
25
+ notice, this list of conditions and the following disclaimer.
26
+ - Redistributions in binary form must reproduce the above
27
+ copyright notice, this list of conditions and the following
28
+ disclaimer in the documentation and/or other materials provided
29
+ with the distribution.
30
+ - Neither the name of the HTS working group nor the names of its
31
+ contributors may be used to endorse or promote products derived
32
+ from this software without specific prior written permission.
33
+
34
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
35
+ CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
36
+ INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
37
+ MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
38
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
39
+ BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
40
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
41
+ TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
42
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
43
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
44
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
45
+ OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
46
+ POSSIBILITY OF SUCH DAMAGE.
47
+ """
48
+ from typing import Optional
49
+
50
+ # (カタカナ, 子音, 母音)の順。子音がない場合はNoneを入れる。
51
+ # 但し「ン」と「ッ」は母音のみという扱いで、それぞれ「N」「q (clから変更)」
52
+ # また「デェ = dy e」はpyopenjtalkの出力(de e)と合わないため削除
53
+ _mora_list_minimum: list[tuple[str, Optional[str], str]] = [
54
+ ("ヴォ", "v", "o"),
55
+ ("ヴェ", "v", "e"),
56
+ ("ヴィ", "v", "i"),
57
+ ("ヴァ", "v", "a"),
58
+ ("ヴ", "v", "u"),
59
+ ("ン", None, "N"),
60
+ ("ワ", "w", "a"),
61
+ ("ロ", "r", "o"),
62
+ ("レ", "r", "e"),
63
+ ("ル", "r", "u"),
64
+ ("リョ", "ry", "o"),
65
+ ("リュ", "ry", "u"),
66
+ ("リャ", "ry", "a"),
67
+ ("リェ", "ry", "e"),
68
+ ("リ", "r", "i"),
69
+ ("ラ", "r", "a"),
70
+ ("ヨ", "y", "o"),
71
+ ("ユ", "y", "u"),
72
+ ("ヤ", "y", "a"),
73
+ ("モ", "m", "o"),
74
+ ("メ", "m", "e"),
75
+ ("ム", "m", "u"),
76
+ ("ミョ", "my", "o"),
77
+ ("ミュ", "my", "u"),
78
+ ("ミャ", "my", "a"),
79
+ ("ミェ", "my", "e"),
80
+ ("ミ", "m", "i"),
81
+ ("マ", "m", "a"),
82
+ ("ポ", "p", "o"),
83
+ ("ボ", "b", "o"),
84
+ ("ホ", "h", "o"),
85
+ ("ペ", "p", "e"),
86
+ ("ベ", "b", "e"),
87
+ ("ヘ", "h", "e"),
88
+ ("プ", "p", "u"),
89
+ ("ブ", "b", "u"),
90
+ ("フォ", "f", "o"),
91
+ ("フェ", "f", "e"),
92
+ ("フィ", "f", "i"),
93
+ ("ファ", "f", "a"),
94
+ ("フ", "f", "u"),
95
+ ("ピョ", "py", "o"),
96
+ ("ピュ", "py", "u"),
97
+ ("ピャ", "py", "a"),
98
+ ("ピェ", "py", "e"),
99
+ ("ピ", "p", "i"),
100
+ ("ビョ", "by", "o"),
101
+ ("ビュ", "by", "u"),
102
+ ("ビャ", "by", "a"),
103
+ ("ビェ", "by", "e"),
104
+ ("ビ", "b", "i"),
105
+ ("ヒョ", "hy", "o"),
106
+ ("ヒュ", "hy", "u"),
107
+ ("ヒャ", "hy", "a"),
108
+ ("ヒェ", "hy", "e"),
109
+ ("ヒ", "h", "i"),
110
+ ("パ", "p", "a"),
111
+ ("バ", "b", "a"),
112
+ ("ハ", "h", "a"),
113
+ ("ノ", "n", "o"),
114
+ ("ネ", "n", "e"),
115
+ ("ヌ", "n", "u"),
116
+ ("ニョ", "ny", "o"),
117
+ ("ニュ", "ny", "u"),
118
+ ("ニャ", "ny", "a"),
119
+ ("ニェ", "ny", "e"),
120
+ ("ニ", "n", "i"),
121
+ ("ナ", "n", "a"),
122
+ ("ドゥ", "d", "u"),
123
+ ("ド", "d", "o"),
124
+ ("トゥ", "t", "u"),
125
+ ("ト", "t", "o"),
126
+ ("デョ", "dy", "o"),
127
+ ("デュ", "dy", "u"),
128
+ ("デャ", "dy", "a"),
129
+ # ("デェ", "dy", "e"),
130
+ ("ディ", "d", "i"),
131
+ ("デ", "d", "e"),
132
+ ("テョ", "ty", "o"),
133
+ ("テュ", "ty", "u"),
134
+ ("テャ", "ty", "a"),
135
+ ("ティ", "t", "i"),
136
+ ("テ", "t", "e"),
137
+ ("ツォ", "ts", "o"),
138
+ ("ツェ", "ts", "e"),
139
+ ("ツィ", "ts", "i"),
140
+ ("ツァ", "ts", "a"),
141
+ ("ツ", "ts", "u"),
142
+ ("ッ", None, "q"), # 「cl」から「q」に変更
143
+ ("チョ", "ch", "o"),
144
+ ("チュ", "ch", "u"),
145
+ ("チャ", "ch", "a"),
146
+ ("チェ", "ch", "e"),
147
+ ("チ", "ch", "i"),
148
+ ("ダ", "d", "a"),
149
+ ("タ", "t", "a"),
150
+ ("ゾ", "z", "o"),
151
+ ("ソ", "s", "o"),
152
+ ("ゼ", "z", "e"),
153
+ ("セ", "s", "e"),
154
+ ("ズィ", "z", "i"),
155
+ ("ズ", "z", "u"),
156
+ ("スィ", "s", "i"),
157
+ ("ス", "s", "u"),
158
+ ("ジョ", "j", "o"),
159
+ ("ジュ", "j", "u"),
160
+ ("ジャ", "j", "a"),
161
+ ("ジェ", "j", "e"),
162
+ ("ジ", "j", "i"),
163
+ ("ショ", "sh", "o"),
164
+ ("シュ", "sh", "u"),
165
+ ("シャ", "sh", "a"),
166
+ ("シェ", "sh", "e"),
167
+ ("シ", "sh", "i"),
168
+ ("ザ", "z", "a"),
169
+ ("サ", "s", "a"),
170
+ ("ゴ", "g", "o"),
171
+ ("コ", "k", "o"),
172
+ ("ゲ", "g", "e"),
173
+ ("ケ", "k", "e"),
174
+ ("グヮ", "gw", "a"),
175
+ ("グ", "g", "u"),
176
+ ("クヮ", "kw", "a"),
177
+ ("ク", "k", "u"),
178
+ ("ギョ", "gy", "o"),
179
+ ("ギュ", "gy", "u"),
180
+ ("ギャ", "gy", "a"),
181
+ ("ギェ", "gy", "e"),
182
+ ("ギ", "g", "i"),
183
+ ("キョ", "ky", "o"),
184
+ ("キュ", "ky", "u"),
185
+ ("キャ", "ky", "a"),
186
+ ("キェ", "ky", "e"),
187
+ ("キ", "k", "i"),
188
+ ("ガ", "g", "a"),
189
+ ("カ", "k", "a"),
190
+ ("オ", None, "o"),
191
+ ("エ", None, "e"),
192
+ ("ウォ", "w", "o"),
193
+ ("ウェ", "w", "e"),
194
+ ("ウィ", "w", "i"),
195
+ ("ウ", None, "u"),
196
+ ("イェ", "y", "e"),
197
+ ("イ", None, "i"),
198
+ ("ア", None, "a"),
199
+ ]
200
+ _mora_list_additional: list[tuple[str, Optional[str], str]] = [
201
+ ("ヴョ", "by", "o"),
202
+ ("ヴュ", "by", "u"),
203
+ ("ヴャ", "by", "a"),
204
+ ("ヲ", None, "o"),
205
+ ("ヱ", None, "e"),
206
+ ("ヰ", None, "i"),
207
+ ("ヮ", "w", "a"),
208
+ ("ョ", "y", "o"),
209
+ ("ュ", "y", "u"),
210
+ ("ヅ", "z", "u"),
211
+ ("ヂ", "j", "i"),
212
+ ("ヶ", "k", "e"),
213
+ ("ャ", "y", "a"),
214
+ ("ォ", None, "o"),
215
+ ("ェ", None, "e"),
216
+ ("ゥ", None, "u"),
217
+ ("ィ", None, "i"),
218
+ ("ァ", None, "a"),
219
+ ]
220
+
221
+ # e.g. "vo" -> "ヴォ", "a" -> "ア"
222
+ mora_phonemes_to_mora_kata: dict[str, str] = {
223
+ (consonant or "") + vowel: kana for [kana, consonant, vowel] in _mora_list_minimum
224
+ }
225
+
226
+ # e.g. "ヴォ" -> ("v", "o"), "ア" -> (None, "a")
227
+ mora_kata_to_mora_phonemes: dict[str, tuple[Optional[str], str]] = {
228
+ kana: (consonant, vowel)
229
+ for [kana, consonant, vowel] in _mora_list_minimum + _mora_list_additional
230
+ }
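
The two dictionaries above are inverses of each other: mora_phonemes_to_mora_kata keys concatenated phoneme strings to katakana morae (built from _mora_list_minimum only), while mora_kata_to_mora_phonemes maps each katakana mora to its (consonant, vowel) pair. A minimal usage sketch, assuming the module is importable as text.japanese_mora_list:

from text.japanese_mora_list import (
    mora_kata_to_mora_phonemes,
    mora_phonemes_to_mora_kata,
)

# katakana mora -> (consonant, vowel); the consonant is None for bare vowels
assert mora_kata_to_mora_phonemes["キョ"] == ("ky", "o")
assert mora_kata_to_mora_phonemes["ア"] == (None, "a")

# concatenated phonemes -> katakana mora
assert mora_phonemes_to_mora_kata["kyo"] == "キョ"
assert mora_phonemes_to_mora_kata["a"] == "ア"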
text/opencpop-strict.txt ADDED
@@ -0,0 +1,429 @@
1
+ a AA a
2
+ ai AA ai
3
+ an AA an
4
+ ang AA ang
5
+ ao AA ao
6
+ ba b a
7
+ bai b ai
8
+ ban b an
9
+ bang b ang
10
+ bao b ao
11
+ bei b ei
12
+ ben b en
13
+ beng b eng
14
+ bi b i
15
+ bian b ian
16
+ biao b iao
17
+ bie b ie
18
+ bin b in
19
+ bing b ing
20
+ bo b o
21
+ bu b u
22
+ ca c a
23
+ cai c ai
24
+ can c an
25
+ cang c ang
26
+ cao c ao
27
+ ce c e
28
+ cei c ei
29
+ cen c en
30
+ ceng c eng
31
+ cha ch a
32
+ chai ch ai
33
+ chan ch an
34
+ chang ch ang
35
+ chao ch ao
36
+ che ch e
37
+ chen ch en
38
+ cheng ch eng
39
+ chi ch ir
40
+ chong ch ong
41
+ chou ch ou
42
+ chu ch u
43
+ chua ch ua
44
+ chuai ch uai
45
+ chuan ch uan
46
+ chuang ch uang
47
+ chui ch ui
48
+ chun ch un
49
+ chuo ch uo
50
+ ci c i0
51
+ cong c ong
52
+ cou c ou
53
+ cu c u
54
+ cuan c uan
55
+ cui c ui
56
+ cun c un
57
+ cuo c uo
58
+ da d a
59
+ dai d ai
60
+ dan d an
61
+ dang d ang
62
+ dao d ao
63
+ de d e
64
+ dei d ei
65
+ den d en
66
+ deng d eng
67
+ di d i
68
+ dia d ia
69
+ dian d ian
70
+ diao d iao
71
+ die d ie
72
+ ding d ing
73
+ diu d iu
74
+ dong d ong
75
+ dou d ou
76
+ du d u
77
+ duan d uan
78
+ dui d ui
79
+ dun d un
80
+ duo d uo
81
+ e EE e
82
+ ei EE ei
83
+ en EE en
84
+ eng EE eng
85
+ er EE er
86
+ fa f a
87
+ fan f an
88
+ fang f ang
89
+ fei f ei
90
+ fen f en
91
+ feng f eng
92
+ fo f o
93
+ fou f ou
94
+ fu f u
95
+ ga g a
96
+ gai g ai
97
+ gan g an
98
+ gang g ang
99
+ gao g ao
100
+ ge g e
101
+ gei g ei
102
+ gen g en
103
+ geng g eng
104
+ gong g ong
105
+ gou g ou
106
+ gu g u
107
+ gua g ua
108
+ guai g uai
109
+ guan g uan
110
+ guang g uang
111
+ gui g ui
112
+ gun g un
113
+ guo g uo
114
+ ha h a
115
+ hai h ai
116
+ han h an
117
+ hang h ang
118
+ hao h ao
119
+ he h e
120
+ hei h ei
121
+ hen h en
122
+ heng h eng
123
+ hong h ong
124
+ hou h ou
125
+ hu h u
126
+ hua h ua
127
+ huai h uai
128
+ huan h uan
129
+ huang h uang
130
+ hui h ui
131
+ hun h un
132
+ huo h uo
133
+ ji j i
134
+ jia j ia
135
+ jian j ian
136
+ jiang j iang
137
+ jiao j iao
138
+ jie j ie
139
+ jin j in
140
+ jing j ing
141
+ jiong j iong
142
+ jiu j iu
143
+ ju j v
144
+ jv j v
145
+ juan j van
146
+ jvan j van
147
+ jue j ve
148
+ jve j ve
149
+ jun j vn
150
+ jvn j vn
151
+ ka k a
152
+ kai k ai
153
+ kan k an
154
+ kang k ang
155
+ kao k ao
156
+ ke k e
157
+ kei k ei
158
+ ken k en
159
+ keng k eng
160
+ kong k ong
161
+ kou k ou
162
+ ku k u
163
+ kua k ua
164
+ kuai k uai
165
+ kuan k uan
166
+ kuang k uang
167
+ kui k ui
168
+ kun k un
169
+ kuo k uo
170
+ la l a
171
+ lai l ai
172
+ lan l an
173
+ lang l ang
174
+ lao l ao
175
+ le l e
176
+ lei l ei
177
+ leng l eng
178
+ li l i
179
+ lia l ia
180
+ lian l ian
181
+ liang l iang
182
+ liao l iao
183
+ lie l ie
184
+ lin l in
185
+ ling l ing
186
+ liu l iu
187
+ lo l o
188
+ long l ong
189
+ lou l ou
190
+ lu l u
191
+ luan l uan
192
+ lun l un
193
+ luo l uo
194
+ lv l v
195
+ lve l ve
196
+ ma m a
197
+ mai m ai
198
+ man m an
199
+ mang m ang
200
+ mao m ao
201
+ me m e
202
+ mei m ei
203
+ men m en
204
+ meng m eng
205
+ mi m i
206
+ mian m ian
207
+ miao m iao
208
+ mie m ie
209
+ min m in
210
+ ming m ing
211
+ miu m iu
212
+ mo m o
213
+ mou m ou
214
+ mu m u
215
+ na n a
216
+ nai n ai
217
+ nan n an
218
+ nang n ang
219
+ nao n ao
220
+ ne n e
221
+ nei n ei
222
+ nen n en
223
+ neng n eng
224
+ ni n i
225
+ nian n ian
226
+ niang n iang
227
+ niao n iao
228
+ nie n ie
229
+ nin n in
230
+ ning n ing
231
+ niu n iu
232
+ nong n ong
233
+ nou n ou
234
+ nu n u
235
+ nuan n uan
236
+ nun n un
237
+ nuo n uo
238
+ nv n v
239
+ nve n ve
240
+ o OO o
241
+ ou OO ou
242
+ pa p a
243
+ pai p ai
244
+ pan p an
245
+ pang p ang
246
+ pao p ao
247
+ pei p ei
248
+ pen p en
249
+ peng p eng
250
+ pi p i
251
+ pian p ian
252
+ piao p iao
253
+ pie p ie
254
+ pin p in
255
+ ping p ing
256
+ po p o
257
+ pou p ou
258
+ pu p u
259
+ qi q i
260
+ qia q ia
261
+ qian q ian
262
+ qiang q iang
263
+ qiao q iao
264
+ qie q ie
265
+ qin q in
266
+ qing q ing
267
+ qiong q iong
268
+ qiu q iu
269
+ qu q v
270
+ qv q v
271
+ quan q van
272
+ qvan q van
273
+ que q ve
274
+ qve q ve
275
+ qun q vn
276
+ qvn q vn
277
+ ran r an
278
+ rang r ang
279
+ rao r ao
280
+ re r e
281
+ ren r en
282
+ reng r eng
283
+ ri r ir
284
+ rong r ong
285
+ rou r ou
286
+ ru r u
287
+ rua r ua
288
+ ruan r uan
289
+ rui r ui
290
+ run r un
291
+ ruo r uo
292
+ sa s a
293
+ sai s ai
294
+ san s an
295
+ sang s ang
296
+ sao s ao
297
+ se s e
298
+ sen s en
299
+ seng s eng
300
+ sha sh a
301
+ shai sh ai
302
+ shan sh an
303
+ shang sh ang
304
+ shao sh ao
305
+ she sh e
306
+ shei sh ei
307
+ shen sh en
308
+ sheng sh eng
309
+ shi sh ir
310
+ shou sh ou
311
+ shu sh u
312
+ shua sh ua
313
+ shuai sh uai
314
+ shuan sh uan
315
+ shuang sh uang
316
+ shui sh ui
317
+ shun sh un
318
+ shuo sh uo
319
+ si s i0
320
+ song s ong
321
+ sou s ou
322
+ su s u
323
+ suan s uan
324
+ sui s ui
325
+ sun s un
326
+ suo s uo
327
+ ta t a
328
+ tai t ai
329
+ tan t an
330
+ tang t ang
331
+ tao t ao
332
+ te t e
333
+ tei t ei
334
+ teng t eng
335
+ ti t i
336
+ tian t ian
337
+ tiao t iao
338
+ tie t ie
339
+ ting t ing
340
+ tong t ong
341
+ tou t ou
342
+ tu t u
343
+ tuan t uan
344
+ tui t ui
345
+ tun t un
346
+ tuo t uo
347
+ wa w a
348
+ wai w ai
349
+ wan w an
350
+ wang w ang
351
+ wei w ei
352
+ wen w en
353
+ weng w eng
354
+ wo w o
355
+ wu w u
356
+ xi x i
357
+ xia x ia
358
+ xian x ian
359
+ xiang x iang
360
+ xiao x iao
361
+ xie x ie
362
+ xin x in
363
+ xing x ing
364
+ xiong x iong
365
+ xiu x iu
366
+ xu x v
367
+ xv x v
368
+ xuan x van
369
+ xvan x van
370
+ xue x ve
371
+ xve x ve
372
+ xun x vn
373
+ xvn x vn
374
+ ya y a
375
+ yan y En
376
+ yang y ang
377
+ yao y ao
378
+ ye y E
379
+ yi y i
380
+ yin y in
381
+ ying y ing
382
+ yo y o
383
+ yong y ong
384
+ you y ou
385
+ yu y v
386
+ yv y v
387
+ yuan y van
388
+ yvan y van
389
+ yue y ve
390
+ yve y ve
391
+ yun y vn
392
+ yvn y vn
393
+ za z a
394
+ zai z ai
395
+ zan z an
396
+ zang z ang
397
+ zao z ao
398
+ ze z e
399
+ zei z ei
400
+ zen z en
401
+ zeng z eng
402
+ zha zh a
403
+ zhai zh ai
404
+ zhan zh an
405
+ zhang zh ang
406
+ zhao zh ao
407
+ zhe zh e
408
+ zhei zh ei
409
+ zhen zh en
410
+ zheng zh eng
411
+ zhi zh ir
412
+ zhong zh ong
413
+ zhou zh ou
414
+ zhu zh u
415
+ zhua zh ua
416
+ zhuai zh uai
417
+ zhuan zh uan
418
+ zhuang zh uang
419
+ zhui zh ui
420
+ zhun zh un
421
+ zhuo zh uo
422
+ zi z i0
423
+ zong z ong
424
+ zou z ou
425
+ zu z u
426
+ zuan z uan
427
+ zui z ui
428
+ zun z un
429
+ zuo z uo
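
Each line of opencpop-strict.txt maps a pinyin syllable to its initial and final in the symbol set used here: standalone vowels get placeholder initials (AA, EE, OO), ü is written as v, and the apical vowels of zi/ci/si and zhi/chi/shi/ri become i0 and ir. A minimal parsing sketch (load_pinyin_to_symbols is a hypothetical helper; the repository's Chinese front end may load this table differently):

from pathlib import Path

def load_pinyin_to_symbols(path="text/opencpop-strict.txt"):
    # each non-empty line is "syllable initial final", e.g. "zhi zh ir"
    mapping = {}
    for line in Path(path).read_text(encoding="utf-8").splitlines():
        if line.strip():
            syllable, *phones = line.split()
            mapping[syllable] = phones
    return mapping

pinyin_to_symbols = load_pinyin_to_symbols()
assert pinyin_to_symbols["zhi"] == ["zh", "ir"]
assert pinyin_to_symbols["lv"] == ["l", "v"]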
text/symbols.py ADDED
@@ -0,0 +1,187 @@
1
+ punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2
+ pu_symbols = punctuation + ["SP", "UNK"]
3
+ pad = "_"
4
+
5
+ # chinese
6
+ zh_symbols = [
7
+ "E",
8
+ "En",
9
+ "a",
10
+ "ai",
11
+ "an",
12
+ "ang",
13
+ "ao",
14
+ "b",
15
+ "c",
16
+ "ch",
17
+ "d",
18
+ "e",
19
+ "ei",
20
+ "en",
21
+ "eng",
22
+ "er",
23
+ "f",
24
+ "g",
25
+ "h",
26
+ "i",
27
+ "i0",
28
+ "ia",
29
+ "ian",
30
+ "iang",
31
+ "iao",
32
+ "ie",
33
+ "in",
34
+ "ing",
35
+ "iong",
36
+ "ir",
37
+ "iu",
38
+ "j",
39
+ "k",
40
+ "l",
41
+ "m",
42
+ "n",
43
+ "o",
44
+ "ong",
45
+ "ou",
46
+ "p",
47
+ "q",
48
+ "r",
49
+ "s",
50
+ "sh",
51
+ "t",
52
+ "u",
53
+ "ua",
54
+ "uai",
55
+ "uan",
56
+ "uang",
57
+ "ui",
58
+ "un",
59
+ "uo",
60
+ "v",
61
+ "van",
62
+ "ve",
63
+ "vn",
64
+ "w",
65
+ "x",
66
+ "y",
67
+ "z",
68
+ "zh",
69
+ "AA",
70
+ "EE",
71
+ "OO",
72
+ ]
73
+ num_zh_tones = 6
74
+
75
+ # japanese
76
+ ja_symbols = [
77
+ "N",
78
+ "a",
79
+ "a:",
80
+ "b",
81
+ "by",
82
+ "ch",
83
+ "d",
84
+ "dy",
85
+ "e",
86
+ "e:",
87
+ "f",
88
+ "g",
89
+ "gy",
90
+ "h",
91
+ "hy",
92
+ "i",
93
+ "i:",
94
+ "j",
95
+ "k",
96
+ "ky",
97
+ "m",
98
+ "my",
99
+ "n",
100
+ "ny",
101
+ "o",
102
+ "o:",
103
+ "p",
104
+ "py",
105
+ "q",
106
+ "r",
107
+ "ry",
108
+ "s",
109
+ "sh",
110
+ "t",
111
+ "ts",
112
+ "ty",
113
+ "u",
114
+ "u:",
115
+ "w",
116
+ "y",
117
+ "z",
118
+ "zy",
119
+ ]
120
+ num_ja_tones = 2
121
+
122
+ # English
123
+ en_symbols = [
124
+ "aa",
125
+ "ae",
126
+ "ah",
127
+ "ao",
128
+ "aw",
129
+ "ay",
130
+ "b",
131
+ "ch",
132
+ "d",
133
+ "dh",
134
+ "eh",
135
+ "er",
136
+ "ey",
137
+ "f",
138
+ "g",
139
+ "hh",
140
+ "ih",
141
+ "iy",
142
+ "jh",
143
+ "k",
144
+ "l",
145
+ "m",
146
+ "n",
147
+ "ng",
148
+ "ow",
149
+ "oy",
150
+ "p",
151
+ "r",
152
+ "s",
153
+ "sh",
154
+ "t",
155
+ "th",
156
+ "uh",
157
+ "uw",
158
+ "V",
159
+ "w",
160
+ "y",
161
+ "z",
162
+ "zh",
163
+ ]
164
+ num_en_tones = 4
165
+
166
+ # combine all symbols
167
+ normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
168
+ symbols = [pad] + normal_symbols + pu_symbols
169
+ sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
170
+
171
+ # combine all tones
172
+ num_tones = num_zh_tones + num_ja_tones + num_en_tones
173
+
174
+ # language maps
175
+ language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
176
+ num_languages = len(language_id_map.keys())
177
+
178
+ language_tone_start_map = {
179
+ "ZH": 0,
180
+ "JP": num_zh_tones,
181
+ "EN": num_zh_tones + num_ja_tones,
182
+ }
183
+
184
+ if __name__ == "__main__":
185
+ a = set(zh_symbols)
186
+ b = set(en_symbols)
187
+ print(sorted(a & b))
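
The constants above pack the per-language tone ranges into one contiguous index space: Chinese tones occupy 0-5, Japanese tones 6-7, and English tones 8-11, which is exactly what language_tone_start_map encodes. A small sketch of the resulting offsets, assuming the package is importable (the values follow from the definitions above):

from text.symbols import (
    language_tone_start_map,
    num_tones,
    num_zh_tones,
    num_ja_tones,
    num_en_tones,
    symbols,
)

assert num_tones == num_zh_tones + num_ja_tones + num_en_tones  # 6 + 2 + 4 = 12
assert language_tone_start_map == {"ZH": 0, "JP": 6, "EN": 8}

# a Japanese tone 1 lands at global tone index 6 + 1 = 7
assert language_tone_start_map["JP"] + 1 == 7

# index 0 of the symbol table is the pad symbol
assert symbols[0] == "_"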
text/tone_sandhi.py ADDED
@@ -0,0 +1,776 @@
1
+ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import List
15
+ from typing import Tuple
16
+
17
+ try:
18
+ import jieba_fast as jieba
19
+ except:
20
+ import jieba
21
+ from pypinyin import lazy_pinyin
22
+ from pypinyin import Style
23
+
24
+
25
+ class ToneSandhi:
26
+ def __init__(self):
27
+ self.must_neural_tone_words = {
28
+ "麻烦",
29
+ "麻利",
30
+ "鸳鸯",
31
+ "高粱",
32
+ "骨头",
33
+ "骆驼",
34
+ "马虎",
35
+ "首饰",
36
+ "馒头",
37
+ "馄饨",
38
+ "风筝",
39
+ "难为",
40
+ "队伍",
41
+ "阔气",
42
+ "闺女",
43
+ "门道",
44
+ "锄头",
45
+ "铺盖",
46
+ "铃铛",
47
+ "铁匠",
48
+ "钥匙",
49
+ "里脊",
50
+ "里头",
51
+ "部分",
52
+ "那么",
53
+ "道士",
54
+ "造化",
55
+ "迷糊",
56
+ "连累",
57
+ "这么",
58
+ "这个",
59
+ "运气",
60
+ "过去",
61
+ "软和",
62
+ "转悠",
63
+ "踏实",
64
+ "跳蚤",
65
+ "跟头",
66
+ "趔趄",
67
+ "财主",
68
+ "豆腐",
69
+ "讲究",
70
+ "记性",
71
+ "记号",
72
+ "认识",
73
+ "规矩",
74
+ "见识",
75
+ "裁缝",
76
+ "补丁",
77
+ "衣裳",
78
+ "衣服",
79
+ "衙门",
80
+ "街坊",
81
+ "行李",
82
+ "行当",
83
+ "蛤蟆",
84
+ "蘑菇",
85
+ "薄荷",
86
+ "葫芦",
87
+ "葡萄",
88
+ "萝卜",
89
+ "荸荠",
90
+ "苗条",
91
+ "苗头",
92
+ "苍蝇",
93
+ "芝麻",
94
+ "舒服",
95
+ "舒坦",
96
+ "舌头",
97
+ "自在",
98
+ "膏药",
99
+ "脾气",
100
+ "脑袋",
101
+ "脊梁",
102
+ "能耐",
103
+ "胳膊",
104
+ "胭脂",
105
+ "胡萝",
106
+ "胡琴",
107
+ "胡同",
108
+ "聪明",
109
+ "耽误",
110
+ "耽搁",
111
+ "耷拉",
112
+ "耳朵",
113
+ "老爷",
114
+ "老实",
115
+ "老婆",
116
+ "老头",
117
+ "老太",
118
+ "翻腾",
119
+ "罗嗦",
120
+ "罐头",
121
+ "编辑",
122
+ "结实",
123
+ "红火",
124
+ "累赘",
125
+ "糨糊",
126
+ "糊涂",
127
+ "精神",
128
+ "粮食",
129
+ "簸箕",
130
+ "篱笆",
131
+ "算计",
132
+ "算盘",
133
+ "答应",
134
+ "笤帚",
135
+ "笑语",
136
+ "笑话",
137
+ "窟窿",
138
+ "窝囊",
139
+ "窗户",
140
+ "稳当",
141
+ "稀罕",
142
+ "称呼",
143
+ "秧歌",
144
+ "秀气",
145
+ "秀才",
146
+ "福气",
147
+ "祖宗",
148
+ "砚台",
149
+ "码头",
150
+ "石榴",
151
+ "石头",
152
+ "石匠",
153
+ "知识",
154
+ "眼睛",
155
+ "眯缝",
156
+ "眨巴",
157
+ "眉毛",
158
+ "相声",
159
+ "盘算",
160
+ "白净",
161
+ "痢疾",
162
+ "痛快",
163
+ "疟疾",
164
+ "疙瘩",
165
+ "疏忽",
166
+ "畜生",
167
+ "生意",
168
+ "甘蔗",
169
+ "琵琶",
170
+ "琢磨",
171
+ "琉璃",
172
+ "玻璃",
173
+ "玫瑰",
174
+ "玄乎",
175
+ "狐狸",
176
+ "状元",
177
+ "特务",
178
+ "牲口",
179
+ "牙碜",
180
+ "牌楼",
181
+ "爽快",
182
+ "爱人",
183
+ "热闹",
184
+ "烧饼",
185
+ "烟筒",
186
+ "烂糊",
187
+ "点心",
188
+ "炊帚",
189
+ "灯笼",
190
+ "火候",
191
+ "漂亮",
192
+ "滑溜",
193
+ "溜达",
194
+ "温和",
195
+ "清楚",
196
+ "消息",
197
+ "浪头",
198
+ "活泼",
199
+ "比方",
200
+ "正经",
201
+ "欺负",
202
+ "模糊",
203
+ "槟榔",
204
+ "棺材",
205
+ "棒槌",
206
+ "棉花",
207
+ "核桃",
208
+ "栅栏",
209
+ "柴火",
210
+ "架势",
211
+ "枕头",
212
+ "枇杷",
213
+ "机灵",
214
+ "本事",
215
+ "木头",
216
+ "木匠",
217
+ "朋友",
218
+ "月饼",
219
+ "月亮",
220
+ "暖和",
221
+ "明白",
222
+ "时候",
223
+ "新鲜",
224
+ "故事",
225
+ "收拾",
226
+ "收成",
227
+ "提防",
228
+ "挖苦",
229
+ "挑剔",
230
+ "指甲",
231
+ "指头",
232
+ "拾掇",
233
+ "拳头",
234
+ "拨弄",
235
+ "招牌",
236
+ "招呼",
237
+ "抬举",
238
+ "护士",
239
+ "折腾",
240
+ "扫帚",
241
+ "打量",
242
+ "打算",
243
+ "打点",
244
+ "打扮",
245
+ "打听",
246
+ "打发",
247
+ "扎实",
248
+ "扁担",
249
+ "戒指",
250
+ "懒得",
251
+ "意识",
252
+ "意思",
253
+ "情形",
254
+ "悟性",
255
+ "怪物",
256
+ "思量",
257
+ "怎么",
258
+ "念头",
259
+ "念叨",
260
+ "快活",
261
+ "忙活",
262
+ "志气",
263
+ "心思",
264
+ "得罪",
265
+ "张罗",
266
+ "弟兄",
267
+ "开通",
268
+ "应酬",
269
+ "庄稼",
270
+ "干事",
271
+ "帮手",
272
+ "帐篷",
273
+ "希罕",
274
+ "师父",
275
+ "师傅",
276
+ "巴结",
277
+ "巴掌",
278
+ "差事",
279
+ "工夫",
280
+ "岁数",
281
+ "屁股",
282
+ "尾巴",
283
+ "少爷",
284
+ "小气",
285
+ "小伙",
286
+ "将就",
287
+ "对头",
288
+ "对付",
289
+ "寡妇",
290
+ "家伙",
291
+ "客气",
292
+ "实在",
293
+ "官司",
294
+ "学问",
295
+ "学生",
296
+ "字号",
297
+ "嫁妆",
298
+ "媳妇",
299
+ "媒人",
300
+ "婆家",
301
+ "娘家",
302
+ "委屈",
303
+ "姑娘",
304
+ "姐夫",
305
+ "妯娌",
306
+ "妥当",
307
+ "妖精",
308
+ "奴才",
309
+ "女婿",
310
+ "头发",
311
+ "太阳",
312
+ "大爷",
313
+ "大方",
314
+ "大意",
315
+ "大夫",
316
+ "多少",
317
+ "多么",
318
+ "外甥",
319
+ "壮实",
320
+ "地道",
321
+ "地方",
322
+ "在乎",
323
+ "困难",
324
+ "嘴巴",
325
+ "嘱咐",
326
+ "嘟囔",
327
+ "嘀咕",
328
+ "喜欢",
329
+ "喇嘛",
330
+ "喇叭",
331
+ "商量",
332
+ "唾沫",
333
+ "哑巴",
334
+ "哈欠",
335
+ "哆嗦",
336
+ "咳嗽",
337
+ "和尚",
338
+ "告诉",
339
+ "告示",
340
+ "含糊",
341
+ "吓唬",
342
+ "后头",
343
+ "名字",
344
+ "名堂",
345
+ "合同",
346
+ "吆喝",
347
+ "叫唤",
348
+ "口袋",
349
+ "厚道",
350
+ "厉害",
351
+ "千斤",
352
+ "包袱",
353
+ "包涵",
354
+ "匀称",
355
+ "勤快",
356
+ "动静",
357
+ "动弹",
358
+ "功夫",
359
+ "力气",
360
+ "前头",
361
+ "刺猬",
362
+ "刺激",
363
+ "别扭",
364
+ "利落",
365
+ "利索",
366
+ "利害",
367
+ "分析",
368
+ "出息",
369
+ "凑合",
370
+ "凉快",
371
+ "冷战",
372
+ "冤枉",
373
+ "冒失",
374
+ "养活",
375
+ "关系",
376
+ "先生",
377
+ "兄弟",
378
+ "便宜",
379
+ "使唤",
380
+ "佩服",
381
+ "作坊",
382
+ "体面",
383
+ "位置",
384
+ "似的",
385
+ "伙计",
386
+ "休息",
387
+ "什么",
388
+ "人家",
389
+ "亲戚",
390
+ "亲家",
391
+ "交情",
392
+ "云彩",
393
+ "事情",
394
+ "买卖",
395
+ "主意",
396
+ "丫头",
397
+ "丧气",
398
+ "两口",
399
+ "东西",
400
+ "东家",
401
+ "世故",
402
+ "不由",
403
+ "不在",
404
+ "下水",
405
+ "下巴",
406
+ "上头",
407
+ "上司",
408
+ "丈夫",
409
+ "丈人",
410
+ "一辈",
411
+ "那个",
412
+ "菩萨",
413
+ "父亲",
414
+ "母亲",
415
+ "咕噜",
416
+ "邋遢",
417
+ "费用",
418
+ "冤家",
419
+ "甜头",
420
+ "介绍",
421
+ "荒唐",
422
+ "大人",
423
+ "泥鳅",
424
+ "幸福",
425
+ "熟悉",
426
+ "计划",
427
+ "扑腾",
428
+ "蜡烛",
429
+ "姥爷",
430
+ "照顾",
431
+ "喉咙",
432
+ "吉他",
433
+ "弄堂",
434
+ "蚂蚱",
435
+ "凤凰",
436
+ "拖沓",
437
+ "寒碜",
438
+ "糟蹋",
439
+ "倒腾",
440
+ "报复",
441
+ "逻辑",
442
+ "盘缠",
443
+ "喽啰",
444
+ "牢骚",
445
+ "咖喱",
446
+ "扫把",
447
+ "惦记",
448
+ }
449
+ self.must_not_neural_tone_words = {
450
+ "男子",
451
+ "女子",
452
+ "分子",
453
+ "原子",
454
+ "量子",
455
+ "莲子",
456
+ "石子",
457
+ "瓜子",
458
+ "电子",
459
+ "人人",
460
+ "虎虎",
461
+ }
462
+ self.punc = ":,;。?!“”‘’':,;.?!"
463
+
464
+ # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
465
+ # e.g.
466
+ # word: "家里"
467
+ # pos: "s"
468
+ # finals: ['ia1', 'i3']
469
+ def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
470
+ # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
471
+ for j, item in enumerate(word):
472
+ if (
473
+ j - 1 >= 0
474
+ and item == word[j - 1]
475
+ and pos[0] in {"n", "v", "a"}
476
+ and word not in self.must_not_neural_tone_words
477
+ ):
478
+ finals[j] = finals[j][:-1] + "5"
479
+ ge_idx = word.find("个")
480
+ if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
481
+ finals[-1] = finals[-1][:-1] + "5"
482
+ elif len(word) >= 1 and word[-1] in "的地得":
483
+ finals[-1] = finals[-1][:-1] + "5"
484
+ # e.g. 走了, 看着, 去过
485
+ # elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
486
+ # finals[-1] = finals[-1][:-1] + "5"
487
+ elif (
488
+ len(word) > 1
489
+ and word[-1] in "们子"
490
+ and pos in {"r", "n"}
491
+ and word not in self.must_not_neural_tone_words
492
+ ):
493
+ finals[-1] = finals[-1][:-1] + "5"
494
+ # e.g. 桌上, 地下, 家里
495
+ elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
496
+ finals[-1] = finals[-1][:-1] + "5"
497
+ # e.g. 上来, 下去
498
+ elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
499
+ finals[-1] = finals[-1][:-1] + "5"
500
+ # "个" used as a measure word
501
+ elif (
502
+ ge_idx >= 1
503
+ and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
504
+ ) or word == "个":
505
+ finals[ge_idx] = finals[ge_idx][:-1] + "5"
506
+ else:
507
+ if (
508
+ word in self.must_neural_tone_words
509
+ or word[-2:] in self.must_neural_tone_words
510
+ ):
511
+ finals[-1] = finals[-1][:-1] + "5"
512
+
513
+ word_list = self._split_word(word)
514
+ finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
515
+ for i, word in enumerate(word_list):
516
+ # conventional neutral-tone words in Chinese
517
+ if (
518
+ word in self.must_neural_tone_words
519
+ or word[-2:] in self.must_neural_tone_words
520
+ ):
521
+ finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
522
+ finals = sum(finals_list, [])
523
+ return finals
524
+
525
+ def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
526
+ # e.g. 看不懂
527
+ if len(word) == 3 and word[1] == "不":
528
+ finals[1] = finals[1][:-1] + "5"
529
+ else:
530
+ for i, char in enumerate(word):
531
+ # "不" before tone4 should be bu2, e.g. 不怕
532
+ if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
533
+ finals[i] = finals[i][:-1] + "2"
534
+ return finals
535
+
536
+ def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
537
+ # "一" in number sequences, e.g. 一零零, 二一零
538
+ if word.find("一") != -1 and all(
539
+ [item.isnumeric() for item in word if item != "一"]
540
+ ):
541
+ return finals
542
+ # "一" between reduplication words should be yi5, e.g. 看一看
543
+ elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
544
+ finals[1] = finals[1][:-1] + "5"
545
+ # when "一" is ordinal word, it should be yi1
546
+ elif word.startswith("第一"):
547
+ finals[1] = finals[1][:-1] + "1"
548
+ else:
549
+ for i, char in enumerate(word):
550
+ if char == "一" and i + 1 < len(word):
551
+ # "一" before tone4 should be yi2, e.g. 一段
552
+ if finals[i + 1][-1] == "4":
553
+ finals[i] = finals[i][:-1] + "2"
554
+ # "一" before non-tone4 should be yi4, e.g. 一天
555
+ else:
556
+ # if "一" is followed by punctuation, it keeps tone 1
557
+ if word[i + 1] not in self.punc:
558
+ finals[i] = finals[i][:-1] + "4"
559
+ return finals
560
+
561
+ def _split_word(self, word: str) -> List[str]:
562
+ word_list = jieba.cut_for_search(word)
563
+ word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
564
+ first_subword = word_list[0]
565
+ first_begin_idx = word.find(first_subword)
566
+ if first_begin_idx == 0:
567
+ second_subword = word[len(first_subword) :]
568
+ new_word_list = [first_subword, second_subword]
569
+ else:
570
+ second_subword = word[: -len(first_subword)]
571
+ new_word_list = [second_subword, first_subword]
572
+ return new_word_list
573
+
574
+ def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
575
+ if len(word) == 2 and self._all_tone_three(finals):
576
+ finals[0] = finals[0][:-1] + "2"
577
+ elif len(word) == 3:
578
+ word_list = self._split_word(word)
579
+ if self._all_tone_three(finals):
580
+ # disyllabic + monosyllabic, e.g. 蒙古/包
581
+ if len(word_list[0]) == 2:
582
+ finals[0] = finals[0][:-1] + "2"
583
+ finals[1] = finals[1][:-1] + "2"
584
+ # monosyllabic + disyllabic, e.g. 纸/老虎
585
+ elif len(word_list[0]) == 1:
586
+ finals[1] = finals[1][:-1] + "2"
587
+ else:
588
+ finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
589
+ if len(finals_list) == 2:
590
+ for i, sub in enumerate(finals_list):
591
+ # e.g. 所有/人
592
+ if self._all_tone_three(sub) and len(sub) == 2:
593
+ finals_list[i][0] = finals_list[i][0][:-1] + "2"
594
+ # e.g. 好/喜欢
595
+ elif (
596
+ i == 1
597
+ and not self._all_tone_three(sub)
598
+ and finals_list[i][0][-1] == "3"
599
+ and finals_list[0][-1][-1] == "3"
600
+ ):
601
+ finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
602
+ finals = sum(finals_list, [])
603
+ # split an idiom into two words whose length is 2
604
+ elif len(word) == 4:
605
+ finals_list = [finals[:2], finals[2:]]
606
+ finals = []
607
+ for sub in finals_list:
608
+ if self._all_tone_three(sub):
609
+ sub[0] = sub[0][:-1] + "2"
610
+ finals += sub
611
+
612
+ return finals
613
+
614
+ def _all_tone_three(self, finals: List[str]) -> bool:
615
+ return all(x[-1] == "3" for x in finals)
616
+
617
+ # merge "不" and the word that follows it
618
+ # if not merged, "不" sometimes appears alone in jieba's output, which may cause sandhi errors
619
+ def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
620
+ new_seg = []
621
+ last_word = ""
622
+ for word, pos in seg:
623
+ if last_word == "不":
624
+ word = last_word + word
625
+ if word != "不":
626
+ new_seg.append((word, pos))
627
+ last_word = word[:]
628
+ if last_word == "不":
629
+ new_seg.append((last_word, "d"))
630
+ last_word = ""
631
+ return new_seg
632
+
633
+ # function 1: merge "一" and the reduplicated words on its left and right, e.g. "听","一","听" -> "听一听"
634
+ # function 2: merge single "一" and the word behind it
635
+ # if not merged, "一" sometimes appears alone in jieba's output, which may cause sandhi errors
636
+ # e.g.
637
+ # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
638
+ # output seg: [['听一听', 'v']]
639
+ def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
640
+ new_seg = []
641
+ # function 1
642
+ i = 0
643
+ while i < len(seg):
644
+ word, pos = seg[i]
645
+ if (
646
+ i - 1 >= 0
647
+ and word == "一"
648
+ and i + 1 < len(seg)
649
+ and seg[i - 1][0] == seg[i + 1][0]
650
+ and seg[i - 1][1] == "v"
651
+ ):
652
+ new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
653
+ i += 2
654
+ else:
655
+ if (
656
+ i - 2 >= 0
657
+ and seg[i - 1][0] == "一"
658
+ and seg[i - 2][0] == word
659
+ and pos == "v"
660
+ ):
661
+ continue
662
+ else:
663
+ new_seg.append([word, pos])
664
+ i += 1
665
+ seg = [i for i in new_seg if len(i) > 0]
666
+ new_seg = []
667
+ # function 2
668
+ for i, (word, pos) in enumerate(seg):
669
+ if new_seg and new_seg[-1][0] == "一":
670
+ new_seg[-1][0] = new_seg[-1][0] + word
671
+ else:
672
+ new_seg.append([word, pos])
673
+ return new_seg
674
+
675
+ # merge two consecutive words when both are entirely tone three
676
+ def _merge_continuous_three_tones(
677
+ self, seg: List[Tuple[str, str]]
678
+ ) -> List[Tuple[str, str]]:
679
+ new_seg = []
680
+ sub_finals_list = [
681
+ lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
682
+ for (word, pos) in seg
683
+ ]
684
+ assert len(sub_finals_list) == len(seg)
685
+ merge_last = [False] * len(seg)
686
+ for i, (word, pos) in enumerate(seg):
687
+ if (
688
+ i - 1 >= 0
689
+ and self._all_tone_three(sub_finals_list[i - 1])
690
+ and self._all_tone_three(sub_finals_list[i])
691
+ and not merge_last[i - 1]
692
+ ):
693
+ # if the previous word is a reduplication, do not merge, because reduplications need to go through _neural_sandhi
694
+ if (
695
+ not self._is_reduplication(seg[i - 1][0])
696
+ and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
697
+ ):
698
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
699
+ merge_last[i] = True
700
+ else:
701
+ new_seg.append([word, pos])
702
+ else:
703
+ new_seg.append([word, pos])
704
+
705
+ return new_seg
706
+
707
+ def _is_reduplication(self, word: str) -> bool:
708
+ return len(word) == 2 and word[0] == word[1]
709
+
710
+ # merge when the last char of the first word and the first char of the second word are both tone three
711
+ def _merge_continuous_three_tones_2(
712
+ self, seg: List[Tuple[str, str]]
713
+ ) -> List[Tuple[str, str]]:
714
+ new_seg = []
715
+ sub_finals_list = [
716
+ lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
717
+ for (word, pos) in seg
718
+ ]
719
+ assert len(sub_finals_list) == len(seg)
720
+ merge_last = [False] * len(seg)
721
+ for i, (word, pos) in enumerate(seg):
722
+ if (
723
+ i - 1 >= 0
724
+ and sub_finals_list[i - 1][-1][-1] == "3"
725
+ and sub_finals_list[i][0][-1] == "3"
726
+ and not merge_last[i - 1]
727
+ ):
728
+ # if the previous word is a reduplication, do not merge, because reduplications need to go through _neural_sandhi
729
+ if (
730
+ not self._is_reduplication(seg[i - 1][0])
731
+ and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
732
+ ):
733
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
734
+ merge_last[i] = True
735
+ else:
736
+ new_seg.append([word, pos])
737
+ else:
738
+ new_seg.append([word, pos])
739
+ return new_seg
740
+
741
+ def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
742
+ new_seg = []
743
+ for i, (word, pos) in enumerate(seg):
744
+ if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
745
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
746
+ else:
747
+ new_seg.append([word, pos])
748
+ return new_seg
749
+
750
+ def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
751
+ new_seg = []
752
+ for i, (word, pos) in enumerate(seg):
753
+ if new_seg and word == new_seg[-1][0]:
754
+ new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
755
+ else:
756
+ new_seg.append([word, pos])
757
+ return new_seg
758
+
759
+ def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
760
+ seg = self._merge_bu(seg)
761
+ try:
762
+ seg = self._merge_yi(seg)
763
+ except:
764
+ print("_merge_yi failed")
765
+ seg = self._merge_reduplication(seg)
766
+ seg = self._merge_continuous_three_tones(seg)
767
+ seg = self._merge_continuous_three_tones_2(seg)
768
+ seg = self._merge_er(seg)
769
+ return seg
770
+
771
+ def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
772
+ finals = self._bu_sandhi(word, finals)
773
+ finals = self._yi_sandhi(word, finals)
774
+ finals = self._neural_sandhi(word, pos, finals)
775
+ finals = self._three_sandhi(word, finals)
776
+ return finals
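
ToneSandhi operates on jieba (word, pos) pairs and pypinyin finals: segments are first merged with pre_merge_for_modify so that characters such as "不" and "一" stay attached to their neighbors, then each word's finals are rewritten by modified_tone. A minimal usage sketch, assuming jieba and pypinyin are installed (the actual call sites in this repository's chinese.py may differ):

import jieba.posseg as psg
from pypinyin import lazy_pinyin, Style

from text.tone_sandhi import ToneSandhi

sandhi = ToneSandhi()
text = "我想买一个苹果"

# (word, pos) pairs from jieba, pre-merged for sandhi processing
seg = [(p.word, p.flag) for p in psg.cut(text)]
seg = sandhi.pre_merge_for_modify(seg)

for word, pos in seg:
    finals = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
    finals = sandhi.modified_tone(word, pos, finals)
    print(word, finals)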