JotunnBurton committed on
Commit ae73f54 · verified · 1 Parent(s): e0c20c8

Delete text

text/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- from text.symbols import *
2
-
3
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4
-
5
-
6
- def cleaned_text_to_sequence(cleaned_text, tones, language):
7
- """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8
- Args:
9
- text: string to convert to a sequence
10
- Returns:
11
- List of integers corresponding to the symbols in the text
12
- """
13
- phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14
- tone_start = language_tone_start_map[language]
15
- tones = [i + tone_start for i in tones]
16
- lang_id = language_id_map[language]
17
- lang_ids = [lang_id for i in phones]
18
- return phones, tones, lang_ids
19
-
20
-
21
- def get_bert(norm_text, word2ph, language, device):
22
- from .japanese_bert import get_bert_feature as jp_bert
23
-
24
- lang_bert_func_map = {"JP": jp_bert}
25
- bert = lang_bert_func_map[language](norm_text, word2ph, device)
26
- return bert
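
For reference, the deleted cleaned_text_to_sequence maps each phone symbol to an integer ID, offsets the tones by a per-language start index, and repeats the language ID once per phone. Below is a minimal, self-contained sketch of that mapping; the symbol, tone, and language tables are toy stand-ins for the ones defined in the also-deleted text/symbols.py.

# Toy stand-ins for the tables in the deleted text/symbols.py.
symbols = ["_", "a", "k", "o", "N", "UNK"]
language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
language_tone_start_map = {"ZH": 0, "JP": 6, "EN": 7}

_symbol_to_id = {s: i for i, s in enumerate(symbols)}

def cleaned_text_to_sequence(cleaned_text, tones, language):
    phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
    tone_start = language_tone_start_map[language]
    tones = [t + tone_start for t in tones]
    lang_ids = [language_id_map[language]] * len(phones)
    return phones, tones, lang_ids

print(cleaned_text_to_sequence(["_", "k", "o", "N", "_"], [0, 0, 0, 0, 0], "JP"))
# -> ([0, 2, 3, 4, 0], [6, 6, 6, 6, 6], [1, 1, 1, 1, 1])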
 
text/chinese.py DELETED
@@ -1,198 +0,0 @@
1
- import os
2
- import re
3
-
4
- import cn2an
5
- from pypinyin import lazy_pinyin, Style
6
-
7
- from text.symbols import punctuation
8
- from text.tone_sandhi import ToneSandhi
9
-
10
- current_file_path = os.path.dirname(__file__)
11
- pinyin_to_symbol_map = {
12
- line.split("\t")[0]: line.strip().split("\t")[1]
13
- for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
14
- }
15
-
16
- import jieba.posseg as psg
17
-
18
-
19
- rep_map = {
20
- ":": ",",
21
- ";": ",",
22
- ",": ",",
23
- "。": ".",
24
- "!": "!",
25
- "?": "?",
26
- "\n": ".",
27
- "·": ",",
28
- "、": ",",
29
- "...": "…",
30
- "$": ".",
31
- "“": "'",
32
- "”": "'",
33
- "‘": "'",
34
- "’": "'",
35
- "(": "'",
36
- ")": "'",
37
- "(": "'",
38
- ")": "'",
39
- "《": "'",
40
- "》": "'",
41
- "【": "'",
42
- "】": "'",
43
- "[": "'",
44
- "]": "'",
45
- "—": "-",
46
- "~": "-",
47
- "~": "-",
48
- "「": "'",
49
- "」": "'",
50
- }
51
-
52
- tone_modifier = ToneSandhi()
53
-
54
-
55
- def replace_punctuation(text):
56
- text = text.replace("嗯", "恩").replace("呣", "母")
57
- pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
58
-
59
- replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
60
-
61
- replaced_text = re.sub(
62
- r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
63
- )
64
-
65
- return replaced_text
66
-
67
-
68
- def g2p(text):
69
- pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
70
- sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
71
- phones, tones, word2ph = _g2p(sentences)
72
- assert sum(word2ph) == len(phones)
73
- assert len(word2ph) == len(text)  # This occasionally fails; wrap it in a try/except if needed.
74
- phones = ["_"] + phones + ["_"]
75
- tones = [0] + tones + [0]
76
- word2ph = [1] + word2ph + [1]
77
- return phones, tones, word2ph
78
-
79
-
80
- def _get_initials_finals(word):
81
- initials = []
82
- finals = []
83
- orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
84
- orig_finals = lazy_pinyin(
85
- word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
86
- )
87
- for c, v in zip(orig_initials, orig_finals):
88
- initials.append(c)
89
- finals.append(v)
90
- return initials, finals
91
-
92
-
93
- def _g2p(segments):
94
- phones_list = []
95
- tones_list = []
96
- word2ph = []
97
- for seg in segments:
98
- # Strip all English words from the sentence
99
- seg = re.sub("[a-zA-Z]+", "", seg)
100
- seg_cut = psg.lcut(seg)
101
- initials = []
102
- finals = []
103
- seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104
- for word, pos in seg_cut:
105
- if pos == "eng":
106
- continue
107
- sub_initials, sub_finals = _get_initials_finals(word)
108
- sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
109
- initials.append(sub_initials)
110
- finals.append(sub_finals)
111
-
112
- # assert len(sub_initials) == len(sub_finals) == len(word)
113
- initials = sum(initials, [])
114
- finals = sum(finals, [])
115
- #
116
- for c, v in zip(initials, finals):
117
- raw_pinyin = c + v
118
- # NOTE: post process for pypinyin outputs
119
- # we discriminate i, ii and iii
120
- if c == v:
121
- assert c in punctuation
122
- phone = [c]
123
- tone = "0"
124
- word2ph.append(1)
125
- else:
126
- v_without_tone = v[:-1]
127
- tone = v[-1]
128
-
129
- pinyin = c + v_without_tone
130
- assert tone in "12345"
131
-
132
- if c:
133
- # syllable with an initial
134
- v_rep_map = {
135
- "uei": "ui",
136
- "iou": "iu",
137
- "uen": "un",
138
- }
139
- if v_without_tone in v_rep_map.keys():
140
- pinyin = c + v_rep_map[v_without_tone]
141
- else:
142
- # syllable without an initial
143
- pinyin_rep_map = {
144
- "ing": "ying",
145
- "i": "yi",
146
- "in": "yin",
147
- "u": "wu",
148
- }
149
- if pinyin in pinyin_rep_map.keys():
150
- pinyin = pinyin_rep_map[pinyin]
151
- else:
152
- single_rep_map = {
153
- "v": "yu",
154
- "e": "e",
155
- "i": "y",
156
- "u": "w",
157
- }
158
- if pinyin[0] in single_rep_map.keys():
159
- pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160
-
161
- assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162
- phone = pinyin_to_symbol_map[pinyin].split(" ")
163
- word2ph.append(len(phone))
164
-
165
- phones_list += phone
166
- tones_list += [int(tone)] * len(phone)
167
- return phones_list, tones_list, word2ph
168
-
169
-
170
- def text_normalize(text):
171
- numbers = re.findall(r"\d+(?:\.?\d+)?", text)
172
- for number in numbers:
173
- text = text.replace(number, cn2an.an2cn(number), 1)
174
- text = replace_punctuation(text)
175
- return text
176
-
177
-
178
- def get_bert_feature(text, word2ph):
179
- from text import chinese_bert
180
-
181
- return chinese_bert.get_bert_feature(text, word2ph)
182
-
183
-
184
- if __name__ == "__main__":
185
- from text.chinese_bert import get_bert_feature
186
-
187
- text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
188
- text = text_normalize(text)
189
- print(text)
190
- phones, tones, word2ph = g2p(text)
191
- bert = get_bert_feature(text, word2ph)
192
-
193
- print(phones, tones, word2ph, bert.shape)
194
-
195
-
196
- # # Example usage
197
- # text = "这是一个示例文本:,你好!这是一个测试...."
198
- # print(g2p_paddle(text))  # output: 这是一个示例文本你好这是一个测试
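
The key step in the deleted _g2p above is splitting each pypinyin final such as "ong1" into a toneless final plus a tone digit, then looking the recombined pinyin up in opencpop-strict.txt. A toy sketch of that split, with a two-entry stand-in for the real symbol table:

# Two-entry stand-in for the map built from text/opencpop-strict.txt.
pinyin_to_symbol_map = {"zhong": "zh ong", "guo": "g uo"}

def split_final(c, v):
    # v looks like "ong1": the last character is the tone, the rest the toneless final
    v_without_tone, tone = v[:-1], v[-1]
    assert tone in "12345"
    phones = pinyin_to_symbol_map[c + v_without_tone].split(" ")
    return phones, int(tone)

print(split_final("zh", "ong1"))  # (['zh', 'ong'], 1)
print(split_final("g", "uo2"))    # (['g', 'uo'], 2)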
 
text/chinese_bert.py DELETED
@@ -1,100 +0,0 @@
1
- import torch
2
- import sys
3
- from transformers import AutoTokenizer, AutoModelForMaskedLM
4
-
5
- tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
6
-
7
- models = dict()
8
-
9
-
10
- def get_bert_feature(text, word2ph, device=None):
11
- if (
12
- sys.platform == "darwin"
13
- and torch.backends.mps.is_available()
14
- and device == "cpu"
15
- ):
16
- device = "mps"
17
- if not device:
18
- device = "cuda"
19
- if device not in models.keys():
20
- models[device] = AutoModelForMaskedLM.from_pretrained(
21
- "hfl/chinese-roberta-wwm-ext-large"
22
- ).to(device)
23
- with torch.no_grad():
24
- inputs = tokenizer(text, return_tensors="pt")
25
- for i in inputs:
26
- inputs[i] = inputs[i].to(device)
27
- res = models[device](**inputs, output_hidden_states=True)
28
- res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
29
-
30
- assert len(word2ph) == len(text) + 2
31
- word2phone = word2ph
32
- phone_level_feature = []
33
- for i in range(len(word2phone)):
34
- repeat_feature = res[i].repeat(word2phone[i], 1)
35
- phone_level_feature.append(repeat_feature)
36
-
37
- phone_level_feature = torch.cat(phone_level_feature, dim=0)
38
-
39
- return phone_level_feature.T
40
-
41
-
42
- if __name__ == "__main__":
43
- import torch
44
-
45
- word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征
46
- word2phone = [
47
- 1,
48
- 2,
49
- 1,
50
- 2,
51
- 2,
52
- 1,
53
- 2,
54
- 2,
55
- 1,
56
- 2,
57
- 2,
58
- 1,
59
- 2,
60
- 2,
61
- 2,
62
- 2,
63
- 2,
64
- 1,
65
- 1,
66
- 2,
67
- 2,
68
- 1,
69
- 2,
70
- 2,
71
- 2,
72
- 2,
73
- 1,
74
- 2,
75
- 2,
76
- 2,
77
- 2,
78
- 2,
79
- 1,
80
- 2,
81
- 2,
82
- 2,
83
- 2,
84
- 1,
85
- ]
86
-
87
- # 计算总帧数
88
- total_frames = sum(word2phone)
89
- print(word_level_feature.shape)
90
- print(word2phone)
91
- phone_level_feature = []
92
- for i in range(len(word2phone)):
93
- print(word_level_feature[i].shape)
94
-
95
- # 对每个词重复word2phone[i]次
96
- repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
97
- phone_level_feature.append(repeat_feature)
98
-
99
- phone_level_feature = torch.cat(phone_level_feature, dim=0)
100
- print(phone_level_feature.shape) # torch.Size([36, 1024])
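
The essential operation in the deleted get_bert_feature is expanding word-level BERT features to phone level by repeating each word vector word2ph[i] times. A minimal torch sketch of just that expansion, using random features in place of a real hfl/chinese-roberta-wwm-ext-large forward pass:

import torch

def expand_to_phone_level(word_features, word2ph):
    # word_features: (n_words, dim); row i is repeated word2ph[i] times
    pieces = [word_features[i].repeat(word2ph[i], 1) for i in range(len(word2ph))]
    return torch.cat(pieces, dim=0).T  # (dim, sum(word2ph)), as in the deleted code

feats = torch.rand(3, 1024)
print(expand_to_phone_level(feats, [1, 2, 3]).shape)  # torch.Size([1024, 6])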
 
text/cleaner.py DELETED
@@ -1,28 +0,0 @@
1
- from text import chinese, japanese, cleaned_text_to_sequence
2
-
3
-
4
- language_module_map = {"ZH": chinese, "JP": japanese}
5
-
6
-
7
- def clean_text(text, language):
8
- language_module = language_module_map[language]
9
- norm_text = language_module.text_normalize(text)
10
- phones, tones, word2ph = language_module.g2p(norm_text)
11
- return norm_text, phones, tones, word2ph
12
-
13
-
14
- def clean_text_bert(text, language):
15
- language_module = language_module_map[language]
16
- norm_text = language_module.text_normalize(text)
17
- phones, tones, word2ph = language_module.g2p(norm_text)
18
- bert = language_module.get_bert_feature(norm_text, word2ph)
19
- return phones, tones, bert
20
-
21
-
22
- def text_to_sequence(text, language):
23
- norm_text, phones, tones, word2ph = clean_text(text, language)
24
- return cleaned_text_to_sequence(phones, tones, language)
25
-
26
-
27
- if __name__ == "__main__":
28
- pass
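
For reference, callers typically drove the deleted front end roughly as below. This is a sketch only; it runs solely against the full, now-removed text package and its dictionary/BERT assets.

# Sketch only: depends on the deleted `text` package and its assets.
from text.cleaner import clean_text
from text import cleaned_text_to_sequence

norm_text, phones, tones, word2ph = clean_text("こんにちは、世界!", "JP")
phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "JP")
print(norm_text, phone_ids, tone_ids, lang_ids)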
 
text/cmudict.rep DELETED
The diff for this file is too large to render. See raw diff
 
text/cmudict_cache.pickle DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
3
- size 6212655
 
 
 
 
text/english.py DELETED
@@ -1,214 +0,0 @@
1
- import pickle
2
- import os
3
- import re
4
- from g2p_en import G2p
5
-
6
- from text import symbols
7
-
8
- current_file_path = os.path.dirname(__file__)
9
- CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
10
- CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
11
- _g2p = G2p()
12
-
13
- arpa = {
14
- "AH0",
15
- "S",
16
- "AH1",
17
- "EY2",
18
- "AE2",
19
- "EH0",
20
- "OW2",
21
- "UH0",
22
- "NG",
23
- "B",
24
- "G",
25
- "AY0",
26
- "M",
27
- "AA0",
28
- "F",
29
- "AO0",
30
- "ER2",
31
- "UH1",
32
- "IY1",
33
- "AH2",
34
- "DH",
35
- "IY0",
36
- "EY1",
37
- "IH0",
38
- "K",
39
- "N",
40
- "W",
41
- "IY2",
42
- "T",
43
- "AA1",
44
- "ER1",
45
- "EH2",
46
- "OY0",
47
- "UH2",
48
- "UW1",
49
- "Z",
50
- "AW2",
51
- "AW1",
52
- "V",
53
- "UW2",
54
- "AA2",
55
- "ER",
56
- "AW0",
57
- "UW0",
58
- "R",
59
- "OW1",
60
- "EH1",
61
- "ZH",
62
- "AE0",
63
- "IH2",
64
- "IH",
65
- "Y",
66
- "JH",
67
- "P",
68
- "AY1",
69
- "EY0",
70
- "OY2",
71
- "TH",
72
- "HH",
73
- "D",
74
- "ER0",
75
- "CH",
76
- "AO1",
77
- "AE1",
78
- "AO2",
79
- "OY1",
80
- "AY2",
81
- "IH1",
82
- "OW0",
83
- "L",
84
- "SH",
85
- }
86
-
87
-
88
- def post_replace_ph(ph):
89
- rep_map = {
90
- ":": ",",
91
- ";": ",",
92
- ",": ",",
93
- "。": ".",
94
- "!": "!",
95
- "?": "?",
96
- "\n": ".",
97
- "·": ",",
98
- "、": ",",
99
- "...": "…",
100
- "v": "V",
101
- }
102
- if ph in rep_map.keys():
103
- ph = rep_map[ph]
104
- if ph in symbols:
105
- return ph
106
- if ph not in symbols:
107
- ph = "UNK"
108
- return ph
109
-
110
-
111
- def read_dict():
112
- g2p_dict = {}
113
- start_line = 49
114
- with open(CMU_DICT_PATH) as f:
115
- line = f.readline()
116
- line_index = 1
117
- while line:
118
- if line_index >= start_line:
119
- line = line.strip()
120
- word_split = line.split(" ")
121
- word = word_split[0]
122
-
123
- syllable_split = word_split[1].split(" - ")
124
- g2p_dict[word] = []
125
- for syllable in syllable_split:
126
- phone_split = syllable.split(" ")
127
- g2p_dict[word].append(phone_split)
128
-
129
- line_index = line_index + 1
130
- line = f.readline()
131
-
132
- return g2p_dict
133
-
134
-
135
- def cache_dict(g2p_dict, file_path):
136
- with open(file_path, "wb") as pickle_file:
137
- pickle.dump(g2p_dict, pickle_file)
138
-
139
-
140
- def get_dict():
141
- if os.path.exists(CACHE_PATH):
142
- with open(CACHE_PATH, "rb") as pickle_file:
143
- g2p_dict = pickle.load(pickle_file)
144
- else:
145
- g2p_dict = read_dict()
146
- cache_dict(g2p_dict, CACHE_PATH)
147
-
148
- return g2p_dict
149
-
150
-
151
- eng_dict = get_dict()
152
-
153
-
154
- def refine_ph(phn):
155
- tone = 0
156
- if re.search(r"\d$", phn):
157
- tone = int(phn[-1]) + 1
158
- phn = phn[:-1]
159
- return phn.lower(), tone
160
-
161
-
162
- def refine_syllables(syllables):
163
- tones = []
164
- phonemes = []
165
- for phn_list in syllables:
166
- for i in range(len(phn_list)):
167
- phn = phn_list[i]
168
- phn, tone = refine_ph(phn)
169
- phonemes.append(phn)
170
- tones.append(tone)
171
- return phonemes, tones
172
-
173
-
174
- def text_normalize(text):
175
- # todo: eng text normalize
176
- return text
177
-
178
-
179
- def g2p(text):
180
- phones = []
181
- tones = []
182
- words = re.split(r"([,;.\-\?\!\s+])", text)
183
- for w in words:
184
- if w.upper() in eng_dict:
185
- phns, tns = refine_syllables(eng_dict[w.upper()])
186
- phones += phns
187
- tones += tns
188
- else:
189
- phone_list = list(filter(lambda p: p != " ", _g2p(w)))
190
- for ph in phone_list:
191
- if ph in arpa:
192
- ph, tn = refine_ph(ph)
193
- phones.append(ph)
194
- tones.append(tn)
195
- else:
196
- phones.append(ph)
197
- tones.append(0)
198
- # todo: implement word2ph
199
- word2ph = [1 for i in phones]
200
-
201
- phones = [post_replace_ph(i) for i in phones]
202
- return phones, tones, word2ph
203
-
204
-
205
- if __name__ == "__main__":
206
- # print(get_dict())
207
- # print(eng_word_to_phoneme("hello"))
208
- print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
209
- # all_phones = set()
210
- # for k, syllables in eng_dict.items():
211
- # for group in syllables:
212
- # for ph in group:
213
- # all_phones.add(ph)
214
- # print(all_phones)
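
The deleted refine_ph converts an ARPAbet phone into a lower-case symbol plus a tone: a trailing stress digit 0/1/2 becomes tone 1/2/3, and phones without a digit keep tone 0. A standalone copy of that helper:

import re

def refine_ph(phn):
    tone = 0
    if re.search(r"\d$", phn):
        tone = int(phn[-1]) + 1
        phn = phn[:-1]
    return phn.lower(), tone

print(refine_ph("AH0"))  # ('ah', 1)
print(refine_ph("EY2"))  # ('ey', 3)
print(refine_ph("NG"))   # ('ng', 0)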
 
text/english_bert_mock.py DELETED
@@ -1,5 +0,0 @@
1
- import torch
2
-
3
-
4
- def get_bert_feature(norm_text, word2ph):
5
- return torch.zeros(1024, sum(word2ph))
 
text/japanese.py DELETED
@@ -1,704 +0,0 @@
1
- # Convert Japanese text to phonemes which is
2
- # compatible with Julius https://github.com/julius-speech/segmentation-kit
3
- import re
4
- import unicodedata
5
-
6
- from transformers import AutoTokenizer
7
-
8
- from text import punctuation, symbols
9
-
10
- try:
11
- import MeCab
12
- except ImportError as e:
13
- raise ImportError("Japanese requires mecab-python3 and unidic-lite.") from e
14
- from num2words import num2words
15
-
16
- _CONVRULES = [
17
- # Conversion of 2 letters
18
- "アァ/ a a",
19
- "イィ/ i i",
20
- "イェ/ i e",
21
- "イャ/ y a",
22
- "ウゥ/ u:",
23
- "エェ/ e e",
24
- "オォ/ o:",
25
- "カァ/ k a:",
26
- "キィ/ k i:",
27
- "クゥ/ k u:",
28
- "クャ/ ky a",
29
- "クュ/ ky u",
30
- "クョ/ ky o",
31
- "ケェ/ k e:",
32
- "コォ/ k o:",
33
- "ガァ/ g a:",
34
- "ギィ/ g i:",
35
- "グゥ/ g u:",
36
- "グャ/ gy a",
37
- "グュ/ gy u",
38
- "グョ/ gy o",
39
- "ゲェ/ g e:",
40
- "ゴォ/ g o:",
41
- "サァ/ s a:",
42
- "シィ/ sh i:",
43
- "スゥ/ s u:",
44
- "スャ/ sh a",
45
- "スュ/ sh u",
46
- "スョ/ sh o",
47
- "セェ/ s e:",
48
- "ソォ/ s o:",
49
- "ザァ/ z a:",
50
- "ジィ/ j i:",
51
- "ズゥ/ z u:",
52
- "ズャ/ zy a",
53
- "ズュ/ zy u",
54
- "ズョ/ zy o",
55
- "ゼェ/ z e:",
56
- "ゾォ/ z o:",
57
- "タァ/ t a:",
58
- "チィ/ ch i:",
59
- "ツァ/ ts a",
60
- "ツィ/ ts i",
61
- "ツゥ/ ts u:",
62
- "ツャ/ ch a",
63
- "ツュ/ ch u",
64
- "ツョ/ ch o",
65
- "ツェ/ ts e",
66
- "ツォ/ ts o",
67
- "テェ/ t e:",
68
- "トォ/ t o:",
69
- "ダァ/ d a:",
70
- "ヂィ/ j i:",
71
- "ヅゥ/ d u:",
72
- "ヅャ/ zy a",
73
- "ヅュ/ zy u",
74
- "ヅョ/ zy o",
75
- "デェ/ d e:",
76
- "ドォ/ d o:",
77
- "ナァ/ n a:",
78
- "ニィ/ n i:",
79
- "ヌゥ/ n u:",
80
- "ヌャ/ ny a",
81
- "ヌュ/ ny u",
82
- "ヌョ/ ny o",
83
- "ネェ/ n e:",
84
- "ノォ/ n o:",
85
- "ハァ/ h a:",
86
- "ヒィ/ h i:",
87
- "フゥ/ f u:",
88
- "フャ/ hy a",
89
- "フュ/ hy u",
90
- "フョ/ hy o",
91
- "ヘェ/ h e:",
92
- "ホォ/ h o:",
93
- "バァ/ b a:",
94
- "ビィ/ b i:",
95
- "ブゥ/ b u:",
96
- "フャ/ hy a",
97
- "ブュ/ by u",
98
- "フョ/ hy o",
99
- "ベェ/ b e:",
100
- "ボォ/ b o:",
101
- "パァ/ p a:",
102
- "ピィ/ p i:",
103
- "プゥ/ p u:",
104
- "プャ/ py a",
105
- "プュ/ py u",
106
- "プョ/ py o",
107
- "ペェ/ p e:",
108
- "ポォ/ p o:",
109
- "マァ/ m a:",
110
- "ミィ/ m i:",
111
- "ムゥ/ m u:",
112
- "ムャ/ my a",
113
- "ムュ/ my u",
114
- "ムョ/ my o",
115
- "メェ/ m e:",
116
- "モォ/ m o:",
117
- "ヤァ/ y a:",
118
- "ユゥ/ y u:",
119
- "ユャ/ y a:",
120
- "ユュ/ y u:",
121
- "ユョ/ y o:",
122
- "ヨォ/ y o:",
123
- "ラァ/ r a:",
124
- "リィ/ r i:",
125
- "ルゥ/ r u:",
126
- "ルャ/ ry a",
127
- "ルュ/ ry u",
128
- "ルョ/ ry o",
129
- "レェ/ r e:",
130
- "ロォ/ r o:",
131
- "ワァ/ w a:",
132
- "ヲォ/ o:",
133
- "ディ/ d i",
134
- "デェ/ d e:",
135
- "デャ/ dy a",
136
- "デュ/ dy u",
137
- "デョ/ dy o",
138
- "ティ/ t i",
139
- "テェ/ t e:",
140
- "テャ/ ty a",
141
- "テュ/ ty u",
142
- "テョ/ ty o",
143
- "スィ/ s i",
144
- "ズァ/ z u a",
145
- "ズィ/ z i",
146
- "ズゥ/ z u",
147
- "ズャ/ zy a",
148
- "ズュ/ zy u",
149
- "ズョ/ zy o",
150
- "ズェ/ z e",
151
- "ズォ/ z o",
152
- "キャ/ ky a",
153
- "キュ/ ky u",
154
- "キョ/ ky o",
155
- "シャ/ sh a",
156
- "シュ/ sh u",
157
- "シェ/ sh e",
158
- "ショ/ sh o",
159
- "チャ/ ch a",
160
- "チュ/ ch u",
161
- "チェ/ ch e",
162
- "チョ/ ch o",
163
- "トゥ/ t u",
164
- "トャ/ ty a",
165
- "トュ/ ty u",
166
- "トョ/ ty o",
167
- "ドァ/ d o a",
168
- "ドゥ/ d u",
169
- "ドャ/ dy a",
170
- "ドュ/ dy u",
171
- "ドョ/ dy o",
172
- "ドォ/ d o:",
173
- "ニャ/ ny a",
174
- "ニュ/ ny u",
175
- "ニョ/ ny o",
176
- "ヒャ/ hy a",
177
- "ヒュ/ hy u",
178
- "ヒョ/ hy o",
179
- "ミャ/ my a",
180
- "ミュ/ my u",
181
- "ミョ/ my o",
182
- "リャ/ ry a",
183
- "リュ/ ry u",
184
- "リョ/ ry o",
185
- "ギャ/ gy a",
186
- "ギュ/ gy u",
187
- "ギョ/ gy o",
188
- "ヂェ/ j e",
189
- "ヂャ/ j a",
190
- "ヂュ/ j u",
191
- "ヂョ/ j o",
192
- "ジェ/ j e",
193
- "ジャ/ j a",
194
- "ジュ/ j u",
195
- "ジョ/ j o",
196
- "ビャ/ by a",
197
- "ビュ/ by u",
198
- "ビョ/ by o",
199
- "ピャ/ py a",
200
- "ピュ/ py u",
201
- "ピョ/ py o",
202
- "ウァ/ u a",
203
- "ウィ/ w i",
204
- "ウェ/ w e",
205
- "ウォ/ w o",
206
- "ファ/ f a",
207
- "フィ/ f i",
208
- "フゥ/ f u",
209
- "フャ/ hy a",
210
- "フュ/ hy u",
211
- "フョ/ hy o",
212
- "フェ/ f e",
213
- "フォ/ f o",
214
- "ヴァ/ b a",
215
- "ヴィ/ b i",
216
- "ヴェ/ b e",
217
- "ヴォ/ b o",
218
- "ヴュ/ by u",
219
- "アー/ a:",
220
- "イー/ i:",
221
- "ウー/ u:",
222
- "エー/ e:",
223
- "オー/ o:",
224
- "カー/ k a:",
225
- "キー/ k i:",
226
- "クー/ k u:",
227
- "ケー/ k e:",
228
- "コー/ k o:",
229
- "サー/ s a:",
230
- "シー/ sh i:",
231
- "スー/ s u:",
232
- "セー/ s e:",
233
- "ソー/ s o:",
234
- "ター/ t a:",
235
- "チー/ ch i:",
236
- "ツー/ ts u:",
237
- "テー/ t e:",
238
- "トー/ t o:",
239
- "ナー/ n a:",
240
- "ニー/ n i:",
241
- "ヌ���/ n u:",
242
- "ネー/ n e:",
243
- "ノー/ n o:",
244
- "ハー/ h a:",
245
- "ヒー/ h i:",
246
- "フー/ f u:",
247
- "ヘー/ h e:",
248
- "ホー/ h o:",
249
- "マー/ m a:",
250
- "ミー/ m i:",
251
- "ムー/ m u:",
252
- "メー/ m e:",
253
- "モー/ m o:",
254
- "ラー/ r a:",
255
- "リー/ r i:",
256
- "ルー/ r u:",
257
- "レー/ r e:",
258
- "ロー/ r o:",
259
- "ガー/ g a:",
260
- "ギー/ g i:",
261
- "グー/ g u:",
262
- "ゲー/ g e:",
263
- "ゴー/ g o:",
264
- "ザー/ z a:",
265
- "ジー/ j i:",
266
- "ズー/ z u:",
267
- "ゼー/ z e:",
268
- "ゾー/ z o:",
269
- "ダー/ d a:",
270
- "ヂー/ j i:",
271
- "ヅー/ z u:",
272
- "デー/ d e:",
273
- "ドー/ d o:",
274
- "バー/ b a:",
275
- "ビー/ b i:",
276
- "ブー/ b u:",
277
- "ベー/ b e:",
278
- "ボー/ b o:",
279
- "パー/ p a:",
280
- "ピー/ p i:",
281
- "プー/ p u:",
282
- "ペー/ p e:",
283
- "ポー/ p o:",
284
- "ヤー/ y a:",
285
- "ユー/ y u:",
286
- "ヨー/ y o:",
287
- "ワー/ w a:",
288
- "ヰー/ i:",
289
- "ヱー/ e:",
290
- "ヲー/ o:",
291
- "ヴー/ b u:",
292
- # Conversion of 1 letter
293
- "ア/ a",
294
- "イ/ i",
295
- "ウ/ u",
296
- "エ/ e",
297
- "オ/ o",
298
- "カ/ k a",
299
- "キ/ k i",
300
- "ク/ k u",
301
- "ケ/ k e",
302
- "コ/ k o",
303
- "サ/ s a",
304
- "シ/ sh i",
305
- "ス/ s u",
306
- "セ/ s e",
307
- "ソ/ s o",
308
- "タ/ t a",
309
- "チ/ ch i",
310
- "ツ/ ts u",
311
- "テ/ t e",
312
- "ト/ t o",
313
- "ナ/ n a",
314
- "ニ/ n i",
315
- "ヌ/ n u",
316
- "ネ/ n e",
317
- "ノ/ n o",
318
- "ハ/ h a",
319
- "ヒ/ h i",
320
- "フ/ f u",
321
- "ヘ/ h e",
322
- "ホ/ h o",
323
- "マ/ m a",
324
- "ミ/ m i",
325
- "ム/ m u",
326
- "メ/ m e",
327
- "モ/ m o",
328
- "ラ/ r a",
329
- "リ/ r i",
330
- "ル/ r u",
331
- "レ/ r e",
332
- "ロ/ r o",
333
- "ガ/ g a",
334
- "ギ/ g i",
335
- "グ/ g u",
336
- "ゲ/ g e",
337
- "ゴ/ g o",
338
- "ザ/ z a",
339
- "ジ/ j i",
340
- "ズ/ z u",
341
- "ゼ/ z e",
342
- "ゾ/ z o",
343
- "ダ/ d a",
344
- "ヂ/ j i",
345
- "ヅ/ z u",
346
- "デ/ d e",
347
- "ド/ d o",
348
- "バ/ b a",
349
- "ビ/ b i",
350
- "ブ/ b u",
351
- "ベ/ b e",
352
- "ボ/ b o",
353
- "パ/ p a",
354
- "ピ/ p i",
355
- "プ/ p u",
356
- "ペ/ p e",
357
- "ポ/ p o",
358
- "ヤ/ y a",
359
- "ユ/ y u",
360
- "ヨ/ y o",
361
- "ワ/ w a",
362
- "ヰ/ i",
363
- "ヱ/ e",
364
- "ヲ/ o",
365
- "ン/ N",
366
- "ッ/ q",
367
- "ヴ/ b u",
368
- "ー/:", #这个不起作用
369
- # Try converting broken text
370
- "ァ/ a",
371
- "ィ/ i",
372
- "ゥ/ u",
373
- "ェ/ e",
374
- "ォ/ o",
375
- "ヮ/ w a",
376
- "ォ/ o",
377
- # Symbols
378
- "、/ ,",
379
- "。/ .",
380
- "!/ !",
381
- "?/ ?",
382
- "・/ ,",
383
- ]
384
-
385
- _COLON_RX = re.compile(":+")
386
- _REJECT_RX = re.compile("[^ a-zA-Z:,.?]")
387
-
388
-
389
- def _makerulemap():
390
- l = [tuple(x.split("/")) for x in _CONVRULES]
391
- return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2))
392
-
393
-
394
- _RULEMAP1, _RULEMAP2 = _makerulemap()
395
-
396
-
397
- def kata2phoneme(text: str) -> str:
398
- """Convert katakana text to phonemes."""
399
- text = text.strip()
400
- res = []
401
- while text:
402
- if len(text) >= 2:
403
- x = _RULEMAP2.get(text[:2])
404
- if x is not None:
405
- text = text[2:]
406
- res += x.split(" ")[1:]
407
- continue
408
- x = _RULEMAP1.get(text[0])
409
- if x is not None:
410
- text = text[1:]
411
- res += x.split(" ")[1:]
412
- continue
413
- res.append(text[0])
414
- text = text[1:]
415
- # res = _COLON_RX.sub(":", res)
416
- return res
417
-
418
-
419
- _KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1))
420
- _HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1))
421
- _HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
422
-
423
-
424
- def hira2kata(text: str) -> str:
425
- text = text.translate(_HIRA2KATATRANS)
426
- return text.replace("う゛", "ヴ")
427
-
428
-
429
- _SYMBOL_TOKENS = set(list("・、。?!"))
430
- _NO_YOMI_TOKENS = set(list("「」『』―()[][]"))
431
- _TAGGER = MeCab.Tagger()
432
-
433
-
434
- def text2kata(text: str) -> str:
435
- parsed = _TAGGER.parse(text)
436
- res = []
437
- for line in parsed.split("\n"):
438
- if line == "EOS":
439
- break
440
- parts = line.split("\t")
441
-
442
- word, yomi = parts[0], parts[1]
443
- if yomi:
444
- res.append(yomi)
445
- else:
446
- if word in _SYMBOL_TOKENS:
447
- res.append(word)
448
- elif word in ("っ", "ッ"):
449
- res.append("ッ")
450
- elif word in _NO_YOMI_TOKENS:
451
- pass
452
- else:
453
- res.append(word)
454
- return hira2kata("".join(res))
455
-
456
-
457
- def text2sep_kata(text: str) -> (list, list):
458
- parsed = _TAGGER.parse(text)
459
- res = []
460
- sep = []
461
- for line in parsed.split("\n"):
462
- if line == "EOS":
463
- break
464
- parts = line.split("\t")
465
-
466
- word, yomi = parts[0], parts[1]
467
- if yomi:
468
- res.append(yomi)
469
- else:
470
- if word in _SYMBOL_TOKENS:
471
- res.append(word)
472
- elif word in ("っ", "ッ"):
473
- res.append("ッ")
474
- elif word in _NO_YOMI_TOKENS:
475
- pass
476
- else:
477
- res.append(word)
478
- sep.append(word)
479
- return sep, [hira2kata(i) for i in res]
480
-
481
-
482
- _ALPHASYMBOL_YOMI = {
483
- "#": "シャープ",
484
- "%": "パーセント",
485
- "&": "アンド",
486
- "+": "プラス",
487
- "-": "マイナス",
488
- ":": "コロン",
489
- ";": "セミコロン",
490
- "<": "小なり",
491
- "=": "イコール",
492
- ">": "大なり",
493
- "@": "アット",
494
- "a": "エー",
495
- "b": "ビー",
496
- "c": "シー",
497
- "d": "ディー",
498
- "e": "イー",
499
- "f": "エフ",
500
- "g": "ジー",
501
- "h": "エイチ",
502
- "i": "アイ",
503
- "j": "ジェー",
504
- "k": "ケー",
505
- "l": "エル",
506
- "m": "エム",
507
- "n": "エヌ",
508
- "o": "オー",
509
- "p": "ピー",
510
- "q": "キュー",
511
- "r": "アール",
512
- "s": "エス",
513
- "t": "ティー",
514
- "u": "ユー",
515
- "v": "ブイ",
516
- "w": "ダブリュー",
517
- "x": "エックス",
518
- "y": "ワイ",
519
- "z": "ゼット",
520
- "α": "アルファ",
521
- "β": "ベータ",
522
- "γ": "ガンマ",
523
- "δ": "デルタ",
524
- "ε": "イプシロン",
525
- "ζ": "ゼータ",
526
- "η": "イータ",
527
- "θ": "シータ",
528
- "ι": "イオタ",
529
- "κ": "カッパ",
530
- "λ": "ラムダ",
531
- "μ": "ミュー",
532
- "ν": "ニュー",
533
- "ξ": "クサイ",
534
- "ο": "オミクロン",
535
- "π": "パイ",
536
- "ρ": "ロー",
537
- "σ": "シグマ",
538
- "τ": "タウ",
539
- "υ": "ウプシロン",
540
- "φ": "ファイ",
541
- "χ": "カイ",
542
- "ψ": "プサイ",
543
- "ω": "オメガ",
544
- }
545
-
546
-
547
- _NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
548
- _CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
549
- _CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
550
- _NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
551
-
552
-
553
- def japanese_convert_numbers_to_words(text: str) -> str:
554
- res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
555
- res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
556
- res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
557
- return res
558
-
559
-
560
- def japanese_convert_alpha_symbols_to_words(text: str) -> str:
561
- return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])
562
-
563
-
564
- def japanese_text_to_phonemes(text: str) -> str:
565
- """Convert Japanese text to phonemes."""
566
- res = unicodedata.normalize("NFKC", text)
567
- res = japanese_convert_numbers_to_words(res)
568
- # res = japanese_convert_alpha_symbols_to_words(res)
569
- res = text2kata(res)
570
- res = kata2phoneme(res)
571
- return res
572
-
573
-
574
- def is_japanese_character(char):
575
- # Unicode ranges covering the Japanese writing system
576
- japanese_ranges = [
577
- (0x3040, 0x309F), # hiragana
578
- (0x30A0, 0x30FF), # katakana
579
- (0x4E00, 0x9FFF), # kanji (CJK Unified Ideographs)
580
- (0x3400, 0x4DBF), # CJK Extension A
581
- (0x20000, 0x2A6DF), # CJK Extension B
582
- # further CJK extension ranges can be added as needed
583
- ]
584
-
585
- # convert the character to its Unicode code point
586
- char_code = ord(char)
587
-
588
- # check whether the character falls within any Japanese range
589
- for start, end in japanese_ranges:
590
- if start <= char_code <= end:
591
- return True
592
-
593
- return False
594
-
595
-
596
- rep_map = {
597
- ":": ",",
598
- ";": ",",
599
- ",": ",",
600
- "。": ".",
601
- "!": "!",
602
- "?": "?",
603
- "\n": ".",
604
- "·": ",",
605
- "、": ",",
606
- "…": "...",
607
- }
608
-
609
-
610
- def replace_punctuation(text):
611
- pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
612
-
613
- replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
614
-
615
- replaced_text = re.sub(
616
- r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF"
617
- + "".join(punctuation)
618
- + r"]+",
619
- "",
620
- replaced_text,
621
- )
622
-
623
- return replaced_text
624
-
625
-
626
- def text_normalize(text):
627
- res = unicodedata.normalize("NFKC", text)
628
- res = japanese_convert_numbers_to_words(res)
629
- # res = "".join([i for i in res if is_japanese_character(i)])
630
- res = replace_punctuation(res)
631
- return res
632
-
633
-
634
- def distribute_phone(n_phone, n_word):
635
- phones_per_word = [0] * n_word
636
- for task in range(n_phone):
637
- min_tasks = min(phones_per_word)
638
- min_index = phones_per_word.index(min_tasks)
639
- phones_per_word[min_index] += 1
640
- return phones_per_word
641
-
642
-
643
- tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
644
-
645
-
646
- def g2p(norm_text):
647
- sep_text, sep_kata = text2sep_kata(norm_text)
648
- sep_tokenized = [tokenizer.tokenize(i) for i in sep_text]
649
- sep_phonemes = [kata2phoneme(i) for i in sep_kata]
650
- # Error handling: words MeCab does not recognize propagate all the way here and crash; so far this only happens with extremely rare, obscure characters
651
- for i in sep_phonemes:
652
- for j in i:
653
- assert j in symbols, (sep_text, sep_kata, sep_phonemes)
654
-
655
- word2ph = []
656
- for token, phoneme in zip(sep_tokenized, sep_phonemes):
657
- phone_len = len(phoneme)
658
- word_len = len(token)
659
-
660
- aaa = distribute_phone(phone_len, word_len)
661
- word2ph += aaa
662
- phones = ["_"] + [j for i in sep_phonemes for j in i] + ["_"]
663
- tones = [0 for i in phones]
664
- word2ph = [1] + word2ph + [1]
665
- return phones, tones, word2ph
666
-
667
- if __name__ == "__main__":
668
- tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
669
- text = "だったら私、スズカさんと同じチームに入りたいです! スズカさんの走りを毎日近くで、なんなら真横から見ていたいので!"
670
- #print(_TAGGER.parse(text))
671
- # nodes = [{"surface": "こんにちは", "pos": "感動詞:*:*:*", "pron": "コンニチワ", "c_type": "*", "c_form": "*", "accent_type": 0, "accent_con_type": "-1", "chain_flag": -1}]
672
- nodes = [{"surface":"こんにちは","pron": "コンニチワ","pos": "感動詞:*:*:*",}]
673
- from text.japanese_bert import get_bert_feature
674
- import pyopenjtalk
675
- from marine.predict import Predictor
676
- from marine.utils.openjtalk_util import convert_njd_feature_to_marine_feature
677
- text = text_normalize(text)
678
- NJD_NODES = pyopenjtalk.run_frontend(text)
679
- predictor = Predictor()
680
- # important_info = [{"string":i["string"],"pron":i["pron"],"acc":i["acc"]}for i in pyopenjtalk.estimate_accent(NJD_NODES)]
681
- print(text)
682
-
683
- marine_feature = convert_njd_feature_to_marine_feature(NJD_NODES)
684
- results = predictor.predict([marine_feature])
685
- for mora,acc in zip(results["mora"][0],results["accent_status"][0]):
686
- print(f"{mora}:{acc}")
687
- # for i in pyopenjtalk.estimate_accent(NJD_NODES):
688
- # print(f"{i['string']}:{i['pron']}:{i['acc']}")
689
- # info = pyopenjtalk.extract_fullcontext(text,run_marine=True)
690
- # info_nomarine = pyopenjtalk.extract_fullcontext(text,run_marine=False)
691
- # # nodes = pyopenjtalk
692
- # # print(info)
693
- # for i,j in zip(info,info_nomarine):
694
- # print(i)
695
- # print(j)
696
- # print("\n")
697
- # predictor = Predictor()
698
- #print(pyopenjtalk.estimate_accent(text))
699
- # output = predictor.predict([nodes],accent_represent_mode="high_low")
700
- #print(output)
701
- # phones, tones, word2ph = g2p(text)
702
- # bert = get_bert_feature(text, word2ph)
703
-
704
- # print(phones, tones, word2ph, bert.shape)
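
One small but reusable piece of the deleted japanese.py is distribute_phone, which spreads n_phone phones as evenly as possible across n_word subword tokens to build word2ph. A standalone copy:

def distribute_phone(n_phone, n_word):
    # greedily assign each phone to the word that currently has the fewest
    phones_per_word = [0] * n_word
    for _ in range(n_phone):
        min_index = phones_per_word.index(min(phones_per_word))
        phones_per_word[min_index] += 1
    return phones_per_word

print(distribute_phone(7, 3))  # [3, 2, 2]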
 
text/japanese_bert.py DELETED
@@ -1,87 +0,0 @@
1
- import torch
2
- from transformers import AutoTokenizer, AutoModelForMaskedLM
3
- import sys
4
- import os
5
- from text.japanese import text2sep_kata
6
- tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
7
-
8
- models = dict()
9
-
10
-
11
- def get_bert_feature(text, word2ph, device=None):
12
- sep_text,_ = text2sep_kata(text)
13
- sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
14
- sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
15
- sep_ids = [2]+[item for sublist in sep_ids for item in sublist]+[3]
16
- return get_bert_feature_with_token(sep_ids, word2ph, device)
17
-
18
-
19
- # def get_bert_feature(text, word2ph, device=None):
20
- # if (
21
- # sys.platform == "darwin"
22
- # and torch.backends.mps.is_available()
23
- # and device == "cpu"
24
- # ):
25
- # device = "mps"
26
- # if not device:
27
- # device = "cuda"
28
- # if device not in models.keys():
29
- # models[device] = AutoModelForMaskedLM.from_pretrained(
30
- # "cl-tohoku/bert-base-japanese-v3"
31
- # ).to(device)
32
- # with torch.no_grad():
33
- # inputs = tokenizer(text, return_tensors="pt")
34
- # for i in inputs:
35
- # inputs[i] = inputs[i].to(device)
36
- # res = models[device](**inputs, output_hidden_states=True)
37
- # res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
38
- # assert inputs["input_ids"].shape[-1] == len(word2ph)
39
- # word2phone = word2ph
40
- # phone_level_feature = []
41
- # for i in range(len(word2phone)):
42
- # repeat_feature = res[i].repeat(word2phone[i], 1)
43
- # phone_level_feature.append(repeat_feature)
44
-
45
- # phone_level_feature = torch.cat(phone_level_feature, dim=0)
46
-
47
- # return phone_level_feature.T
48
-
49
- def get_bert_feature_with_token(tokens, word2ph, device=None):
50
- if (
51
- sys.platform == "darwin"
52
- and torch.backends.mps.is_available()
53
- and device == "cpu"
54
- ):
55
- device = "mps"
56
- if not device:
57
- device = "cuda"
58
- if device not in models.keys():
59
- models[device] = AutoModelForMaskedLM.from_pretrained(
60
- "./bert/bert-base-japanese-v3"
61
- ).to(device)
62
- with torch.no_grad():
63
- inputs = torch.tensor(tokens).to(device).unsqueeze(0)
64
- token_type_ids = torch.zeros_like(inputs).to(device)
65
- attention_mask = torch.ones_like(inputs).to(device)
66
- inputs = {"input_ids": inputs, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
67
-
68
-
69
- # for i in inputs:
70
- # inputs[i] = inputs[i].to(device)
71
- res = models[device](**inputs, output_hidden_states=True)
72
- res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
73
- assert inputs["input_ids"].shape[-1] == len(word2ph)
74
- word2phone = word2ph
75
- phone_level_feature = []
76
- for i in range(len(word2phone)):
77
- repeat_feature = res[i].repeat(word2phone[i], 1)
78
- phone_level_feature.append(repeat_feature)
79
-
80
- phone_level_feature = torch.cat(phone_level_feature, dim=0)
81
-
82
- return phone_level_feature.T
83
-
84
-
85
- if __name__ == "__main__":
86
- print(get_bert_feature("観覧車",[4,2]))
87
- pass
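
In the deleted get_bert_feature, the per-word subword IDs are flattened and wrapped with the CLS/SEP IDs (hard-coded as 2 and 3 for ./bert/bert-base-japanese-v3) before the masked-LM forward pass. A sketch of that flattening step with made-up subword IDs:

# Made-up subword IDs for illustration; 2 and 3 are the CLS/SEP IDs
# hard-coded in the deleted japanese_bert.py.
sep_ids = [[120, 121], [305], [77, 78, 79]]
token_ids = [2] + [tok for word in sep_ids for tok in word] + [3]
print(token_ids)  # [2, 120, 121, 305, 77, 78, 79, 3]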
 
text/opencpop-strict.txt DELETED
@@ -1,429 +0,0 @@
1
- a AA a
2
- ai AA ai
3
- an AA an
4
- ang AA ang
5
- ao AA ao
6
- ba b a
7
- bai b ai
8
- ban b an
9
- bang b ang
10
- bao b ao
11
- bei b ei
12
- ben b en
13
- beng b eng
14
- bi b i
15
- bian b ian
16
- biao b iao
17
- bie b ie
18
- bin b in
19
- bing b ing
20
- bo b o
21
- bu b u
22
- ca c a
23
- cai c ai
24
- can c an
25
- cang c ang
26
- cao c ao
27
- ce c e
28
- cei c ei
29
- cen c en
30
- ceng c eng
31
- cha ch a
32
- chai ch ai
33
- chan ch an
34
- chang ch ang
35
- chao ch ao
36
- che ch e
37
- chen ch en
38
- cheng ch eng
39
- chi ch ir
40
- chong ch ong
41
- chou ch ou
42
- chu ch u
43
- chua ch ua
44
- chuai ch uai
45
- chuan ch uan
46
- chuang ch uang
47
- chui ch ui
48
- chun ch un
49
- chuo ch uo
50
- ci c i0
51
- cong c ong
52
- cou c ou
53
- cu c u
54
- cuan c uan
55
- cui c ui
56
- cun c un
57
- cuo c uo
58
- da d a
59
- dai d ai
60
- dan d an
61
- dang d ang
62
- dao d ao
63
- de d e
64
- dei d ei
65
- den d en
66
- deng d eng
67
- di d i
68
- dia d ia
69
- dian d ian
70
- diao d iao
71
- die d ie
72
- ding d ing
73
- diu d iu
74
- dong d ong
75
- dou d ou
76
- du d u
77
- duan d uan
78
- dui d ui
79
- dun d un
80
- duo d uo
81
- e EE e
82
- ei EE ei
83
- en EE en
84
- eng EE eng
85
- er EE er
86
- fa f a
87
- fan f an
88
- fang f ang
89
- fei f ei
90
- fen f en
91
- feng f eng
92
- fo f o
93
- fou f ou
94
- fu f u
95
- ga g a
96
- gai g ai
97
- gan g an
98
- gang g ang
99
- gao g ao
100
- ge g e
101
- gei g ei
102
- gen g en
103
- geng g eng
104
- gong g ong
105
- gou g ou
106
- gu g u
107
- gua g ua
108
- guai g uai
109
- guan g uan
110
- guang g uang
111
- gui g ui
112
- gun g un
113
- guo g uo
114
- ha h a
115
- hai h ai
116
- han h an
117
- hang h ang
118
- hao h ao
119
- he h e
120
- hei h ei
121
- hen h en
122
- heng h eng
123
- hong h ong
124
- hou h ou
125
- hu h u
126
- hua h ua
127
- huai h uai
128
- huan h uan
129
- huang h uang
130
- hui h ui
131
- hun h un
132
- huo h uo
133
- ji j i
134
- jia j ia
135
- jian j ian
136
- jiang j iang
137
- jiao j iao
138
- jie j ie
139
- jin j in
140
- jing j ing
141
- jiong j iong
142
- jiu j iu
143
- ju j v
144
- jv j v
145
- juan j van
146
- jvan j van
147
- jue j ve
148
- jve j ve
149
- jun j vn
150
- jvn j vn
151
- ka k a
152
- kai k ai
153
- kan k an
154
- kang k ang
155
- kao k ao
156
- ke k e
157
- kei k ei
158
- ken k en
159
- keng k eng
160
- kong k ong
161
- kou k ou
162
- ku k u
163
- kua k ua
164
- kuai k uai
165
- kuan k uan
166
- kuang k uang
167
- kui k ui
168
- kun k un
169
- kuo k uo
170
- la l a
171
- lai l ai
172
- lan l an
173
- lang l ang
174
- lao l ao
175
- le l e
176
- lei l ei
177
- leng l eng
178
- li l i
179
- lia l ia
180
- lian l ian
181
- liang l iang
182
- liao l iao
183
- lie l ie
184
- lin l in
185
- ling l ing
186
- liu l iu
187
- lo l o
188
- long l ong
189
- lou l ou
190
- lu l u
191
- luan l uan
192
- lun l un
193
- luo l uo
194
- lv l v
195
- lve l ve
196
- ma m a
197
- mai m ai
198
- man m an
199
- mang m ang
200
- mao m ao
201
- me m e
202
- mei m ei
203
- men m en
204
- meng m eng
205
- mi m i
206
- mian m ian
207
- miao m iao
208
- mie m ie
209
- min m in
210
- ming m ing
211
- miu m iu
212
- mo m o
213
- mou m ou
214
- mu m u
215
- na n a
216
- nai n ai
217
- nan n an
218
- nang n ang
219
- nao n ao
220
- ne n e
221
- nei n ei
222
- nen n en
223
- neng n eng
224
- ni n i
225
- nian n ian
226
- niang n iang
227
- niao n iao
228
- nie n ie
229
- nin n in
230
- ning n ing
231
- niu n iu
232
- nong n ong
233
- nou n ou
234
- nu n u
235
- nuan n uan
236
- nun n un
237
- nuo n uo
238
- nv n v
239
- nve n ve
240
- o OO o
241
- ou OO ou
242
- pa p a
243
- pai p ai
244
- pan p an
245
- pang p ang
246
- pao p ao
247
- pei p ei
248
- pen p en
249
- peng p eng
250
- pi p i
251
- pian p ian
252
- piao p iao
253
- pie p ie
254
- pin p in
255
- ping p ing
256
- po p o
257
- pou p ou
258
- pu p u
259
- qi q i
260
- qia q ia
261
- qian q ian
262
- qiang q iang
263
- qiao q iao
264
- qie q ie
265
- qin q in
266
- qing q ing
267
- qiong q iong
268
- qiu q iu
269
- qu q v
270
- qv q v
271
- quan q van
272
- qvan q van
273
- que q ve
274
- qve q ve
275
- qun q vn
276
- qvn q vn
277
- ran r an
278
- rang r ang
279
- rao r ao
280
- re r e
281
- ren r en
282
- reng r eng
283
- ri r ir
284
- rong r ong
285
- rou r ou
286
- ru r u
287
- rua r ua
288
- ruan r uan
289
- rui r ui
290
- run r un
291
- ruo r uo
292
- sa s a
293
- sai s ai
294
- san s an
295
- sang s ang
296
- sao s ao
297
- se s e
298
- sen s en
299
- seng s eng
300
- sha sh a
301
- shai sh ai
302
- shan sh an
303
- shang sh ang
304
- shao sh ao
305
- she sh e
306
- shei sh ei
307
- shen sh en
308
- sheng sh eng
309
- shi sh ir
310
- shou sh ou
311
- shu sh u
312
- shua sh ua
313
- shuai sh uai
314
- shuan sh uan
315
- shuang sh uang
316
- shui sh ui
317
- shun sh un
318
- shuo sh uo
319
- si s i0
320
- song s ong
321
- sou s ou
322
- su s u
323
- suan s uan
324
- sui s ui
325
- sun s un
326
- suo s uo
327
- ta t a
328
- tai t ai
329
- tan t an
330
- tang t ang
331
- tao t ao
332
- te t e
333
- tei t ei
334
- teng t eng
335
- ti t i
336
- tian t ian
337
- tiao t iao
338
- tie t ie
339
- ting t ing
340
- tong t ong
341
- tou t ou
342
- tu t u
343
- tuan t uan
344
- tui t ui
345
- tun t un
346
- tuo t uo
347
- wa w a
348
- wai w ai
349
- wan w an
350
- wang w ang
351
- wei w ei
352
- wen w en
353
- weng w eng
354
- wo w o
355
- wu w u
356
- xi x i
357
- xia x ia
358
- xian x ian
359
- xiang x iang
360
- xiao x iao
361
- xie x ie
362
- xin x in
363
- xing x ing
364
- xiong x iong
365
- xiu x iu
366
- xu x v
367
- xv x v
368
- xuan x van
369
- xvan x van
370
- xue x ve
371
- xve x ve
372
- xun x vn
373
- xvn x vn
374
- ya y a
375
- yan y En
376
- yang y ang
377
- yao y ao
378
- ye y E
379
- yi y i
380
- yin y in
381
- ying y ing
382
- yo y o
383
- yong y ong
384
- you y ou
385
- yu y v
386
- yv y v
387
- yuan y van
388
- yvan y van
389
- yue y ve
390
- yve y ve
391
- yun y vn
392
- yvn y vn
393
- za z a
394
- zai z ai
395
- zan z an
396
- zang z ang
397
- zao z ao
398
- ze z e
399
- zei z ei
400
- zen z en
401
- zeng z eng
402
- zha zh a
403
- zhai zh ai
404
- zhan zh an
405
- zhang zh ang
406
- zhao zh ao
407
- zhe zh e
408
- zhei zh ei
409
- zhen zh en
410
- zheng zh eng
411
- zhi zh ir
412
- zhong zh ong
413
- zhou zh ou
414
- zhu zh u
415
- zhua zh ua
416
- zhuai zh uai
417
- zhuan zh uan
418
- zhuang zh uang
419
- zhui zh ui
420
- zhun zh un
421
- zhuo zh uo
422
- zi z i0
423
- zong z ong
424
- zou z ou
425
- zu z u
426
- zuan z uan
427
- zui z ui
428
- zun z un
429
- zuo z uo
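
The deleted chinese.py builds its pinyin_to_symbol_map from this file by splitting each line into the pinyin key and the phone string; the columns render as spaces here but are presumably tab-separated in the raw file, since chinese.py splits on "\t". A sketch of that parsing using an inline sample instead of the file:

# Inline sample standing in for text/opencpop-strict.txt.
sample = "zhong\tzh ong\nguo\tg uo\nai\tAA ai\n"
pinyin_to_symbol_map = {
    line.split("\t")[0]: line.strip().split("\t")[1]
    for line in sample.splitlines()
}
print(pinyin_to_symbol_map["zhong"].split(" "))  # ['zh', 'ong']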
 
text/symbols.py DELETED
@@ -1,188 +0,0 @@
1
- punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2
- pu_symbols = punctuation + ["SP", "UNK"]
3
- pad = "_"
4
-
5
- # chinese
6
- zh_symbols = [
7
- "E",
8
- "En",
9
- "a",
10
- "ai",
11
- "an",
12
- "ang",
13
- "ao",
14
- "b",
15
- "c",
16
- "ch",
17
- "d",
18
- "e",
19
- "ei",
20
- "en",
21
- "eng",
22
- "er",
23
- "f",
24
- "g",
25
- "h",
26
- "i",
27
- "i0",
28
- "ia",
29
- "ian",
30
- "iang",
31
- "iao",
32
- "ie",
33
- "in",
34
- "ing",
35
- "iong",
36
- "ir",
37
- "iu",
38
- "j",
39
- "k",
40
- "l",
41
- "m",
42
- "n",
43
- "o",
44
- "ong",
45
- "ou",
46
- "p",
47
- "q",
48
- "r",
49
- "s",
50
- "sh",
51
- "t",
52
- "u",
53
- "ua",
54
- "uai",
55
- "uan",
56
- "uang",
57
- "ui",
58
- "un",
59
- "uo",
60
- "v",
61
- "van",
62
- "ve",
63
- "vn",
64
- "w",
65
- "x",
66
- "y",
67
- "z",
68
- "zh",
69
- "AA",
70
- "EE",
71
- "OO",
72
- ]
73
- num_zh_tones = 6
74
-
75
- # japanese
76
- ja_symbols = [
77
- "N",
78
- "a",
79
- "a:",
80
- "b",
81
- "by",
82
- "ch",
83
- "d",
84
- "dy",
85
- "e",
86
- "e:",
87
- "f",
88
- "g",
89
- "gy",
90
- "h",
91
- "hy",
92
- "i",
93
- "i:",
94
- "j",
95
- "k",
96
- "ky",
97
- "m",
98
- "my",
99
- "n",
100
- "ny",
101
- "o",
102
- "o:",
103
- "p",
104
- "py",
105
- "q",
106
- "r",
107
- "ry",
108
- "s",
109
- "sh",
110
- "t",
111
- "ts",
112
- "ty",
113
- "u",
114
- "u:",
115
- "w",
116
- "y",
117
- "z",
118
- "zy",
119
- # ":"
120
- ]
121
- num_ja_tones = 1
122
-
123
- # English
124
- en_symbols = [
125
- "aa",
126
- "ae",
127
- "ah",
128
- "ao",
129
- "aw",
130
- "ay",
131
- "b",
132
- "ch",
133
- "d",
134
- "dh",
135
- "eh",
136
- "er",
137
- "ey",
138
- "f",
139
- "g",
140
- "hh",
141
- "ih",
142
- "iy",
143
- "jh",
144
- "k",
145
- "l",
146
- "m",
147
- "n",
148
- "ng",
149
- "ow",
150
- "oy",
151
- "p",
152
- "r",
153
- "s",
154
- "sh",
155
- "t",
156
- "th",
157
- "uh",
158
- "uw",
159
- "V",
160
- "w",
161
- "y",
162
- "z",
163
- "zh",
164
- ]
165
- num_en_tones = 4
166
-
167
- # combine all symbols
168
- normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
169
- symbols = [pad] + normal_symbols + pu_symbols
170
- sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
171
-
172
- # combine all tones
173
- num_tones = num_zh_tones + num_ja_tones + num_en_tones
174
-
175
- # language maps
176
- language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
177
- num_languages = len(language_id_map.keys())
178
-
179
- language_tone_start_map = {
180
- "ZH": 0,
181
- "JP": num_zh_tones,
182
- "EN": num_zh_tones + num_ja_tones,
183
- }
184
-
185
- if __name__ == "__main__":
186
- a = set(zh_symbols)
187
- b = set(en_symbols)
188
- print(sorted(a & b))
 
text/tone_sandhi.py DELETED
@@ -1,769 +0,0 @@
1
- # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from typing import List
15
- from typing import Tuple
16
-
17
- import jieba
18
- from pypinyin import lazy_pinyin
19
- from pypinyin import Style
20
-
21
-
22
- class ToneSandhi:
23
- def __init__(self):
24
- self.must_neural_tone_words = {
25
- "麻烦",
26
- "麻利",
27
- "鸳鸯",
28
- "高粱",
29
- "骨头",
30
- "骆驼",
31
- "马虎",
32
- "首饰",
33
- "馒头",
34
- "馄饨",
35
- "风筝",
36
- "难为",
37
- "队伍",
38
- "阔气",
39
- "闺女",
40
- "门道",
41
- "锄头",
42
- "铺盖",
43
- "铃铛",
44
- "铁匠",
45
- "钥匙",
46
- "里脊",
47
- "里头",
48
- "部分",
49
- "那么",
50
- "道士",
51
- "造化",
52
- "迷糊",
53
- "连累",
54
- "这么",
55
- "这个",
56
- "运气",
57
- "过去",
58
- "软和",
59
- "转悠",
60
- "踏实",
61
- "跳蚤",
62
- "跟头",
63
- "趔趄",
64
- "财主",
65
- "豆腐",
66
- "讲究",
67
- "记性",
68
- "记号",
69
- "认识",
70
- "规矩",
71
- "见识",
72
- "裁缝",
73
- "补丁",
74
- "衣裳",
75
- "衣服",
76
- "衙门",
77
- "街坊",
78
- "行李",
79
- "行当",
80
- "蛤蟆",
81
- "蘑菇",
82
- "薄荷",
83
- "葫芦",
84
- "葡萄",
85
- "萝卜",
86
- "荸荠",
87
- "苗条",
88
- "苗头",
89
- "苍蝇",
90
- "芝麻",
91
- "舒服",
92
- "舒坦",
93
- "舌头",
94
- "自在",
95
- "膏药",
96
- "脾气",
97
- "脑袋",
98
- "脊梁",
99
- "能耐",
100
- "胳膊",
101
- "胭脂",
102
- "胡萝",
103
- "胡琴",
104
- "胡同",
105
- "聪明",
106
- "耽误",
107
- "耽搁",
108
- "耷拉",
109
- "耳朵",
110
- "老爷",
111
- "老实",
112
- "老婆",
113
- "老头",
114
- "老太",
115
- "翻腾",
116
- "罗嗦",
117
- "罐头",
118
- "编辑",
119
- "结实",
120
- "红火",
121
- "累赘",
122
- "糨糊",
123
- "糊涂",
124
- "精神",
125
- "粮食",
126
- "簸箕",
127
- "篱笆",
128
- "算计",
129
- "算盘",
130
- "答应",
131
- "笤帚",
132
- "笑语",
133
- "笑话",
134
- "窟窿",
135
- "窝囊",
136
- "窗户",
137
- "稳当",
138
- "稀罕",
139
- "称呼",
140
- "秧歌",
141
- "秀气",
142
- "秀才",
143
- "福气",
144
- "祖宗",
145
- "砚台",
146
- "码头",
147
- "石榴",
148
- "石头",
149
- "石匠",
150
- "知识",
151
- "眼睛",
152
- "眯缝",
153
- "眨巴",
154
- "眉毛",
155
- "相声",
156
- "盘算",
157
- "白净",
158
- "痢疾",
159
- "痛快",
160
- "疟疾",
161
- "疙瘩",
162
- "疏忽",
163
- "畜生",
164
- "生意",
165
- "甘蔗",
166
- "琵琶",
167
- "琢磨",
168
- "琉璃",
169
- "玻璃",
170
- "玫瑰",
171
- "玄乎",
172
- "狐狸",
173
- "状元",
174
- "特务",
175
- "牲口",
176
- "牙碜",
177
- "牌楼",
178
- "爽快",
179
- "爱人",
180
- "热闹",
181
- "烧饼",
182
- "烟筒",
183
- "烂糊",
184
- "点心",
185
- "炊帚",
186
- "灯笼",
187
- "火候",
188
- "漂亮",
189
- "滑溜",
190
- "溜达",
191
- "温和",
192
- "清楚",
193
- "消息",
194
- "浪头",
195
- "活泼",
196
- "比方",
197
- "正经",
198
- "欺负",
199
- "模糊",
200
- "槟榔",
201
- "棺材",
202
- "棒槌",
203
- "棉花",
204
- "核桃",
205
- "栅栏",
206
- "柴火",
207
- "架势",
208
- "枕头",
209
- "枇杷",
210
- "机灵",
211
- "本事",
212
- "木头",
213
- "木匠",
214
- "朋友",
215
- "月饼",
216
- "月亮",
217
- "暖和",
218
- "明白",
219
- "时候",
220
- "新鲜",
221
- "故事",
222
- "收拾",
223
- "收成",
224
- "提防",
225
- "挖苦",
226
- "挑剔",
227
- "指甲",
228
- "指头",
229
- "拾掇",
230
- "拳头",
231
- "拨弄",
232
- "招牌",
233
- "招呼",
234
- "抬举",
235
- "护士",
236
- "折腾",
237
- "扫帚",
238
- "打量",
239
- "打算",
240
- "打点",
241
- "打扮",
242
- "打听",
243
- "打发",
244
- "扎实",
245
- "扁担",
246
- "戒指",
247
- "懒得",
248
- "意识",
249
- "意思",
250
- "情形",
251
- "悟性",
252
- "怪物",
253
- "思量",
254
- "怎么",
255
- "念头",
256
- "念叨",
257
- "快活",
258
- "忙活",
259
- "志气",
260
- "心思",
261
- "得罪",
262
- "张罗",
263
- "弟兄",
264
- "开通",
265
- "应酬",
266
- "庄稼",
267
- "干事",
268
- "帮手",
269
- "帐篷",
270
- "希罕",
271
- "师父",
272
- "师傅",
273
- "巴结",
274
- "巴掌",
275
- "差事",
276
- "工夫",
277
- "岁数",
278
- "屁股",
279
- "尾巴",
280
- "少爷",
281
- "小气",
282
- "小伙",
283
- "将就",
284
- "对头",
285
- "对付",
286
- "寡妇",
287
- "家伙",
288
- "客气",
289
- "实在",
290
- "官司",
291
- "学问",
292
- "学生",
293
- "字号",
294
- "嫁妆",
295
- "媳妇",
296
- "媒人",
297
- "婆家",
298
- "娘家",
299
- "委屈",
300
- "姑娘",
301
- "姐夫",
302
- "妯娌",
303
- "妥当",
304
- "妖精",
305
- "奴才",
306
- "女婿",
307
- "头发",
308
- "太阳",
309
- "大爷",
310
- "大方",
311
- "大意",
312
- "大夫",
313
- "多少",
314
- "多么",
315
- "外甥",
316
- "壮实",
317
- "地道",
318
- "地方",
319
- "在乎",
320
- "困难",
321
- "嘴巴",
322
- "嘱咐",
323
- "嘟囔",
324
- "嘀咕",
325
- "喜欢",
326
- "喇嘛",
327
- "喇叭",
328
- "商量",
329
- "唾沫",
330
- "哑巴",
331
- "哈欠",
332
- "哆嗦",
333
- "咳嗽",
334
- "和尚",
335
- "告诉",
336
- "告示",
337
- "含糊",
338
- "吓唬",
339
- "后头",
340
- "名字",
341
- "名堂",
342
- "合同",
343
- "吆喝",
344
- "叫唤",
345
- "口袋",
346
- "厚道",
347
- "厉害",
348
- "千斤",
349
- "包袱",
350
- "包涵",
351
- "匀称",
352
- "勤快",
353
- "动静",
354
- "动弹",
355
- "功夫",
356
- "力气",
357
- "前头",
358
- "刺猬",
359
- "刺激",
360
- "别扭",
361
- "利落",
362
- "利索",
363
- "利害",
364
- "分析",
365
- "出息",
366
- "凑合",
367
- "凉快",
368
- "冷战",
369
- "冤枉",
370
- "冒失",
371
- "养活",
372
- "关系",
373
- "先生",
374
- "兄弟",
375
- "便宜",
376
- "使唤",
377
- "佩服",
378
- "作坊",
379
- "体面",
380
- "位置",
381
- "似的",
382
- "伙计",
383
- "休息",
384
- "什么",
385
- "人家",
386
- "亲戚",
387
- "亲家",
388
- "交情",
389
- "云彩",
390
- "事情",
391
- "买卖",
392
- "主意",
393
- "丫头",
394
- "丧气",
395
- "两口",
396
- "东西",
397
- "东家",
398
- "世故",
399
- "不由",
400
- "不在",
401
- "下水",
402
- "下巴",
403
- "上头",
404
- "上司",
405
- "丈夫",
406
- "丈人",
407
- "一辈",
408
- "那个",
409
- "菩萨",
410
- "父亲",
411
- "母亲",
412
- "咕噜",
413
- "邋遢",
414
- "费用",
415
- "冤家",
416
- "甜头",
417
- "介绍",
418
- "荒唐",
419
- "大人",
420
- "泥鳅",
421
- "幸福",
422
- "熟悉",
423
- "计划",
424
- "扑腾",
425
- "蜡烛",
426
- "姥爷",
427
- "照顾",
428
- "喉咙",
429
- "吉他",
430
- "弄堂",
431
- "蚂蚱",
432
- "凤凰",
433
- "拖沓",
434
- "寒碜",
435
- "糟蹋",
436
- "倒腾",
437
- "报复",
438
- "逻辑",
439
- "盘缠",
440
- "喽啰",
441
- "牢骚",
442
- "咖喱",
443
- "扫把",
444
- "惦记",
445
- }
446
- self.must_not_neural_tone_words = {
447
- "男子",
448
- "女子",
449
- "分子",
450
- "原子",
451
- "量子",
452
- "莲子",
453
- "石子",
454
- "瓜子",
455
- "电子",
456
- "人人",
457
- "虎虎",
458
- }
459
- self.punc = ":,;。?!“”‘’':,;.?!"
460
-
461
- # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
462
- # e.g.
463
- # word: "家里"
464
- # pos: "s"
465
- # finals: ['ia1', 'i3']
466
- def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
467
- # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
468
- for j, item in enumerate(word):
469
- if (
470
- j - 1 >= 0
471
- and item == word[j - 1]
472
- and pos[0] in {"n", "v", "a"}
473
- and word not in self.must_not_neural_tone_words
474
- ):
475
- finals[j] = finals[j][:-1] + "5"
476
- ge_idx = word.find("个")
477
- if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
478
- finals[-1] = finals[-1][:-1] + "5"
479
- elif len(word) >= 1 and word[-1] in "的地得":
480
- finals[-1] = finals[-1][:-1] + "5"
481
- # e.g. 走了, 看着, 去过
482
- # elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
483
- # finals[-1] = finals[-1][:-1] + "5"
484
- elif (
485
- len(word) > 1
486
- and word[-1] in "们子"
487
- and pos in {"r", "n"}
488
- and word not in self.must_not_neural_tone_words
489
- ):
490
- finals[-1] = finals[-1][:-1] + "5"
491
- # e.g. 桌上, 地下, 家里
492
- elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
493
- finals[-1] = finals[-1][:-1] + "5"
494
- # e.g. 上来, 下去
495
- elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
496
- finals[-1] = finals[-1][:-1] + "5"
497
- # "个" used as a measure word
498
- elif (
499
- ge_idx >= 1
500
- and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
501
- ) or word == "个":
502
- finals[ge_idx] = finals[ge_idx][:-1] + "5"
503
- else:
504
- if (
505
- word in self.must_neural_tone_words
506
- or word[-2:] in self.must_neural_tone_words
507
- ):
508
- finals[-1] = finals[-1][:-1] + "5"
509
-
510
- word_list = self._split_word(word)
511
- finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
512
- for i, word in enumerate(word_list):
513
- # conventional neural in Chinese
514
- if (
515
- word in self.must_neural_tone_words
516
- or word[-2:] in self.must_neural_tone_words
517
- ):
518
- finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
519
- finals = sum(finals_list, [])
520
- return finals
521
-
522
- def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
523
- # e.g. 看不懂
524
- if len(word) == 3 and word[1] == "不":
525
- finals[1] = finals[1][:-1] + "5"
526
- else:
527
- for i, char in enumerate(word):
528
- # "不" before tone4 should be bu2, e.g. 不怕
529
- if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
530
- finals[i] = finals[i][:-1] + "2"
531
- return finals
532
-
533
- def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
534
- # "一" in number sequences, e.g. 一零零, 二一零
535
- if word.find("一") != -1 and all(
536
- [item.isnumeric() for item in word if item != "一"]
537
- ):
538
- return finals
539
- # "一" between reduplication words should be yi5, e.g. 看一看
540
- elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
541
- finals[1] = finals[1][:-1] + "5"
542
- # when "一" is ordinal word, it should be yi1
543
- elif word.startswith("第一"):
544
- finals[1] = finals[1][:-1] + "1"
545
- else:
546
- for i, char in enumerate(word):
547
- if char == "一" and i + 1 < len(word):
548
- # "一" before tone4 should be yi2, e.g. 一段
549
- if finals[i + 1][-1] == "4":
550
- finals[i] = finals[i][:-1] + "2"
551
- # "一" before non-tone4 should be yi4, e.g. 一天
552
- else:
553
- # "一" 后面如果是标点,还读一声
554
- if word[i + 1] not in self.punc:
555
- finals[i] = finals[i][:-1] + "4"
556
- return finals
557
-
558
- def _split_word(self, word: str) -> List[str]:
559
- word_list = jieba.cut_for_search(word)
560
- word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
561
- first_subword = word_list[0]
562
- first_begin_idx = word.find(first_subword)
563
- if first_begin_idx == 0:
564
- second_subword = word[len(first_subword) :]
565
- new_word_list = [first_subword, second_subword]
566
- else:
567
- second_subword = word[: -len(first_subword)]
568
- new_word_list = [second_subword, first_subword]
569
- return new_word_list
570
-
571
- def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
572
- if len(word) == 2 and self._all_tone_three(finals):
573
- finals[0] = finals[0][:-1] + "2"
574
- elif len(word) == 3:
575
- word_list = self._split_word(word)
576
- if self._all_tone_three(finals):
577
- # disyllabic + monosyllabic, e.g. 蒙古/包
578
- if len(word_list[0]) == 2:
579
- finals[0] = finals[0][:-1] + "2"
580
- finals[1] = finals[1][:-1] + "2"
581
- # monosyllabic + disyllabic, e.g. 纸/老虎
582
- elif len(word_list[0]) == 1:
583
- finals[1] = finals[1][:-1] + "2"
584
- else:
585
- finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
586
- if len(finals_list) == 2:
587
- for i, sub in enumerate(finals_list):
588
- # e.g. 所有/人
589
- if self._all_tone_three(sub) and len(sub) == 2:
590
- finals_list[i][0] = finals_list[i][0][:-1] + "2"
591
- # e.g. 好/喜欢
592
- elif (
593
- i == 1
594
- and not self._all_tone_three(sub)
595
- and finals_list[i][0][-1] == "3"
596
- and finals_list[0][-1][-1] == "3"
597
- ):
598
- finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
599
- finals = sum(finals_list, [])
600
- # split a four-character idiom into two two-character words
601
- elif len(word) == 4:
602
- finals_list = [finals[:2], finals[2:]]
603
- finals = []
604
- for sub in finals_list:
605
- if self._all_tone_three(sub):
606
- sub[0] = sub[0][:-1] + "2"
607
- finals += sub
608
-
609
- return finals
610
-
611
- def _all_tone_three(self, finals: List[str]) -> bool:
612
- return all(x[-1] == "3" for x in finals)
613
-
614
- # merge "不" and the word behind it
615
- # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
616
- def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
617
- new_seg = []
618
- last_word = ""
619
- for word, pos in seg:
620
- if last_word == "不":
621
- word = last_word + word
622
- if word != "不":
623
- new_seg.append((word, pos))
624
- last_word = word[:]
625
- if last_word == "不":
626
- new_seg.append((last_word, "d"))
627
- last_word = ""
628
- return new_seg
629
-
630
- # function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听"
631
- # function 2: merge single "一" and the word behind it
632
- # if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error
633
- # e.g.
634
- # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
635
- # output seg: [['听一听', 'v']]
636
- def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
637
- new_seg = []
638
- # function 1
639
- for i, (word, pos) in enumerate(seg):
640
- if (
641
- i - 1 >= 0
642
- and word == "一"
643
- and i + 1 < len(seg)
644
- and seg[i - 1][0] == seg[i + 1][0]
645
- and seg[i - 1][1] == "v"
646
- ):
647
- new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
648
- else:
649
- if (
650
- i - 2 >= 0
651
- and seg[i - 1][0] == "一"
652
- and seg[i - 2][0] == word
653
- and pos == "v"
654
- ):
655
- continue
656
- else:
657
- new_seg.append([word, pos])
658
- seg = new_seg
659
- new_seg = []
660
- # function 2
661
- for i, (word, pos) in enumerate(seg):
662
- if new_seg and new_seg[-1][0] == "一":
663
- new_seg[-1][0] = new_seg[-1][0] + word
664
- else:
665
- new_seg.append([word, pos])
666
- return new_seg
667
-
668
- # the first and the second words are all_tone_three
669
- def _merge_continuous_three_tones(
670
- self, seg: List[Tuple[str, str]]
671
- ) -> List[Tuple[str, str]]:
672
- new_seg = []
673
- sub_finals_list = [
674
- lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
675
- for (word, pos) in seg
676
- ]
677
- assert len(sub_finals_list) == len(seg)
678
- merge_last = [False] * len(seg)
679
- for i, (word, pos) in enumerate(seg):
680
- if (
681
- i - 1 >= 0
682
- and self._all_tone_three(sub_finals_list[i - 1])
683
- and self._all_tone_three(sub_finals_list[i])
684
- and not merge_last[i - 1]
685
- ):
686
- # if the last word is a reduplication, do not merge, because reduplications need _neural_sandhi
687
- if (
688
- not self._is_reduplication(seg[i - 1][0])
689
- and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
690
- ):
691
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
692
- merge_last[i] = True
693
- else:
694
- new_seg.append([word, pos])
695
- else:
696
- new_seg.append([word, pos])
697
-
698
- return new_seg
699
-
700
- def _is_reduplication(self, word: str) -> bool:
701
- return len(word) == 2 and word[0] == word[1]
702
-
703
- # the last char of first word and the first char of second word is tone_three
704
- def _merge_continuous_three_tones_2(
705
- self, seg: List[Tuple[str, str]]
706
- ) -> List[Tuple[str, str]]:
707
- new_seg = []
708
- sub_finals_list = [
709
- lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
710
- for (word, pos) in seg
711
- ]
712
- assert len(sub_finals_list) == len(seg)
713
- merge_last = [False] * len(seg)
714
- for i, (word, pos) in enumerate(seg):
715
- if (
716
- i - 1 >= 0
717
- and sub_finals_list[i - 1][-1][-1] == "3"
718
- and sub_finals_list[i][0][-1] == "3"
719
- and not merge_last[i - 1]
720
- ):
721
- # if the last word is a reduplication, do not merge, because reduplications need _neural_sandhi
722
- if (
723
- not self._is_reduplication(seg[i - 1][0])
724
- and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
725
- ):
726
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
727
- merge_last[i] = True
728
- else:
729
- new_seg.append([word, pos])
730
- else:
731
- new_seg.append([word, pos])
732
- return new_seg
733
-
734
- def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
735
- new_seg = []
736
- for i, (word, pos) in enumerate(seg):
737
- if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
738
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
739
- else:
740
- new_seg.append([word, pos])
741
- return new_seg
742
-
743
- def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
744
- new_seg = []
745
- for i, (word, pos) in enumerate(seg):
746
- if new_seg and word == new_seg[-1][0]:
747
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
748
- else:
749
- new_seg.append([word, pos])
750
- return new_seg
751
-
752
- def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
753
- seg = self._merge_bu(seg)
754
- try:
755
- seg = self._merge_yi(seg)
756
- except:
757
- print("_merge_yi failed")
758
- seg = self._merge_reduplication(seg)
759
- seg = self._merge_continuous_three_tones(seg)
760
- seg = self._merge_continuous_three_tones_2(seg)
761
- seg = self._merge_er(seg)
762
- return seg
763
-
764
- def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
765
- finals = self._bu_sandhi(word, finals)
766
- finals = self._yi_sandhi(word, finals)
767
- finals = self._neural_sandhi(word, pos, finals)
768
- finals = self._three_sandhi(word, finals)
769
- return finals
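
Most of the deleted ToneSandhi class is rule-based Mandarin tone sandhi. The simplest rule, from _three_sandhi, is that when both syllables of a two-syllable word carry tone 3, the first changes to tone 2. A toy version of just that rule (the real method also handles three- and four-character words):

def two_syllable_third_tone_sandhi(finals):
    # finals like ["i3", "ao3"] (pypinyin FINALS_TONE3 style); the tone is the last char
    if len(finals) == 2 and all(f[-1] == "3" for f in finals):
        finals[0] = finals[0][:-1] + "2"
    return finals

print(two_syllable_third_tone_sandhi(["i3", "ao3"]))  # ['i2', 'ao3'], e.g. 你好
print(two_syllable_third_tone_sandhi(["i3", "an1"]))  # unchanged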