JotunnBurton committed
Commit 56cda8a · verified · 1 Parent(s): 8a53174

Upload 2 files

Files changed (2)
  1. text/japanese.py +704 -704
  2. text/japanese_bert.py +87 -68
text/japanese.py CHANGED
@@ -1,704 +1,704 @@
1
- # Convert Japanese text to phonemes which is
2
- # compatible with Julius https://github.com/julius-speech/segmentation-kit
3
- import re
4
- import unicodedata
5
-
6
- from transformers import AutoTokenizer
7
-
8
- from text import punctuation, symbols
9
-
10
- try:
11
- import MeCab
12
- except ImportError as e:
13
- raise ImportError("Japanese requires mecab-python3 and unidic-lite.") from e
14
- from num2words import num2words
15
-
16
- _CONVRULES = [
17
- # Conversion of 2 letters
18
- "アァ/ a a",
19
- "イィ/ i i",
20
- "イェ/ i e",
21
- "イャ/ y a",
22
- "ウゥ/ u:",
23
- "エェ/ e e",
24
- "オォ/ o:",
25
- "カァ/ k a:",
26
- "キィ/ k i:",
27
- "クゥ/ k u:",
28
- "クャ/ ky a",
29
- "クュ/ ky u",
30
- "クョ/ ky o",
31
- "ケェ/ k e:",
32
- "コォ/ k o:",
33
- "ガァ/ g a:",
34
- "ギィ/ g i:",
35
- "グゥ/ g u:",
36
- "グャ/ gy a",
37
- "グュ/ gy u",
38
- "グョ/ gy o",
39
- "ゲェ/ g e:",
40
- "ゴォ/ g o:",
41
- "サァ/ s a:",
42
- "シィ/ sh i:",
43
- "スゥ/ s u:",
44
- "スャ/ sh a",
45
- "スュ/ sh u",
46
- "スョ/ sh o",
47
- "セェ/ s e:",
48
- "ソォ/ s o:",
49
- "ザァ/ z a:",
50
- "ジィ/ j i:",
51
- "ズゥ/ z u:",
52
- "ズャ/ zy a",
53
- "ズュ/ zy u",
54
- "ズョ/ zy o",
55
- "ゼェ/ z e:",
56
- "ゾォ/ z o:",
57
- "タァ/ t a:",
58
- "チィ/ ch i:",
59
- "ツァ/ ts a",
60
- "ツィ/ ts i",
61
- "ツゥ/ ts u:",
62
- "ツャ/ ch a",
63
- "ツュ/ ch u",
64
- "ツョ/ ch o",
65
- "ツェ/ ts e",
66
- "ツォ/ ts o",
67
- "テェ/ t e:",
68
- "トォ/ t o:",
69
- "ダァ/ d a:",
70
- "ヂィ/ j i:",
71
- "ヅゥ/ d u:",
72
- "ヅャ/ zy a",
73
- "ヅュ/ zy u",
74
- "ヅョ/ zy o",
75
- "デェ/ d e:",
76
- "ドォ/ d o:",
77
- "ナァ/ n a:",
78
- "ニィ/ n i:",
79
- "ヌゥ/ n u:",
80
- "ヌャ/ ny a",
81
- "ヌュ/ ny u",
82
- "ヌョ/ ny o",
83
- "ネェ/ n e:",
84
- "ノォ/ n o:",
85
- "ハァ/ h a:",
86
- "ヒィ/ h i:",
87
- "フゥ/ f u:",
88
- "フャ/ hy a",
89
- "フュ/ hy u",
90
- "フョ/ hy o",
91
- "ヘェ/ h e:",
92
- "ホォ/ h o:",
93
- "バァ/ b a:",
94
- "ビィ/ b i:",
95
- "ブゥ/ b u:",
96
- "フャ/ hy a",
97
- "ブュ/ by u",
98
- "フョ/ hy o",
99
- "ベェ/ b e:",
100
- "ボォ/ b o:",
101
- "パァ/ p a:",
102
- "ピィ/ p i:",
103
- "プゥ/ p u:",
104
- "プャ/ py a",
105
- "プュ/ py u",
106
- "プョ/ py o",
107
- "ペェ/ p e:",
108
- "ポォ/ p o:",
109
- "マァ/ m a:",
110
- "ミィ/ m i:",
111
- "ムゥ/ m u:",
112
- "ムャ/ my a",
113
- "ムュ/ my u",
114
- "ムョ/ my o",
115
- "メェ/ m e:",
116
- "モォ/ m o:",
117
- "ヤァ/ y a:",
118
- "ユゥ/ y u:",
119
- "ユャ/ y a:",
120
- "ユュ/ y u:",
121
- "ユョ/ y o:",
122
- "ヨォ/ y o:",
123
- "ラァ/ r a:",
124
- "リィ/ r i:",
125
- "ルゥ/ r u:",
126
- "ルャ/ ry a",
127
- "ルュ/ ry u",
128
- "ルョ/ ry o",
129
- "レェ/ r e:",
130
- "ロォ/ r o:",
131
- "ワァ/ w a:",
132
- "ヲォ/ o:",
133
- "ディ/ d i",
134
- "デェ/ d e:",
135
- "デャ/ dy a",
136
- "デュ/ dy u",
137
- "デョ/ dy o",
138
- "ティ/ t i",
139
- "テェ/ t e:",
140
- "テャ/ ty a",
141
- "テュ/ ty u",
142
- "テョ/ ty o",
143
- "スィ/ s i",
144
- "ズァ/ z u a",
145
- "ズィ/ z i",
146
- "ズゥ/ z u",
147
- "ズャ/ zy a",
148
- "ズュ/ zy u",
149
- "ズョ/ zy o",
150
- "ズェ/ z e",
151
- "ズォ/ z o",
152
- "キャ/ ky a",
153
- "キュ/ ky u",
154
- "キョ/ ky o",
155
- "シャ/ sh a",
156
- "シュ/ sh u",
157
- "シェ/ sh e",
158
- "ショ/ sh o",
159
- "チャ/ ch a",
160
- "チュ/ ch u",
161
- "チェ/ ch e",
162
- "チョ/ ch o",
163
- "トゥ/ t u",
164
- "トャ/ ty a",
165
- "トュ/ ty u",
166
- "トョ/ ty o",
167
- "ドァ/ d o a",
168
- "ドゥ/ d u",
169
- "ドャ/ dy a",
170
- "ドュ/ dy u",
171
- "ドョ/ dy o",
172
- "ドォ/ d o:",
173
- "ニャ/ ny a",
174
- "ニュ/ ny u",
175
- "ニョ/ ny o",
176
- "ヒャ/ hy a",
177
- "ヒュ/ hy u",
178
- "ヒョ/ hy o",
179
- "ミャ/ my a",
180
- "ミュ/ my u",
181
- "ミョ/ my o",
182
- "リャ/ ry a",
183
- "リュ/ ry u",
184
- "リョ/ ry o",
185
- "ギャ/ gy a",
186
- "ギュ/ gy u",
187
- "ギョ/ gy o",
188
- "ヂェ/ j e",
189
- "ヂャ/ j a",
190
- "ヂュ/ j u",
191
- "ヂョ/ j o",
192
- "ジェ/ j e",
193
- "ジャ/ j a",
194
- "ジュ/ j u",
195
- "ジョ/ j o",
196
- "ビャ/ by a",
197
- "ビュ/ by u",
198
- "ビョ/ by o",
199
- "ピャ/ py a",
200
- "ピュ/ py u",
201
- "ピョ/ py o",
202
- "ウァ/ u a",
203
- "ウィ/ w i",
204
- "ウェ/ w e",
205
- "ウォ/ w o",
206
- "ファ/ f a",
207
- "フィ/ f i",
208
- "フゥ/ f u",
209
- "フャ/ hy a",
210
- "フュ/ hy u",
211
- "フョ/ hy o",
212
- "フェ/ f e",
213
- "フォ/ f o",
214
- "ヴァ/ b a",
215
- "ヴィ/ b i",
216
- "ヴェ/ b e",
217
- "ヴォ/ b o",
218
- "ヴュ/ by u",
219
- "アー/ a:",
220
- "イー/ i:",
221
- "ウー/ u:",
222
- "エー/ e:",
223
- "オー/ o:",
224
- "カー/ k a:",
225
- "キー/ k i:",
226
- "クー/ k u:",
227
- "ケー/ k e:",
228
- "コー/ k o:",
229
- "サー/ s a:",
230
- "シー/ sh i:",
231
- "スー/ s u:",
232
- "セー/ s e:",
233
- "ソー/ s o:",
234
- "ター/ t a:",
235
- "チー/ ch i:",
236
- "ツー/ ts u:",
237
- "テー/ t e:",
238
- "トー/ t o:",
239
- "ナー/ n a:",
240
- "ニー/ n i:",
241
- "��ー/ n u:",
242
- "ネー/ n e:",
243
- "ノー/ n o:",
244
- "ハー/ h a:",
245
- "ヒー/ h i:",
246
- "フー/ f u:",
247
- "ヘー/ h e:",
248
- "ホー/ h o:",
249
- "マー/ m a:",
250
- "ミー/ m i:",
251
- "ムー/ m u:",
252
- "メー/ m e:",
253
- "モー/ m o:",
254
- "ラー/ r a:",
255
- "リー/ r i:",
256
- "ルー/ r u:",
257
- "レー/ r e:",
258
- "ロー/ r o:",
259
- "ガー/ g a:",
260
- "ギー/ g i:",
261
- "グー/ g u:",
262
- "ゲー/ g e:",
263
- "ゴー/ g o:",
264
- "ザー/ z a:",
265
- "ジー/ j i:",
266
- "ズー/ z u:",
267
- "ゼー/ z e:",
268
- "ゾー/ z o:",
269
- "ダー/ d a:",
270
- "ヂー/ j i:",
271
- "ヅー/ z u:",
272
- "デー/ d e:",
273
- "ドー/ d o:",
274
- "バー/ b a:",
275
- "ビー/ b i:",
276
- "ブー/ b u:",
277
- "ベー/ b e:",
278
- "ボー/ b o:",
279
- "パー/ p a:",
280
- "ピー/ p i:",
281
- "プー/ p u:",
282
- "ペー/ p e:",
283
- "ポー/ p o:",
284
- "ヤー/ y a:",
285
- "ユー/ y u:",
286
- "ヨー/ y o:",
287
- "ワー/ w a:",
288
- "ヰー/ i:",
289
- "ヱー/ e:",
290
- "ヲー/ o:",
291
- "ヴー/ b u:",
292
- # Conversion of 1 letter
293
- "ア/ a",
294
- "イ/ i",
295
- "ウ/ u",
296
- "エ/ e",
297
- "オ/ o",
298
- "カ/ k a",
299
- "キ/ k i",
300
- "ク/ k u",
301
- "ケ/ k e",
302
- "コ/ k o",
303
- "サ/ s a",
304
- "シ/ sh i",
305
- "ス/ s u",
306
- "セ/ s e",
307
- "ソ/ s o",
308
- "タ/ t a",
309
- "チ/ ch i",
310
- "ツ/ ts u",
311
- "テ/ t e",
312
- "ト/ t o",
313
- "ナ/ n a",
314
- "ニ/ n i",
315
- "ヌ/ n u",
316
- "ネ/ n e",
317
- "ノ/ n o",
318
- "ハ/ h a",
319
- "ヒ/ h i",
320
- "フ/ f u",
321
- "ヘ/ h e",
322
- "ホ/ h o",
323
- "マ/ m a",
324
- "ミ/ m i",
325
- "ム/ m u",
326
- "メ/ m e",
327
- "モ/ m o",
328
- "ラ/ r a",
329
- "リ/ r i",
330
- "ル/ r u",
331
- "レ/ r e",
332
- "ロ/ r o",
333
- "ガ/ g a",
334
- "ギ/ g i",
335
- "グ/ g u",
336
- "ゲ/ g e",
337
- "ゴ/ g o",
338
- "ザ/ z a",
339
- "ジ/ j i",
340
- "ズ/ z u",
341
- "ゼ/ z e",
342
- "ゾ/ z o",
343
- "ダ/ d a",
344
- "ヂ/ j i",
345
- "ヅ/ z u",
346
- "デ/ d e",
347
- "ド/ d o",
348
- "バ/ b a",
349
- "ビ/ b i",
350
- "ブ/ b u",
351
- "ベ/ b e",
352
- "ボ/ b o",
353
- "パ/ p a",
354
- "ピ/ p i",
355
- "プ/ p u",
356
- "ペ/ p e",
357
- "ポ/ p o",
358
- "ヤ/ y a",
359
- "ユ/ y u",
360
- "ヨ/ y o",
361
- "ワ/ w a",
362
- "ヰ/ i",
363
- "ヱ/ e",
364
- "ヲ/ o",
365
- "ン/ N",
366
- "ッ/ q",
367
- "ヴ/ b u",
368
- "ー/:", #这个不起作用
369
- # Try converting broken text
370
- "ァ/ a",
371
- "ィ/ i",
372
- "ゥ/ u",
373
- "ェ/ e",
374
- "ォ/ o",
375
- "ヮ/ w a",
376
- "ォ/ o",
377
- # Symbols
378
- "、/ ,",
379
- "。/ .",
380
- "!/ !",
381
- "?/ ?",
382
- "・/ ,",
383
- ]
384
-
385
- _COLON_RX = re.compile(":+")
386
- _REJECT_RX = re.compile("[^ a-zA-Z:,.?]")
387
-
388
-
389
- def _makerulemap():
390
- l = [tuple(x.split("/")) for x in _CONVRULES]
391
- return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2))
392
-
393
-
394
- _RULEMAP1, _RULEMAP2 = _makerulemap()
395
-
396
-
397
- def kata2phoneme(text: str) -> str:
398
- """Convert katakana text to phonemes."""
399
- text = text.strip()
400
- res = []
401
- while text:
402
- if len(text) >= 2:
403
- x = _RULEMAP2.get(text[:2])
404
- if x is not None:
405
- text = text[2:]
406
- res += x.split(" ")[1:]
407
- continue
408
- x = _RULEMAP1.get(text[0])
409
- if x is not None:
410
- text = text[1:]
411
- res += x.split(" ")[1:]
412
- continue
413
- res.append(text[0])
414
- text = text[1:]
415
- # res = _COLON_RX.sub(":", res)
416
- return res
417
-
418
-
419
- _KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1))
420
- _HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1))
421
- _HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
422
-
423
-
424
- def hira2kata(text: str) -> str:
425
- text = text.translate(_HIRA2KATATRANS)
426
- return text.replace("う゛", "ヴ")
427
-
428
-
429
- _SYMBOL_TOKENS = set(list("・、。?!"))
430
- _NO_YOMI_TOKENS = set(list("「」『』―()[][]"))
431
- _TAGGER = MeCab.Tagger()
432
-
433
-
434
- def text2kata(text: str) -> str:
435
- parsed = _TAGGER.parse(text)
436
- res = []
437
- for line in parsed.split("\n"):
438
- if line == "EOS":
439
- break
440
- parts = line.split("\t")
441
-
442
- word, yomi = parts[0], parts[1]
443
- if yomi:
444
- res.append(yomi)
445
- else:
446
- if word in _SYMBOL_TOKENS:
447
- res.append(word)
448
- elif word in ("っ", "ッ"):
449
- res.append("ッ")
450
- elif word in _NO_YOMI_TOKENS:
451
- pass
452
- else:
453
- res.append(word)
454
- return hira2kata("".join(res))
455
-
456
-
457
- def text2sep_kata(text: str) -> (list, list):
458
- parsed = _TAGGER.parse(text)
459
- res = []
460
- sep = []
461
- for line in parsed.split("\n"):
462
- if line == "EOS":
463
- break
464
- parts = line.split("\t")
465
-
466
- word, yomi = parts[0], parts[1]
467
- if yomi:
468
- res.append(yomi)
469
- else:
470
- if word in _SYMBOL_TOKENS:
471
- res.append(word)
472
- elif word in ("っ", "ッ"):
473
- res.append("ッ")
474
- elif word in _NO_YOMI_TOKENS:
475
- pass
476
- else:
477
- res.append(word)
478
- sep.append(word)
479
- return sep, [hira2kata(i) for i in res]
480
-
481
-
482
- _ALPHASYMBOL_YOMI = {
483
- "#": "シャープ",
484
- "%": "パーセント",
485
- "&": "アンド",
486
- "+": "プラス",
487
- "-": "マイナス",
488
- ":": "コロン",
489
- ";": "セミコロン",
490
- "<": "小なり",
491
- "=": "イコール",
492
- ">": "大なり",
493
- "@": "アット",
494
- "a": "エー",
495
- "b": "ビー",
496
- "c": "シー",
497
- "d": "ディー",
498
- "e": "イー",
499
- "f": "エフ",
500
- "g": "ジー",
501
- "h": "エイチ",
502
- "i": "アイ",
503
- "j": "ジェー",
504
- "k": "ケー",
505
- "l": "エル",
506
- "m": "エム",
507
- "n": "エヌ",
508
- "o": "オー",
509
- "p": "ピー",
510
- "q": "キュー",
511
- "r": "アール",
512
- "s": "エス",
513
- "t": "ティー",
514
- "u": "ユー",
515
- "v": "ブイ",
516
- "w": "ダブリュー",
517
- "x": "エックス",
518
- "y": "ワイ",
519
- "z": "ゼット",
520
- "α": "アルファ",
521
- "β": "ベータ",
522
- "γ": "ガンマ",
523
- "δ": "デルタ",
524
- "ε": "イプシロン",
525
- "ζ": "ゼータ",
526
- "η": "イータ",
527
- "θ": "シータ",
528
- "ι": "イオタ",
529
- "κ": "カッパ",
530
- "λ": "ラムダ",
531
- "μ": "ミュー",
532
- "ν": "ニュー",
533
- "ξ": "クサイ",
534
- "ο": "オミクロン",
535
- "π": "パイ",
536
- "ρ": "ロー",
537
- "σ": "シグマ",
538
- "τ": "タウ",
539
- "υ": "ウプシロン",
540
- "φ": "ファイ",
541
- "χ": "カイ",
542
- "ψ": "プサイ",
543
- "ω": "オメガ",
544
- }
545
-
546
-
547
- _NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
548
- _CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
549
- _CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
550
- _NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
551
-
552
-
553
- def japanese_convert_numbers_to_words(text: str) -> str:
554
- res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
555
- res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
556
- res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
557
- return res
558
-
559
-
560
- def japanese_convert_alpha_symbols_to_words(text: str) -> str:
561
- return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])
562
-
563
-
564
- def japanese_text_to_phonemes(text: str) -> str:
565
- """Convert Japanese text to phonemes."""
566
- res = unicodedata.normalize("NFKC", text)
567
- res = japanese_convert_numbers_to_words(res)
568
- # res = japanese_convert_alpha_symbols_to_words(res)
569
- res = text2kata(res)
570
- res = kata2phoneme(res)
571
- return res
572
-
573
-
574
- def is_japanese_character(char):
575
- # Unicode ranges of the Japanese writing system
576
- japanese_ranges = [
577
- (0x3040, 0x309F), # Hiragana
578
- (0x30A0, 0x30FF), # Katakana
579
- (0x4E00, 0x9FFF), # Kanji (CJK Unified Ideographs)
580
- (0x3400, 0x4DBF), # CJK Extension A
581
- (0x20000, 0x2A6DF), # CJK Extension B
582
- # Other CJK extension ranges can be added as needed
583
- ]
584
-
585
- # Convert the character to its Unicode code point (integer)
586
- char_code = ord(char)
587
-
588
- # Check whether the character falls in any Japanese range
589
- for start, end in japanese_ranges:
590
- if start <= char_code <= end:
591
- return True
592
-
593
- return False
594
-
595
-
596
- rep_map = {
597
- ":": ",",
598
- ";": ",",
599
- ",": ",",
600
- "。": ".",
601
- "!": "!",
602
- "?": "?",
603
- "\n": ".",
604
- "·": ",",
605
- "、": ",",
606
- "…": "...",
607
- }
608
-
609
-
610
- def replace_punctuation(text):
611
- pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
612
-
613
- replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
614
-
615
- replaced_text = re.sub(
616
- r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF"
617
- + "".join(punctuation)
618
- + r"]+",
619
- "",
620
- replaced_text,
621
- )
622
-
623
- return replaced_text
624
-
625
-
626
- def text_normalize(text):
627
- res = unicodedata.normalize("NFKC", text)
628
- res = japanese_convert_numbers_to_words(res)
629
- # res = "".join([i for i in res if is_japanese_character(i)])
630
- res = replace_punctuation(res)
631
- return res
632
-
633
-
634
- def distribute_phone(n_phone, n_word):
635
- phones_per_word = [0] * n_word
636
- for task in range(n_phone):
637
- min_tasks = min(phones_per_word)
638
- min_index = phones_per_word.index(min_tasks)
639
- phones_per_word[min_index] += 1
640
- return phones_per_word
641
-
642
-
643
- tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
644
-
645
-
646
- def g2p(norm_text):
647
- sep_text, sep_kata = text2sep_kata(norm_text)
648
- sep_tokenized = [tokenizer.tokenize(i) for i in sep_text]
649
- sep_phonemes = [kata2phoneme(i) for i in sep_kata]
650
- # Error handling: words MeCab does not recognize propagate all the way here and crash. So far only extremely rare, obscure words trigger this.
651
- for i in sep_phonemes:
652
- for j in i:
653
- assert j in symbols, (sep_text, sep_kata, sep_phonemes)
654
-
655
- word2ph = []
656
- for token, phoneme in zip(sep_tokenized, sep_phonemes):
657
- phone_len = len(phoneme)
658
- word_len = len(token)
659
-
660
- aaa = distribute_phone(phone_len, word_len)
661
- word2ph += aaa
662
- phones = ["_"] + [j for i in sep_phonemes for j in i] + ["_"]
663
- tones = [0 for i in phones]
664
- word2ph = [1] + word2ph + [1]
665
- return phones, tones, word2ph
666
-
667
- if __name__ == "__main__":
668
- tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
669
- text = "だったら私、スズカさんと同じチームに入りたいです! スズカさんの走りを毎日近くで、なんなら真横から見ていたいので!"
670
- #print(_TAGGER.parse(text))
671
- # nodes = [{"surface": "こんにちは", "pos": "感動詞:*:*:*", "pron": "コンニチワ", "c_type": "*", "c_form": "*", "accent_type": 0, "accent_con_type": "-1", "chain_flag": -1}]
672
- nodes = [{"surface":"こんにちは","pron": "コンニチワ","pos": "感動詞:*:*:*",}]
673
- from text.japanese_bert import get_bert_feature
674
- import pyopenjtalk
675
- from marine.predict import Predictor
676
- from marine.utils.openjtalk_util import convert_njd_feature_to_marine_feature
677
- text = text_normalize(text)
678
- NJD_NODES = pyopenjtalk.run_frontend(text)
679
- predictor = Predictor()
680
- # important_info = [{"string":i["string"],"pron":i["pron"],"acc":i["acc"]}for i in pyopenjtalk.estimate_accent(NJD_NODES)]
681
- print(text)
682
-
683
- marine_feature = convert_njd_feature_to_marine_feature(NJD_NODES)
684
- results = predictor.predict([marine_feature])
685
- for mora,acc in zip(results["mora"][0],results["accent_status"][0]):
686
- print(f"{mora}:{acc}")
687
- # for i in pyopenjtalk.estimate_accent(NJD_NODES):
688
- # print(f"{i['string']}:{i['pron']}:{i['acc']}")
689
- # info = pyopenjtalk.extract_fullcontext(text,run_marine=True)
690
- # info_nomarine = pyopenjtalk.extract_fullcontext(text,run_marine=False)
691
- # # nodes = pyopenjtalk
692
- # # print(info)
693
- # for i,j in zip(info,info_nomarine):
694
- # print(i)
695
- # print(j)
696
- # print("\n")
697
- # predictor = Predictor()
698
- #print(pyopenjtalk.estimate_accent(text))
699
- # output = predictor.predict([nodes],accent_represent_mode="high_low")
700
- #print(output)
701
- # phones, tones, word2ph = g2p(text)
702
- # bert = get_bert_feature(text, word2ph)
703
-
704
- # print(phones, tones, word2ph, bert.shape)
 
1
+ # Convert Japanese text to phonemes which is
2
+ # compatible with Julius https://github.com/julius-speech/segmentation-kit
3
+ import re
4
+ import unicodedata
5
+
6
+ from transformers import AutoTokenizer
7
+
8
+ from text import punctuation, symbols
9
+
10
+ try:
11
+ import MeCab
12
+ except ImportError as e:
13
+ raise ImportError("Japanese requires mecab-python3 and unidic-lite.") from e
14
+ from num2words import num2words
15
+
16
+ _CONVRULES = [
17
+ # Conversion of 2 letters
18
+ "アァ/ a a",
19
+ "イィ/ i i",
20
+ "イェ/ i e",
21
+ "イャ/ y a",
22
+ "ウゥ/ u:",
23
+ "エェ/ e e",
24
+ "オォ/ o:",
25
+ "カァ/ k a:",
26
+ "キィ/ k i:",
27
+ "クゥ/ k u:",
28
+ "クャ/ ky a",
29
+ "クュ/ ky u",
30
+ "クョ/ ky o",
31
+ "ケェ/ k e:",
32
+ "コォ/ k o:",
33
+ "ガァ/ g a:",
34
+ "ギィ/ g i:",
35
+ "グゥ/ g u:",
36
+ "グャ/ gy a",
37
+ "グュ/ gy u",
38
+ "グョ/ gy o",
39
+ "ゲェ/ g e:",
40
+ "ゴォ/ g o:",
41
+ "サァ/ s a:",
42
+ "シィ/ sh i:",
43
+ "スゥ/ s u:",
44
+ "スャ/ sh a",
45
+ "スュ/ sh u",
46
+ "スョ/ sh o",
47
+ "セェ/ s e:",
48
+ "ソォ/ s o:",
49
+ "ザァ/ z a:",
50
+ "ジィ/ j i:",
51
+ "ズゥ/ z u:",
52
+ "ズャ/ zy a",
53
+ "ズュ/ zy u",
54
+ "ズョ/ zy o",
55
+ "ゼェ/ z e:",
56
+ "ゾォ/ z o:",
57
+ "タァ/ t a:",
58
+ "チィ/ ch i:",
59
+ "ツァ/ ts a",
60
+ "ツィ/ ts i",
61
+ "ツゥ/ ts u:",
62
+ "ツャ/ ch a",
63
+ "ツュ/ ch u",
64
+ "ツョ/ ch o",
65
+ "ツェ/ ts e",
66
+ "ツォ/ ts o",
67
+ "テェ/ t e:",
68
+ "トォ/ t o:",
69
+ "ダァ/ d a:",
70
+ "ヂィ/ j i:",
71
+ "ヅゥ/ d u:",
72
+ "ヅャ/ zy a",
73
+ "ヅュ/ zy u",
74
+ "ヅョ/ zy o",
75
+ "デェ/ d e:",
76
+ "ドォ/ d o:",
77
+ "ナァ/ n a:",
78
+ "ニィ/ n i:",
79
+ "ヌゥ/ n u:",
80
+ "ヌャ/ ny a",
81
+ "ヌュ/ ny u",
82
+ "ヌョ/ ny o",
83
+ "ネェ/ n e:",
84
+ "ノォ/ n o:",
85
+ "ハァ/ h a:",
86
+ "ヒィ/ h i:",
87
+ "フゥ/ f u:",
88
+ "フャ/ hy a",
89
+ "フュ/ hy u",
90
+ "フョ/ hy o",
91
+ "ヘェ/ h e:",
92
+ "ホォ/ h o:",
93
+ "バァ/ b a:",
94
+ "ビィ/ b i:",
95
+ "ブゥ/ b u:",
96
+ "フャ/ hy a",
97
+ "ブュ/ by u",
98
+ "フョ/ hy o",
99
+ "ベェ/ b e:",
100
+ "ボォ/ b o:",
101
+ "パァ/ p a:",
102
+ "ピィ/ p i:",
103
+ "プゥ/ p u:",
104
+ "プャ/ py a",
105
+ "プュ/ py u",
106
+ "プョ/ py o",
107
+ "ペェ/ p e:",
108
+ "ポォ/ p o:",
109
+ "マァ/ m a:",
110
+ "ミィ/ m i:",
111
+ "ムゥ/ m u:",
112
+ "ムャ/ my a",
113
+ "ムュ/ my u",
114
+ "ムョ/ my o",
115
+ "メェ/ m e:",
116
+ "モォ/ m o:",
117
+ "ヤァ/ y a:",
118
+ "ユゥ/ y u:",
119
+ "ユャ/ y a:",
120
+ "ユュ/ y u:",
121
+ "ユョ/ y o:",
122
+ "ヨォ/ y o:",
123
+ "ラァ/ r a:",
124
+ "リィ/ r i:",
125
+ "ルゥ/ r u:",
126
+ "ルャ/ ry a",
127
+ "��ュ/ ry u",
128
+ "ルョ/ ry o",
129
+ "レェ/ r e:",
130
+ "ロォ/ r o:",
131
+ "ワァ/ w a:",
132
+ "ヲォ/ o:",
133
+ "ディ/ d i",
134
+ "デェ/ d e:",
135
+ "デャ/ dy a",
136
+ "デュ/ dy u",
137
+ "デョ/ dy o",
138
+ "ティ/ t i",
139
+ "テェ/ t e:",
140
+ "テャ/ ty a",
141
+ "テュ/ ty u",
142
+ "テョ/ ty o",
143
+ "スィ/ s i",
144
+ "ズァ/ z u a",
145
+ "ズィ/ z i",
146
+ "ズゥ/ z u",
147
+ "ズャ/ zy a",
148
+ "ズュ/ zy u",
149
+ "ズョ/ zy o",
150
+ "ズェ/ z e",
151
+ "ズォ/ z o",
152
+ "キャ/ ky a",
153
+ "キュ/ ky u",
154
+ "キョ/ ky o",
155
+ "シャ/ sh a",
156
+ "シュ/ sh u",
157
+ "シェ/ sh e",
158
+ "ショ/ sh o",
159
+ "チャ/ ch a",
160
+ "チュ/ ch u",
161
+ "チェ/ ch e",
162
+ "チョ/ ch o",
163
+ "トゥ/ t u",
164
+ "トャ/ ty a",
165
+ "トュ/ ty u",
166
+ "トョ/ ty o",
167
+ "ドァ/ d o a",
168
+ "ドゥ/ d u",
169
+ "ドャ/ dy a",
170
+ "ドュ/ dy u",
171
+ "ドョ/ dy o",
172
+ "ドォ/ d o:",
173
+ "ニャ/ ny a",
174
+ "ニュ/ ny u",
175
+ "ニョ/ ny o",
176
+ "ヒャ/ hy a",
177
+ "ヒュ/ hy u",
178
+ "ヒョ/ hy o",
179
+ "ミャ/ my a",
180
+ "ミュ/ my u",
181
+ "ミョ/ my o",
182
+ "リャ/ ry a",
183
+ "リュ/ ry u",
184
+ "リョ/ ry o",
185
+ "ギャ/ gy a",
186
+ "ギュ/ gy u",
187
+ "ギョ/ gy o",
188
+ "ヂェ/ j e",
189
+ "ヂャ/ j a",
190
+ "ヂュ/ j u",
191
+ "ヂョ/ j o",
192
+ "ジェ/ j e",
193
+ "ジャ/ j a",
194
+ "ジュ/ j u",
195
+ "ジョ/ j o",
196
+ "ビャ/ by a",
197
+ "ビュ/ by u",
198
+ "ビョ/ by o",
199
+ "ピャ/ py a",
200
+ "ピュ/ py u",
201
+ "ピョ/ py o",
202
+ "ウァ/ u a",
203
+ "ウィ/ w i",
204
+ "ウェ/ w e",
205
+ "ウォ/ w o",
206
+ "ファ/ f a",
207
+ "フィ/ f i",
208
+ "フゥ/ f u",
209
+ "フャ/ hy a",
210
+ "フュ/ hy u",
211
+ "フョ/ hy o",
212
+ "フェ/ f e",
213
+ "フォ/ f o",
214
+ "ヴァ/ b a",
215
+ "ヴィ/ b i",
216
+ "ヴェ/ b e",
217
+ "ヴォ/ b o",
218
+ "ヴュ/ by u",
219
+ "アー/ a:",
220
+ "イー/ i:",
221
+ "ウー/ u:",
222
+ "エー/ e:",
223
+ "オー/ o:",
224
+ "カー/ k a:",
225
+ "キー/ k i:",
226
+ "クー/ k u:",
227
+ "ケー/ k e:",
228
+ "コー/ k o:",
229
+ "サー/ s a:",
230
+ "シー/ sh i:",
231
+ "スー/ s u:",
232
+ "セー/ s e:",
233
+ "ソー/ s o:",
234
+ "ター/ t a:",
235
+ "チー/ ch i:",
236
+ "ツー/ ts u:",
237
+ "テー/ t e:",
238
+ "トー/ t o:",
239
+ "ナー/ n a:",
240
+ "ニー/ n i:",
241
+ "ヌー/ n u:",
242
+ "ネー/ n e:",
243
+ "ノー/ n o:",
244
+ "ハー/ h a:",
245
+ "ヒー/ h i:",
246
+ "フー/ f u:",
247
+ "ヘー/ h e:",
248
+ "ホー/ h o:",
249
+ "マー/ m a:",
250
+ "ミー/ m i:",
251
+ "ムー/ m u:",
252
+ "メー/ m e:",
253
+ "モー/ m o:",
254
+ "ラー/ r a:",
255
+ "リー/ r i:",
256
+ "ルー/ r u:",
257
+ "レー/ r e:",
258
+ "ロー/ r o:",
259
+ "ガー/ g a:",
260
+ "ギー/ g i:",
261
+ "グー/ g u:",
262
+ "ゲー/ g e:",
263
+ "ゴー/ g o:",
264
+ "ザー/ z a:",
265
+ "ジー/ j i:",
266
+ "ズー/ z u:",
267
+ "ゼー/ z e:",
268
+ "ゾー/ z o:",
269
+ "ダー/ d a:",
270
+ "ヂー/ j i:",
271
+ "ヅー/ z u:",
272
+ "デー/ d e:",
273
+ "ドー/ d o:",
274
+ "バー/ b a:",
275
+ "ビー/ b i:",
276
+ "ブー/ b u:",
277
+ "ベー/ b e:",
278
+ "ボー/ b o:",
279
+ "パー/ p a:",
280
+ "ピー/ p i:",
281
+ "プー/ p u:",
282
+ "ペー/ p e:",
283
+ "ポー/ p o:",
284
+ "ヤー/ y a:",
285
+ "ユー/ y u:",
286
+ "ヨー/ y o:",
287
+ "ワー/ w a:",
288
+ "ヰー/ i:",
289
+ "ヱー/ e:",
290
+ "ヲー/ o:",
291
+ "ヴー/ b u:",
292
+ # Conversion of 1 letter
293
+ "ア/ a",
294
+ "イ/ i",
295
+ "ウ/ u",
296
+ "エ/ e",
297
+ "オ/ o",
298
+ "カ/ k a",
299
+ "キ/ k i",
300
+ "ク/ k u",
301
+ "ケ/ k e",
302
+ "コ/ k o",
303
+ "サ/ s a",
304
+ "シ/ sh i",
305
+ "ス/ s u",
306
+ "セ/ s e",
307
+ "ソ/ s o",
308
+ "タ/ t a",
309
+ "チ/ ch i",
310
+ "ツ/ ts u",
311
+ "テ/ t e",
312
+ "ト/ t o",
313
+ "ナ/ n a",
314
+ "ニ/ n i",
315
+ "ヌ/ n u",
316
+ "ネ/ n e",
317
+ "ノ/ n o",
318
+ "ハ/ h a",
319
+ "ヒ/ h i",
320
+ "フ/ f u",
321
+ "ヘ/ h e",
322
+ "ホ/ h o",
323
+ "マ/ m a",
324
+ "ミ/ m i",
325
+ "ム/ m u",
326
+ "メ/ m e",
327
+ "モ/ m o",
328
+ "ラ/ r a",
329
+ "リ/ r i",
330
+ "ル/ r u",
331
+ "レ/ r e",
332
+ "ロ/ r o",
333
+ "ガ/ g a",
334
+ "ギ/ g i",
335
+ "グ/ g u",
336
+ "ゲ/ g e",
337
+ "ゴ/ g o",
338
+ "ザ/ z a",
339
+ "ジ/ j i",
340
+ "ズ/ z u",
341
+ "ゼ/ z e",
342
+ "ゾ/ z o",
343
+ "ダ/ d a",
344
+ "ヂ/ j i",
345
+ "ヅ/ z u",
346
+ "デ/ d e",
347
+ "ド/ d o",
348
+ "バ/ b a",
349
+ "ビ/ b i",
350
+ "ブ/ b u",
351
+ "ベ/ b e",
352
+ "ボ/ b o",
353
+ "パ/ p a",
354
+ "ピ/ p i",
355
+ "プ/ p u",
356
+ "ペ/ p e",
357
+ "ポ/ p o",
358
+ "ヤ/ y a",
359
+ "ユ/ y u",
360
+ "ヨ/ y o",
361
+ "ワ/ w a",
362
+ "ヰ/ i",
363
+ "ヱ/ e",
364
+ "ヲ/ o",
365
+ "ン/ N",
366
+ "ッ/ q",
367
+ "ヴ/ b u",
368
+ "ー/:", #这个不起作用
369
+ # Try converting broken text
370
+ "ァ/ a",
371
+ "ィ/ i",
372
+ "ゥ/ u",
373
+ "ェ/ e",
374
+ "ォ/ o",
375
+ "ヮ/ w a",
376
+ "ォ/ o",
377
+ # Symbols
378
+ "、/ ,",
379
+ "。/ .",
380
+ "!/ !",
381
+ "?/ ?",
382
+ "・/ ,",
383
+ ]
384
+
385
+ _COLON_RX = re.compile(":+")
386
+ _REJECT_RX = re.compile("[^ a-zA-Z:,.?]")
387
+
388
+
389
+ def _makerulemap():
390
+ l = [tuple(x.split("/")) for x in _CONVRULES]
391
+ return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2))
392
+
393
+
394
+ _RULEMAP1, _RULEMAP2 = _makerulemap()
395
+
396
+
397
+ def kata2phoneme(text: str) -> str:
398
+ """Convert katakana text to phonemes."""
399
+ text = text.strip()
400
+ res = []
401
+ while text:
402
+ if len(text) >= 2:
403
+ x = _RULEMAP2.get(text[:2])
404
+ if x is not None:
405
+ text = text[2:]
406
+ res += x.split(" ")[1:]
407
+ continue
408
+ x = _RULEMAP1.get(text[0])
409
+ if x is not None:
410
+ text = text[1:]
411
+ res += x.split(" ")[1:]
412
+ continue
413
+ res.append(text[0])
414
+ text = text[1:]
415
+ # res = _COLON_RX.sub(":", res)
416
+ return res
417
+
418
+
419
+ _KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1))
420
+ _HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1))
421
+ _HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
422
+
423
+
424
+ def hira2kata(text: str) -> str:
425
+ text = text.translate(_HIRA2KATATRANS)
426
+ return text.replace("う゛", "ヴ")
427
+
428
+
429
+ _SYMBOL_TOKENS = set(list("・、。?!"))
430
+ _NO_YOMI_TOKENS = set(list("「」『』―()[][]"))
431
+ _TAGGER = MeCab.Tagger()
432
+
433
+
434
+ def text2kata(text: str) -> str:
435
+ parsed = _TAGGER.parse(text)
436
+ res = []
437
+ for line in parsed.split("\n"):
438
+ if line == "EOS":
439
+ break
440
+ parts = line.split("\t")
441
+
442
+ word, yomi = parts[0], parts[1]
443
+ if yomi:
444
+ res.append(yomi)
445
+ else:
446
+ if word in _SYMBOL_TOKENS:
447
+ res.append(word)
448
+ elif word in ("っ", "ッ"):
449
+ res.append("ッ")
450
+ elif word in _NO_YOMI_TOKENS:
451
+ pass
452
+ else:
453
+ res.append(word)
454
+ return hira2kata("".join(res))
455
+
456
+
457
+ def text2sep_kata(text: str) -> (list, list):
458
+ parsed = _TAGGER.parse(text)
459
+ res = []
460
+ sep = []
461
+ for line in parsed.split("\n"):
462
+ if line == "EOS":
463
+ break
464
+ parts = line.split("\t")
465
+
466
+ word, yomi = parts[0], parts[1]
467
+ if yomi:
468
+ res.append(yomi)
469
+ else:
470
+ if word in _SYMBOL_TOKENS:
471
+ res.append(word)
472
+ elif word in ("っ", "ッ"):
473
+ res.append("ッ")
474
+ elif word in _NO_YOMI_TOKENS:
475
+ pass
476
+ else:
477
+ res.append(word)
478
+ sep.append(word)
479
+ return sep, [hira2kata(i) for i in res]
480
+
481
+
482
+ _ALPHASYMBOL_YOMI = {
483
+ "#": "シャープ",
484
+ "%": "パーセント",
485
+ "&": "アンド",
486
+ "+": "プラス",
487
+ "-": "マイナス",
488
+ ":": "コロン",
489
+ ";": "セミコロン",
490
+ "<": "小なり",
491
+ "=": "イコール",
492
+ ">": "大なり",
493
+ "@": "アット",
494
+ "a": "エー",
495
+ "b": "ビー",
496
+ "c": "シー",
497
+ "d": "ディー",
498
+ "e": "イー",
499
+ "f": "エフ",
500
+ "g": "ジー",
501
+ "h": "エイチ",
502
+ "i": "アイ",
503
+ "j": "ジェー",
504
+ "k": "ケー",
505
+ "l": "エル",
506
+ "m": "エム",
507
+ "n": "エヌ",
508
+ "o": "オー",
509
+ "p": "ピー",
510
+ "q": "キュー",
511
+ "r": "アール",
512
+ "s": "エス",
513
+ "t": "ティー",
514
+ "u": "ユー",
515
+ "v": "ブイ",
516
+ "w": "ダブリュー",
517
+ "x": "エックス",
518
+ "y": "ワイ",
519
+ "z": "ゼット",
520
+ "α": "アルファ",
521
+ "β": "ベータ",
522
+ "γ": "ガンマ",
523
+ "δ": "デルタ",
524
+ "ε": "イプシロン",
525
+ "ζ": "ゼータ",
526
+ "η": "イータ",
527
+ "θ": "シータ",
528
+ "ι": "イオタ",
529
+ "κ": "カッパ",
530
+ "λ": "ラムダ",
531
+ "μ": "ミュー",
532
+ "ν": "ニュー",
533
+ "ξ": "クサイ",
534
+ "ο": "オミクロン",
535
+ "π": "パイ",
536
+ "ρ": "ロー",
537
+ "σ": "シグマ",
538
+ "τ": "タウ",
539
+ "υ": "ウプシロン",
540
+ "φ": "ファイ",
541
+ "χ": "カイ",
542
+ "ψ": "プサイ",
543
+ "ω": "オメガ",
544
+ }
545
+
546
+
547
+ _NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
548
+ _CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
549
+ _CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
550
+ _NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
551
+
552
+
553
+ def japanese_convert_numbers_to_words(text: str) -> str:
554
+ res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
555
+ res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
556
+ res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
557
+ return res
558
+
559
+
560
+ def japanese_convert_alpha_symbols_to_words(text: str) -> str:
561
+ return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])
562
+
563
+
564
+ def japanese_text_to_phonemes(text: str) -> str:
565
+ """Convert Japanese text to phonemes."""
566
+ res = unicodedata.normalize("NFKC", text)
567
+ res = japanese_convert_numbers_to_words(res)
568
+ # res = japanese_convert_alpha_symbols_to_words(res)
569
+ res = text2kata(res)
570
+ res = kata2phoneme(res)
571
+ return res
572
+
573
+
574
+ def is_japanese_character(char):
575
+ # Unicode ranges of the Japanese writing system
576
+ japanese_ranges = [
577
+ (0x3040, 0x309F), # Hiragana
578
+ (0x30A0, 0x30FF), # Katakana
579
+ (0x4E00, 0x9FFF), # Kanji (CJK Unified Ideographs)
580
+ (0x3400, 0x4DBF), # CJK Extension A
581
+ (0x20000, 0x2A6DF), # CJK Extension B
582
+ # Other CJK extension ranges can be added as needed
583
+ ]
584
+
585
+ # Convert the character to its Unicode code point (integer)
586
+ char_code = ord(char)
587
+
588
+ # Check whether the character falls in any Japanese range
589
+ for start, end in japanese_ranges:
590
+ if start <= char_code <= end:
591
+ return True
592
+
593
+ return False
594
+
595
+
596
+ rep_map = {
597
+ ":": ",",
598
+ ";": ",",
599
+ ",": ",",
600
+ "。": ".",
601
+ "!": "!",
602
+ "?": "?",
603
+ "\n": ".",
604
+ "·": ",",
605
+ "、": ",",
606
+ "…": "...",
607
+ }
608
+
609
+
610
+ def replace_punctuation(text):
611
+ pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
612
+
613
+ replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
614
+
615
+ replaced_text = re.sub(
616
+ r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF"
617
+ + "".join(punctuation)
618
+ + r"]+",
619
+ "",
620
+ replaced_text,
621
+ )
622
+
623
+ return replaced_text
624
+
625
+
626
+ def text_normalize(text):
627
+ res = unicodedata.normalize("NFKC", text)
628
+ res = japanese_convert_numbers_to_words(res)
629
+ # res = "".join([i for i in res if is_japanese_character(i)])
630
+ res = replace_punctuation(res)
631
+ return res
632
+
633
+
634
+ def distribute_phone(n_phone, n_word):
635
+ phones_per_word = [0] * n_word
636
+ for task in range(n_phone):
637
+ min_tasks = min(phones_per_word)
638
+ min_index = phones_per_word.index(min_tasks)
639
+ phones_per_word[min_index] += 1
640
+ return phones_per_word
641
+
642
+
643
+ tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
644
+
645
+
646
+ def g2p(norm_text):
647
+ sep_text, sep_kata = text2sep_kata(norm_text)
648
+ sep_tokenized = [tokenizer.tokenize(i) for i in sep_text]
649
+ sep_phonemes = [kata2phoneme(i) for i in sep_kata]
650
+ # Error handling: words MeCab does not recognize propagate all the way here and crash. So far only extremely rare, obscure words trigger this.
651
+ for i in sep_phonemes:
652
+ for j in i:
653
+ assert j in symbols, (sep_text, sep_kata, sep_phonemes)
654
+
655
+ word2ph = []
656
+ for token, phoneme in zip(sep_tokenized, sep_phonemes):
657
+ phone_len = len(phoneme)
658
+ word_len = len(token)
659
+
660
+ aaa = distribute_phone(phone_len, word_len)
661
+ word2ph += aaa
662
+ phones = ["_"] + [j for i in sep_phonemes for j in i] + ["_"]
663
+ tones = [0 for i in phones]
664
+ word2ph = [1] + word2ph + [1]
665
+ return phones, tones, word2ph
666
+
667
+ if __name__ == "__main__":
668
+ tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
669
+ text = "だったら私、スズカさんと同じチームに入りたいです! スズカさんの走りを毎日近くで、なんなら真横から見ていたいので!"
670
+ #print(_TAGGER.parse(text))
671
+ # nodes = [{"surface": "こんにちは", "pos": "感動詞:*:*:*", "pron": "コンニチワ", "c_type": "*", "c_form": "*", "accent_type": 0, "accent_con_type": "-1", "chain_flag": -1}]
672
+ nodes = [{"surface":"こんにちは","pron": "コンニチワ","pos": "感動詞:*:*:*",}]
673
+ from text.japanese_bert import get_bert_feature
674
+ import pyopenjtalk
675
+ from marine.predict import Predictor
676
+ from marine.utils.openjtalk_util import convert_njd_feature_to_marine_feature
677
+ text = text_normalize(text)
678
+ NJD_NODES = pyopenjtalk.run_frontend(text)
679
+ predictor = Predictor()
680
+ # important_info = [{"string":i["string"],"pron":i["pron"],"acc":i["acc"]}for i in pyopenjtalk.estimate_accent(NJD_NODES)]
681
+ print(text)
682
+
683
+ marine_feature = convert_njd_feature_to_marine_feature(NJD_NODES)
684
+ results = predictor.predict([marine_feature])
685
+ for mora,acc in zip(results["mora"][0],results["accent_status"][0]):
686
+ print(f"{mora}:{acc}")
687
+ # for i in pyopenjtalk.estimate_accent(NJD_NODES):
688
+ # print(f"{i['string']}:{i['pron']}:{i['acc']}")
689
+ # info = pyopenjtalk.extract_fullcontext(text,run_marine=True)
690
+ # info_nomarine = pyopenjtalk.extract_fullcontext(text,run_marine=False)
691
+ # # nodes = pyopenjtalk
692
+ # # print(info)
693
+ # for i,j in zip(info,info_nomarine):
694
+ # print(i)
695
+ # print(j)
696
+ # print("\n")
697
+ # predictor = Predictor()
698
+ #print(pyopenjtalk.estimate_accent(text))
699
+ # output = predictor.predict([nodes],accent_represent_mode="high_low")
700
+ #print(output)
701
+ # phones, tones, word2ph = g2p(text)
702
+ # bert = get_bert_feature(text, word2ph)
703
+
704
+ # print(phones, tones, word2ph, bert.shape)
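As a quick illustration of how the g2p function above spreads a word's phonemes across its subword tokens, here is a minimal sketch that uses only functions defined in this file (the example word and token count are arbitrary):

from text.japanese import kata2phoneme, distribute_phone

# "スズカ" expands to 6 phonemes: ['s', 'u', 'z', 'u', 'k', 'a']
print(kata2phoneme("スズカ"))
# If the tokenizer split that word into 2 subword tokens, g2p would
# assign the 6 phonemes as evenly as possible across them: [3, 3]
print(distribute_phone(6, 2))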
text/japanese_bert.py CHANGED
@@ -1,68 +1,87 @@
1
- import sys
2
-
3
- import torch
4
- from transformers import AutoModelForMaskedLM, AutoTokenizer
5
-
6
- from config import config
7
- from text.japanese import text2sep_kata
8
-
9
- LOCAL_PATH = "./bert/deberta-v2-large-japanese-char-wwm"
10
-
11
- tokenizer = AutoTokenizer.from_pretrained(LOCAL_PATH)
12
-
13
- models = dict()
14
-
15
-
16
- def get_bert_feature(
17
- text,
18
- word2ph,
19
- device=config.bert_gen_config.device,
20
- style_text=None,
21
- style_weight=0.7,
22
- ):
23
- text = "".join(text2sep_kata(text)[0])
24
- if style_text:
25
- style_text = "".join(text2sep_kata(style_text)[0])
26
- if (
27
- sys.platform == "darwin"
28
- and torch.backends.mps.is_available()
29
- and device == "cpu"
30
- ):
31
- device = "mps"
32
- if not device:
33
- device = "cuda"
34
- if device not in models.keys():
35
- if config.webui_config.fp16_run:
36
- models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH, torch_dtype=torch.float16).to(device)
37
- else:
38
- models[device] = AutoModelForMaskedLM.from_pretrained(LOCAL_PATH).to(device)
39
- with torch.no_grad():
40
- inputs = tokenizer(text, return_tensors="pt")
41
- for i in inputs:
42
- inputs[i] = inputs[i].to(device)
43
- res = models[device](**inputs, output_hidden_states=True)
44
- res = torch.cat(res["hidden_states"][-3:-2], -1)[0].float().cpu()
45
- if style_text:
46
- style_inputs = tokenizer(style_text, return_tensors="pt")
47
- for i in style_inputs:
48
- style_inputs[i] = style_inputs[i].to(device)
49
- style_res = models[device](**style_inputs, output_hidden_states=True)
50
- style_res = torch.cat(style_res["hidden_states"][-3:-2], -1)[0].float().cpu()
51
- style_res_mean = style_res.mean(0)
52
-
53
- assert len(word2ph) == len(text) + 2
54
- word2phone = word2ph
55
- phone_level_feature = []
56
- for i in range(len(word2phone)):
57
- if style_text:
58
- repeat_feature = (
59
- res[i].repeat(word2phone[i], 1) * (1 - style_weight)
60
- + style_res_mean.repeat(word2phone[i], 1) * style_weight
61
- )
62
- else:
63
- repeat_feature = res[i].repeat(word2phone[i], 1)
64
- phone_level_feature.append(repeat_feature)
65
-
66
- phone_level_feature = torch.cat(phone_level_feature, dim=0)
67
-
68
- return phone_level_feature.T

1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
3
+ import sys
4
+ import os
5
+ from text.japanese import text2sep_kata
6
+ tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
7
+
8
+ models = dict()
9
+
10
+
11
+ def get_bert_feature(text, word2ph, device=None):
12
+ sep_text,_ = text2sep_kata(text)
13
+ sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
14
+ sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
15
+ sep_ids = [2]+[item for sublist in sep_ids for item in sublist]+[3]
16
+ return get_bert_feature_with_token(sep_ids, word2ph, device)
17
+
18
+
19
+ # def get_bert_feature(text, word2ph, device=None):
20
+ # if (
21
+ # sys.platform == "darwin"
22
+ # and torch.backends.mps.is_available()
23
+ # and device == "cpu"
24
+ # ):
25
+ # device = "mps"
26
+ # if not device:
27
+ # device = "cuda"
28
+ # if device not in models.keys():
29
+ # models[device] = AutoModelForMaskedLM.from_pretrained(
30
+ # "cl-tohoku/bert-base-japanese-v3"
31
+ # ).to(device)
32
+ # with torch.no_grad():
33
+ # inputs = tokenizer(text, return_tensors="pt")
34
+ # for i in inputs:
35
+ # inputs[i] = inputs[i].to(device)
36
+ # res = models[device](**inputs, output_hidden_states=True)
37
+ # res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
38
+ # assert inputs["input_ids"].shape[-1] == len(word2ph)
39
+ # word2phone = word2ph
40
+ # phone_level_feature = []
41
+ # for i in range(len(word2phone)):
42
+ # repeat_feature = res[i].repeat(word2phone[i], 1)
43
+ # phone_level_feature.append(repeat_feature)
44
+
45
+ # phone_level_feature = torch.cat(phone_level_feature, dim=0)
46
+
47
+ # return phone_level_feature.T
48
+
49
+ def get_bert_feature_with_token(tokens, word2ph, device=None):
50
+ if (
51
+ sys.platform == "darwin"
52
+ and torch.backends.mps.is_available()
53
+ and device == "cpu"
54
+ ):
55
+ device = "mps"
56
+ if not device:
57
+ device = "cuda"
58
+ if device not in models.keys():
59
+ models[device] = AutoModelForMaskedLM.from_pretrained(
60
+ "./bert/bert-base-japanese-v3"
61
+ ).to(device)
62
+ with torch.no_grad():
63
+ inputs = torch.tensor(tokens).to(device).unsqueeze(0)
64
+ token_type_ids = torch.zeros_like(inputs).to(device)
65
+ attention_mask = torch.ones_like(inputs).to(device)
66
+ inputs = {"input_ids": inputs, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
67
+
68
+
69
+ # for i in inputs:
70
+ # inputs[i] = inputs[i].to(device)
71
+ res = models[device](**inputs, output_hidden_states=True)
72
+ res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
73
+ assert inputs["input_ids"].shape[-1] == len(word2ph)
74
+ word2phone = word2ph
75
+ phone_level_feature = []
76
+ for i in range(len(word2phone)):
77
+ repeat_feature = res[i].repeat(word2phone[i], 1)
78
+ phone_level_feature.append(repeat_feature)
79
+
80
+ phone_level_feature = torch.cat(phone_level_feature, dim=0)
81
+
82
+ return phone_level_feature.T
83
+
84
+
85
+ if __name__ == "__main__":
86
+ print(get_bert_feature("観覧車",[4,2]))
87
+ pass
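For reference, a minimal end-to-end sketch of the updated pipeline, assuming ./bert/bert-base-japanese-v3 is available locally and the text package is importable as in the code above (the sample sentence is arbitrary):

from text.japanese import text_normalize, g2p
from text.japanese_bert import get_bert_feature

text = text_normalize("観覧車に乗りたいです")
phones, tones, word2ph = g2p(text)
# word2ph has one entry per BERT token plus the CLS/SEP slots, so it lines up
# with the length check inside get_bert_feature_with_token.
bert = get_bert_feature(text, word2ph, device="cpu")
print(len(phones), bert.shape)  # one feature column per phoneme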