JotunnBurton committed on
Commit ae73f54 · verified · 1 Parent(s): e0c20c8

Delete text

text/__init__.py DELETED
@@ -1,26 +0,0 @@
1
- from text.symbols import *
2
-
3
- _symbol_to_id = {s: i for i, s in enumerate(symbols)}
4
-
5
-
6
- def cleaned_text_to_sequence(cleaned_text, tones, language):
7
- """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
8
- Args:
9
- text: string to convert to a sequence
10
- Returns:
11
- List of integers corresponding to the symbols in the text
12
- """
13
- phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
14
- tone_start = language_tone_start_map[language]
15
- tones = [i + tone_start for i in tones]
16
- lang_id = language_id_map[language]
17
- lang_ids = [lang_id for i in phones]
18
- return phones, tones, lang_ids
19
-
20
-
21
- def get_bert(norm_text, word2ph, language, device):
22
- from .japanese_bert import get_bert_feature as jp_bert
23
-
24
- lang_bert_func_map = {"JP": jp_bert}
25
- bert = lang_bert_func_map[language](norm_text, word2ph, device)
26
- return bert
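
For reference, the deleted cleaned_text_to_sequence maps each phone symbol to an integer ID, offsets the tones by a per-language start index, and repeats the language ID once per phone. Below is a minimal, self-contained sketch of that mapping; the symbol, tone, and language tables are toy stand-ins for the ones defined in the also-deleted text/symbols.py.

# Toy stand-ins for the tables in the deleted text/symbols.py.
symbols = ["_", "a", "k", "o", "N", "UNK"]
language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
language_tone_start_map = {"ZH": 0, "JP": 6, "EN": 7}

_symbol_to_id = {s: i for i, s in enumerate(symbols)}

def cleaned_text_to_sequence(cleaned_text, tones, language):
    phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
    tone_start = language_tone_start_map[language]
    tones = [t + tone_start for t in tones]
    lang_ids = [language_id_map[language]] * len(phones)
    return phones, tones, lang_ids

print(cleaned_text_to_sequence(["_", "k", "o", "N", "_"], [0, 0, 0, 0, 0], "JP"))
# -> ([0, 2, 3, 4, 0], [6, 6, 6, 6, 6], [1, 1, 1, 1, 1])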
 
text/chinese.py DELETED
@@ -1,198 +0,0 @@
1
- import os
2
- import re
3
-
4
- import cn2an
5
- from pypinyin import lazy_pinyin, Style
6
-
7
- from text.symbols import punctuation
8
- from text.tone_sandhi import ToneSandhi
9
-
10
- current_file_path = os.path.dirname(__file__)
11
- pinyin_to_symbol_map = {
12
- line.split("\t")[0]: line.strip().split("\t")[1]
13
- for line in open(os.path.join(current_file_path, "opencpop-strict.txt")).readlines()
14
- }
15
-
16
- import jieba.posseg as psg
17
-
18
-
19
- rep_map = {
20
- ":": ",",
21
- ";": ",",
22
- ",": ",",
23
- "。": ".",
24
- "!": "!",
25
- "?": "?",
26
- "\n": ".",
27
- "·": ",",
28
- "、": ",",
29
- "...": "…",
30
- "$": ".",
31
- "“": "'",
32
- "”": "'",
33
- "‘": "'",
34
- "’": "'",
35
- "(": "'",
36
- ")": "'",
37
- "(": "'",
38
- ")": "'",
39
- "《": "'",
40
- "》": "'",
41
- "【": "'",
42
- "】": "'",
43
- "[": "'",
44
- "]": "'",
45
- "—": "-",
46
- "~": "-",
47
- "~": "-",
48
- "「": "'",
49
- "」": "'",
50
- }
51
-
52
- tone_modifier = ToneSandhi()
53
-
54
-
55
- def replace_punctuation(text):
56
- text = text.replace("嗯", "恩").replace("呣", "母")
57
- pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
58
-
59
- replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
60
-
61
- replaced_text = re.sub(
62
- r"[^\u4e00-\u9fa5" + "".join(punctuation) + r"]+", "", replaced_text
63
- )
64
-
65
- return replaced_text
66
-
67
-
68
- def g2p(text):
69
- pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
70
- sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
71
- phones, tones, word2ph = _g2p(sentences)
72
- assert sum(word2ph) == len(phones)
73
- assert len(word2ph) == len(text)  # This occasionally fails; wrap it in a try/except if needed.
74
- phones = ["_"] + phones + ["_"]
75
- tones = [0] + tones + [0]
76
- word2ph = [1] + word2ph + [1]
77
- return phones, tones, word2ph
78
-
79
-
80
- def _get_initials_finals(word):
81
- initials = []
82
- finals = []
83
- orig_initials = lazy_pinyin(word, neutral_tone_with_five=True, style=Style.INITIALS)
84
- orig_finals = lazy_pinyin(
85
- word, neutral_tone_with_five=True, style=Style.FINALS_TONE3
86
- )
87
- for c, v in zip(orig_initials, orig_finals):
88
- initials.append(c)
89
- finals.append(v)
90
- return initials, finals
91
-
92
-
93
- def _g2p(segments):
94
- phones_list = []
95
- tones_list = []
96
- word2ph = []
97
- for seg in segments:
98
- # Strip all English words from the sentence
99
- seg = re.sub("[a-zA-Z]+", "", seg)
100
- seg_cut = psg.lcut(seg)
101
- initials = []
102
- finals = []
103
- seg_cut = tone_modifier.pre_merge_for_modify(seg_cut)
104
- for word, pos in seg_cut:
105
- if pos == "eng":
106
- continue
107
- sub_initials, sub_finals = _get_initials_finals(word)
108
- sub_finals = tone_modifier.modified_tone(word, pos, sub_finals)
109
- initials.append(sub_initials)
110
- finals.append(sub_finals)
111
-
112
- # assert len(sub_initials) == len(sub_finals) == len(word)
113
- initials = sum(initials, [])
114
- finals = sum(finals, [])
115
- #
116
- for c, v in zip(initials, finals):
117
- raw_pinyin = c + v
118
- # NOTE: post process for pypinyin outputs
119
- # we discriminate i, ii and iii
120
- if c == v:
121
- assert c in punctuation
122
- phone = [c]
123
- tone = "0"
124
- word2ph.append(1)
125
- else:
126
- v_without_tone = v[:-1]
127
- tone = v[-1]
128
-
129
- pinyin = c + v_without_tone
130
- assert tone in "12345"
131
-
132
- if c:
133
- # syllable with an initial
134
- v_rep_map = {
135
- "uei": "ui",
136
- "iou": "iu",
137
- "uen": "un",
138
- }
139
- if v_without_tone in v_rep_map.keys():
140
- pinyin = c + v_rep_map[v_without_tone]
141
- else:
142
- # syllable without an initial
143
- pinyin_rep_map = {
144
- "ing": "ying",
145
- "i": "yi",
146
- "in": "yin",
147
- "u": "wu",
148
- }
149
- if pinyin in pinyin_rep_map.keys():
150
- pinyin = pinyin_rep_map[pinyin]
151
- else:
152
- single_rep_map = {
153
- "v": "yu",
154
- "e": "e",
155
- "i": "y",
156
- "u": "w",
157
- }
158
- if pinyin[0] in single_rep_map.keys():
159
- pinyin = single_rep_map[pinyin[0]] + pinyin[1:]
160
-
161
- assert pinyin in pinyin_to_symbol_map.keys(), (pinyin, seg, raw_pinyin)
162
- phone = pinyin_to_symbol_map[pinyin].split(" ")
163
- word2ph.append(len(phone))
164
-
165
- phones_list += phone
166
- tones_list += [int(tone)] * len(phone)
167
- return phones_list, tones_list, word2ph
168
-
169
-
170
- def text_normalize(text):
171
- numbers = re.findall(r"\d+(?:\.?\d+)?", text)
172
- for number in numbers:
173
- text = text.replace(number, cn2an.an2cn(number), 1)
174
- text = replace_punctuation(text)
175
- return text
176
-
177
-
178
- def get_bert_feature(text, word2ph):
179
- from text import chinese_bert
180
-
181
- return chinese_bert.get_bert_feature(text, word2ph)
182
-
183
-
184
- if __name__ == "__main__":
185
- from text.chinese_bert import get_bert_feature
186
-
187
- text = "啊!但是《原神》是由,米哈\游自主, [研发]的一款全.新开放世界.冒险游戏"
188
- text = text_normalize(text)
189
- print(text)
190
- phones, tones, word2ph = g2p(text)
191
- bert = get_bert_feature(text, word2ph)
192
-
193
- print(phones, tones, word2ph, bert.shape)
194
-
195
-
196
- # # Example usage
197
- # text = "这是一个示例文本:,你好!这是一个测试...."
198
- # print(g2p_paddle(text))  # output: 这是一个示例文本你好这是一个测试
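
The key step in the deleted _g2p above is splitting each pypinyin final such as "ong1" into a toneless final plus a tone digit, then looking the recombined pinyin up in opencpop-strict.txt. A toy sketch of that split, with a two-entry stand-in for the real symbol table:

# Two-entry stand-in for the map built from text/opencpop-strict.txt.
pinyin_to_symbol_map = {"zhong": "zh ong", "guo": "g uo"}

def split_final(c, v):
    # v looks like "ong1": the last character is the tone, the rest the toneless final
    v_without_tone, tone = v[:-1], v[-1]
    assert tone in "12345"
    phones = pinyin_to_symbol_map[c + v_without_tone].split(" ")
    return phones, int(tone)

print(split_final("zh", "ong1"))  # (['zh', 'ong'], 1)
print(split_final("g", "uo2"))    # (['g', 'uo'], 2)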
 
text/chinese_bert.py DELETED
@@ -1,100 +0,0 @@
1
- import torch
2
- import sys
3
- from transformers import AutoTokenizer, AutoModelForMaskedLM
4
-
5
- tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext-large")
6
-
7
- models = dict()
8
-
9
-
10
- def get_bert_feature(text, word2ph, device=None):
11
- if (
12
- sys.platform == "darwin"
13
- and torch.backends.mps.is_available()
14
- and device == "cpu"
15
- ):
16
- device = "mps"
17
- if not device:
18
- device = "cuda"
19
- if device not in models.keys():
20
- models[device] = AutoModelForMaskedLM.from_pretrained(
21
- "hfl/chinese-roberta-wwm-ext-large"
22
- ).to(device)
23
- with torch.no_grad():
24
- inputs = tokenizer(text, return_tensors="pt")
25
- for i in inputs:
26
- inputs[i] = inputs[i].to(device)
27
- res = models[device](**inputs, output_hidden_states=True)
28
- res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
29
-
30
- assert len(word2ph) == len(text) + 2
31
- word2phone = word2ph
32
- phone_level_feature = []
33
- for i in range(len(word2phone)):
34
- repeat_feature = res[i].repeat(word2phone[i], 1)
35
- phone_level_feature.append(repeat_feature)
36
-
37
- phone_level_feature = torch.cat(phone_level_feature, dim=0)
38
-
39
- return phone_level_feature.T
40
-
41
-
42
- if __name__ == "__main__":
43
- import torch
44
-
45
- word_level_feature = torch.rand(38, 1024) # 12个词,每个词1024维特征
46
- word2phone = [
47
- 1,
48
- 2,
49
- 1,
50
- 2,
51
- 2,
52
- 1,
53
- 2,
54
- 2,
55
- 1,
56
- 2,
57
- 2,
58
- 1,
59
- 2,
60
- 2,
61
- 2,
62
- 2,
63
- 2,
64
- 1,
65
- 1,
66
- 2,
67
- 2,
68
- 1,
69
- 2,
70
- 2,
71
- 2,
72
- 2,
73
- 1,
74
- 2,
75
- 2,
76
- 2,
77
- 2,
78
- 2,
79
- 1,
80
- 2,
81
- 2,
82
- 2,
83
- 2,
84
- 1,
85
- ]
86
-
87
- # 计算总帧数
88
- total_frames = sum(word2phone)
89
- print(word_level_feature.shape)
90
- print(word2phone)
91
- phone_level_feature = []
92
- for i in range(len(word2phone)):
93
- print(word_level_feature[i].shape)
94
-
95
- # 对每个词重复word2phone[i]次
96
- repeat_feature = word_level_feature[i].repeat(word2phone[i], 1)
97
- phone_level_feature.append(repeat_feature)
98
-
99
- phone_level_feature = torch.cat(phone_level_feature, dim=0)
100
- print(phone_level_feature.shape) # torch.Size([36, 1024])
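
The essential operation in the deleted get_bert_feature is expanding word-level BERT features to phone level by repeating each word vector word2ph[i] times. A minimal torch sketch of just that expansion, using random features in place of a real hfl/chinese-roberta-wwm-ext-large forward pass:

import torch

def expand_to_phone_level(word_features, word2ph):
    # word_features: (n_words, dim); row i is repeated word2ph[i] times
    pieces = [word_features[i].repeat(word2ph[i], 1) for i in range(len(word2ph))]
    return torch.cat(pieces, dim=0).T  # (dim, sum(word2ph)), as in the deleted code

feats = torch.rand(3, 1024)
print(expand_to_phone_level(feats, [1, 2, 3]).shape)  # torch.Size([1024, 6])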
 
text/cleaner.py DELETED
@@ -1,28 +0,0 @@
1
- from text import chinese, japanese, cleaned_text_to_sequence
2
-
3
-
4
- language_module_map = {"ZH": chinese, "JP": japanese}
5
-
6
-
7
- def clean_text(text, language):
8
- language_module = language_module_map[language]
9
- norm_text = language_module.text_normalize(text)
10
- phones, tones, word2ph = language_module.g2p(norm_text)
11
- return norm_text, phones, tones, word2ph
12
-
13
-
14
- def clean_text_bert(text, language):
15
- language_module = language_module_map[language]
16
- norm_text = language_module.text_normalize(text)
17
- phones, tones, word2ph = language_module.g2p(norm_text)
18
- bert = language_module.get_bert_feature(norm_text, word2ph)
19
- return phones, tones, bert
20
-
21
-
22
- def text_to_sequence(text, language):
23
- norm_text, phones, tones, word2ph = clean_text(text, language)
24
- return cleaned_text_to_sequence(phones, tones, language)
25
-
26
-
27
- if __name__ == "__main__":
28
- pass
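
For reference, callers typically drove the deleted front end roughly as below. This is a sketch only; it runs solely against the full, now-removed text package and its dictionary/BERT assets.

# Sketch only: depends on the deleted `text` package and its assets.
from text.cleaner import clean_text
from text import cleaned_text_to_sequence

norm_text, phones, tones, word2ph = clean_text("こんにちは、世界!", "JP")
phone_ids, tone_ids, lang_ids = cleaned_text_to_sequence(phones, tones, "JP")
print(norm_text, phone_ids, tone_ids, lang_ids)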
 
text/cmudict.rep DELETED
The diff for this file is too large to render. See raw diff
 
text/cmudict_cache.pickle DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b9b21b20325471934ba92f2e4a5976989e7d920caa32e7a286eacb027d197949
3
- size 6212655
 
 
 
 
text/english.py DELETED
@@ -1,214 +0,0 @@
1
- import pickle
2
- import os
3
- import re
4
- from g2p_en import G2p
5
-
6
- from text import symbols
7
-
8
- current_file_path = os.path.dirname(__file__)
9
- CMU_DICT_PATH = os.path.join(current_file_path, "cmudict.rep")
10
- CACHE_PATH = os.path.join(current_file_path, "cmudict_cache.pickle")
11
- _g2p = G2p()
12
-
13
- arpa = {
14
- "AH0",
15
- "S",
16
- "AH1",
17
- "EY2",
18
- "AE2",
19
- "EH0",
20
- "OW2",
21
- "UH0",
22
- "NG",
23
- "B",
24
- "G",
25
- "AY0",
26
- "M",
27
- "AA0",
28
- "F",
29
- "AO0",
30
- "ER2",
31
- "UH1",
32
- "IY1",
33
- "AH2",
34
- "DH",
35
- "IY0",
36
- "EY1",
37
- "IH0",
38
- "K",
39
- "N",
40
- "W",
41
- "IY2",
42
- "T",
43
- "AA1",
44
- "ER1",
45
- "EH2",
46
- "OY0",
47
- "UH2",
48
- "UW1",
49
- "Z",
50
- "AW2",
51
- "AW1",
52
- "V",
53
- "UW2",
54
- "AA2",
55
- "ER",
56
- "AW0",
57
- "UW0",
58
- "R",
59
- "OW1",
60
- "EH1",
61
- "ZH",
62
- "AE0",
63
- "IH2",
64
- "IH",
65
- "Y",
66
- "JH",
67
- "P",
68
- "AY1",
69
- "EY0",
70
- "OY2",
71
- "TH",
72
- "HH",
73
- "D",
74
- "ER0",
75
- "CH",
76
- "AO1",
77
- "AE1",
78
- "AO2",
79
- "OY1",
80
- "AY2",
81
- "IH1",
82
- "OW0",
83
- "L",
84
- "SH",
85
- }
86
-
87
-
88
- def post_replace_ph(ph):
89
- rep_map = {
90
- ":": ",",
91
- ";": ",",
92
- ",": ",",
93
- "。": ".",
94
- "!": "!",
95
- "?": "?",
96
- "\n": ".",
97
- "·": ",",
98
- "、": ",",
99
- "...": "…",
100
- "v": "V",
101
- }
102
- if ph in rep_map.keys():
103
- ph = rep_map[ph]
104
- if ph in symbols:
105
- return ph
106
- if ph not in symbols:
107
- ph = "UNK"
108
- return ph
109
-
110
-
111
- def read_dict():
112
- g2p_dict = {}
113
- start_line = 49
114
- with open(CMU_DICT_PATH) as f:
115
- line = f.readline()
116
- line_index = 1
117
- while line:
118
- if line_index >= start_line:
119
- line = line.strip()
120
- word_split = line.split(" ")
121
- word = word_split[0]
122
-
123
- syllable_split = word_split[1].split(" - ")
124
- g2p_dict[word] = []
125
- for syllable in syllable_split:
126
- phone_split = syllable.split(" ")
127
- g2p_dict[word].append(phone_split)
128
-
129
- line_index = line_index + 1
130
- line = f.readline()
131
-
132
- return g2p_dict
133
-
134
-
135
- def cache_dict(g2p_dict, file_path):
136
- with open(file_path, "wb") as pickle_file:
137
- pickle.dump(g2p_dict, pickle_file)
138
-
139
-
140
- def get_dict():
141
- if os.path.exists(CACHE_PATH):
142
- with open(CACHE_PATH, "rb") as pickle_file:
143
- g2p_dict = pickle.load(pickle_file)
144
- else:
145
- g2p_dict = read_dict()
146
- cache_dict(g2p_dict, CACHE_PATH)
147
-
148
- return g2p_dict
149
-
150
-
151
- eng_dict = get_dict()
152
-
153
-
154
- def refine_ph(phn):
155
- tone = 0
156
- if re.search(r"\d$", phn):
157
- tone = int(phn[-1]) + 1
158
- phn = phn[:-1]
159
- return phn.lower(), tone
160
-
161
-
162
- def refine_syllables(syllables):
163
- tones = []
164
- phonemes = []
165
- for phn_list in syllables:
166
- for i in range(len(phn_list)):
167
- phn = phn_list[i]
168
- phn, tone = refine_ph(phn)
169
- phonemes.append(phn)
170
- tones.append(tone)
171
- return phonemes, tones
172
-
173
-
174
- def text_normalize(text):
175
- # todo: eng text normalize
176
- return text
177
-
178
-
179
- def g2p(text):
180
- phones = []
181
- tones = []
182
- words = re.split(r"([,;.\-\?\!\s+])", text)
183
- for w in words:
184
- if w.upper() in eng_dict:
185
- phns, tns = refine_syllables(eng_dict[w.upper()])
186
- phones += phns
187
- tones += tns
188
- else:
189
- phone_list = list(filter(lambda p: p != " ", _g2p(w)))
190
- for ph in phone_list:
191
- if ph in arpa:
192
- ph, tn = refine_ph(ph)
193
- phones.append(ph)
194
- tones.append(tn)
195
- else:
196
- phones.append(ph)
197
- tones.append(0)
198
- # todo: implement word2ph
199
- word2ph = [1 for i in phones]
200
-
201
- phones = [post_replace_ph(i) for i in phones]
202
- return phones, tones, word2ph
203
-
204
-
205
- if __name__ == "__main__":
206
- # print(get_dict())
207
- # print(eng_word_to_phoneme("hello"))
208
- print(g2p("In this paper, we propose 1 DSPGAN, a GAN-based universal vocoder."))
209
- # all_phones = set()
210
- # for k, syllables in eng_dict.items():
211
- # for group in syllables:
212
- # for ph in group:
213
- # all_phones.add(ph)
214
- # print(all_phones)
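
The deleted refine_ph converts an ARPAbet phone into a lower-case symbol plus a tone: a trailing stress digit 0/1/2 becomes tone 1/2/3, and phones without a digit keep tone 0. A standalone copy of that helper:

import re

def refine_ph(phn):
    tone = 0
    if re.search(r"\d$", phn):
        tone = int(phn[-1]) + 1
        phn = phn[:-1]
    return phn.lower(), tone

print(refine_ph("AH0"))  # ('ah', 1)
print(refine_ph("EY2"))  # ('ey', 3)
print(refine_ph("NG"))   # ('ng', 0)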
 
text/english_bert_mock.py DELETED
@@ -1,5 +0,0 @@
1
- import torch
2
-
3
-
4
- def get_bert_feature(norm_text, word2ph):
5
- return torch.zeros(1024, sum(word2ph))
 
text/japanese.py DELETED
@@ -1,704 +0,0 @@
1
- # Convert Japanese text to phonemes which is
2
- # compatible with Julius https://github.com/julius-speech/segmentation-kit
3
- import re
4
- import unicodedata
5
-
6
- from transformers import AutoTokenizer
7
-
8
- from text import punctuation, symbols
9
-
10
- try:
11
- import MeCab
12
- except ImportError as e:
13
- raise ImportError("Japanese requires mecab-python3 and unidic-lite.") from e
14
- from num2words import num2words
15
-
16
- _CONVRULES = [
17
- # Conversion of 2 letters
18
- "アァ/ a a",
19
- "イィ/ i i",
20
- "イェ/ i e",
21
- "イャ/ y a",
22
- "ウゥ/ u:",
23
- "エェ/ e e",
24
- "オォ/ o:",
25
- "カァ/ k a:",
26
- "キィ/ k i:",
27
- "クゥ/ k u:",
28
- "クャ/ ky a",
29
- "クュ/ ky u",
30
- "クョ/ ky o",
31
- "ケェ/ k e:",
32
- "コォ/ k o:",
33
- "ガァ/ g a:",
34
- "ギィ/ g i:",
35
- "グゥ/ g u:",
36
- "グャ/ gy a",
37
- "グュ/ gy u",
38
- "グョ/ gy o",
39
- "ゲェ/ g e:",
40
- "ゴォ/ g o:",
41
- "サァ/ s a:",
42
- "シィ/ sh i:",
43
- "スゥ/ s u:",
44
- "スャ/ sh a",
45
- "スュ/ sh u",
46
- "スョ/ sh o",
47
- "セェ/ s e:",
48
- "ソォ/ s o:",
49
- "ザァ/ z a:",
50
- "ジィ/ j i:",
51
- "ズゥ/ z u:",
52
- "ズャ/ zy a",
53
- "ズュ/ zy u",
54
- "ズョ/ zy o",
55
- "ゼェ/ z e:",
56
- "ゾォ/ z o:",
57
- "タァ/ t a:",
58
- "チィ/ ch i:",
59
- "ツァ/ ts a",
60
- "ツィ/ ts i",
61
- "ツゥ/ ts u:",
62
- "ツャ/ ch a",
63
- "ツュ/ ch u",
64
- "ツョ/ ch o",
65
- "ツェ/ ts e",
66
- "ツォ/ ts o",
67
- "テェ/ t e:",
68
- "トォ/ t o:",
69
- "ダァ/ d a:",
70
- "ヂィ/ j i:",
71
- "ヅゥ/ d u:",
72
- "ヅャ/ zy a",
73
- "ヅュ/ zy u",
74
- "ヅョ/ zy o",
75
- "デェ/ d e:",
76
- "ドォ/ d o:",
77
- "ナァ/ n a:",
78
- "ニィ/ n i:",
79
- "ヌゥ/ n u:",
80
- "ヌャ/ ny a",
81
- "ヌュ/ ny u",
82
- "ヌョ/ ny o",
83
- "ネェ/ n e:",
84
- "ノォ/ n o:",
85
- "ハァ/ h a:",
86
- "ヒィ/ h i:",
87
- "フゥ/ f u:",
88
- "フャ/ hy a",
89
- "フュ/ hy u",
90
- "フョ/ hy o",
91
- "ヘェ/ h e:",
92
- "ホォ/ h o:",
93
- "バァ/ b a:",
94
- "ビィ/ b i:",
95
- "ブゥ/ b u:",
96
- "フャ/ hy a",
97
- "ブュ/ by u",
98
- "フョ/ hy o",
99
- "ベェ/ b e:",
100
- "ボォ/ b o:",
101
- "パァ/ p a:",
102
- "ピィ/ p i:",
103
- "プゥ/ p u:",
104
- "プャ/ py a",
105
- "プュ/ py u",
106
- "プョ/ py o",
107
- "ペェ/ p e:",
108
- "ポォ/ p o:",
109
- "マァ/ m a:",
110
- "ミィ/ m i:",
111
- "ムゥ/ m u:",
112
- "ムャ/ my a",
113
- "ムュ/ my u",
114
- "ムョ/ my o",
115
- "メェ/ m e:",
116
- "モォ/ m o:",
117
- "ヤァ/ y a:",
118
- "ユゥ/ y u:",
119
- "ユャ/ y a:",
120
- "ユュ/ y u:",
121
- "ユョ/ y o:",
122
- "ヨォ/ y o:",
123
- "ラァ/ r a:",
124
- "リィ/ r i:",
125
- "ルゥ/ r u:",
126
- "ルャ/ ry a",
127
- "ルュ/ ry u",
128
- "ルョ/ ry o",
129
- "レェ/ r e:",
130
- "ロォ/ r o:",
131
- "ワァ/ w a:",
132
- "ヲォ/ o:",
133
- "ディ/ d i",
134
- "デェ/ d e:",
135
- "デャ/ dy a",
136
- "デュ/ dy u",
137
- "デョ/ dy o",
138
- "ティ/ t i",
139
- "テェ/ t e:",
140
- "テャ/ ty a",
141
- "テュ/ ty u",
142
- "テョ/ ty o",
143
- "スィ/ s i",
144
- "ズァ/ z u a",
145
- "ズィ/ z i",
146
- "ズゥ/ z u",
147
- "ズャ/ zy a",
148
- "ズュ/ zy u",
149
- "ズョ/ zy o",
150
- "ズェ/ z e",
151
- "ズォ/ z o",
152
- "キャ/ ky a",
153
- "キュ/ ky u",
154
- "キョ/ ky o",
155
- "シャ/ sh a",
156
- "シュ/ sh u",
157
- "シェ/ sh e",
158
- "ショ/ sh o",
159
- "チャ/ ch a",
160
- "チュ/ ch u",
161
- "チェ/ ch e",
162
- "チョ/ ch o",
163
- "トゥ/ t u",
164
- "トャ/ ty a",
165
- "トュ/ ty u",
166
- "トョ/ ty o",
167
- "ドァ/ d o a",
168
- "ドゥ/ d u",
169
- "ドャ/ dy a",
170
- "ドュ/ dy u",
171
- "ドョ/ dy o",
172
- "ドォ/ d o:",
173
- "ニャ/ ny a",
174
- "ニュ/ ny u",
175
- "ニョ/ ny o",
176
- "ヒャ/ hy a",
177
- "ヒュ/ hy u",
178
- "ヒョ/ hy o",
179
- "ミャ/ my a",
180
- "ミュ/ my u",
181
- "ミョ/ my o",
182
- "リャ/ ry a",
183
- "リュ/ ry u",
184
- "リョ/ ry o",
185
- "ギャ/ gy a",
186
- "ギュ/ gy u",
187
- "ギョ/ gy o",
188
- "ヂェ/ j e",
189
- "ヂャ/ j a",
190
- "ヂュ/ j u",
191
- "ヂョ/ j o",
192
- "ジェ/ j e",
193
- "ジャ/ j a",
194
- "ジュ/ j u",
195
- "ジョ/ j o",
196
- "ビャ/ by a",
197
- "ビュ/ by u",
198
- "ビョ/ by o",
199
- "ピャ/ py a",
200
- "ピュ/ py u",
201
- "ピョ/ py o",
202
- "ウァ/ u a",
203
- "ウィ/ w i",
204
- "ウェ/ w e",
205
- "ウォ/ w o",
206
- "ファ/ f a",
207
- "フィ/ f i",
208
- "フゥ/ f u",
209
- "フャ/ hy a",
210
- "フュ/ hy u",
211
- "フョ/ hy o",
212
- "フェ/ f e",
213
- "フォ/ f o",
214
- "ヴァ/ b a",
215
- "ヴィ/ b i",
216
- "ヴェ/ b e",
217
- "ヴォ/ b o",
218
- "ヴュ/ by u",
219
- "アー/ a:",
220
- "イー/ i:",
221
- "ウー/ u:",
222
- "エー/ e:",
223
- "オー/ o:",
224
- "カー/ k a:",
225
- "キー/ k i:",
226
- "クー/ k u:",
227
- "ケー/ k e:",
228
- "コー/ k o:",
229
- "サー/ s a:",
230
- "シー/ sh i:",
231
- "スー/ s u:",
232
- "セー/ s e:",
233
- "ソー/ s o:",
234
- "ター/ t a:",
235
- "チー/ ch i:",
236
- "ツー/ ts u:",
237
- "テー/ t e:",
238
- "トー/ t o:",
239
- "ナー/ n a:",
240
- "ニー/ n i:",
241
- "ヌ���/ n u:",
242
- "ネー/ n e:",
243
- "ノー/ n o:",
244
- "ハー/ h a:",
245
- "ヒー/ h i:",
246
- "フー/ f u:",
247
- "ヘー/ h e:",
248
- "ホー/ h o:",
249
- "マー/ m a:",
250
- "ミー/ m i:",
251
- "ムー/ m u:",
252
- "メー/ m e:",
253
- "モー/ m o:",
254
- "ラー/ r a:",
255
- "リー/ r i:",
256
- "ルー/ r u:",
257
- "レー/ r e:",
258
- "ロー/ r o:",
259
- "ガー/ g a:",
260
- "ギー/ g i:",
261
- "グー/ g u:",
262
- "ゲー/ g e:",
263
- "ゴー/ g o:",
264
- "ザー/ z a:",
265
- "ジー/ j i:",
266
- "ズー/ z u:",
267
- "ゼー/ z e:",
268
- "ゾー/ z o:",
269
- "ダー/ d a:",
270
- "ヂー/ j i:",
271
- "ヅー/ z u:",
272
- "デー/ d e:",
273
- "ドー/ d o:",
274
- "バー/ b a:",
275
- "ビー/ b i:",
276
- "ブー/ b u:",
277
- "ベー/ b e:",
278
- "ボー/ b o:",
279
- "パー/ p a:",
280
- "ピー/ p i:",
281
- "プー/ p u:",
282
- "ペー/ p e:",
283
- "ポー/ p o:",
284
- "ヤー/ y a:",
285
- "ユー/ y u:",
286
- "ヨー/ y o:",
287
- "ワー/ w a:",
288
- "ヰー/ i:",
289
- "ヱー/ e:",
290
- "ヲー/ o:",
291
- "ヴー/ b u:",
292
- # Conversion of 1 letter
293
- "ア/ a",
294
- "イ/ i",
295
- "ウ/ u",
296
- "エ/ e",
297
- "オ/ o",
298
- "カ/ k a",
299
- "キ/ k i",
300
- "ク/ k u",
301
- "ケ/ k e",
302
- "コ/ k o",
303
- "サ/ s a",
304
- "シ/ sh i",
305
- "ス/ s u",
306
- "セ/ s e",
307
- "ソ/ s o",
308
- "タ/ t a",
309
- "チ/ ch i",
310
- "ツ/ ts u",
311
- "テ/ t e",
312
- "ト/ t o",
313
- "ナ/ n a",
314
- "ニ/ n i",
315
- "ヌ/ n u",
316
- "ネ/ n e",
317
- "ノ/ n o",
318
- "ハ/ h a",
319
- "ヒ/ h i",
320
- "フ/ f u",
321
- "ヘ/ h e",
322
- "ホ/ h o",
323
- "マ/ m a",
324
- "ミ/ m i",
325
- "ム/ m u",
326
- "メ/ m e",
327
- "モ/ m o",
328
- "ラ/ r a",
329
- "リ/ r i",
330
- "ル/ r u",
331
- "レ/ r e",
332
- "ロ/ r o",
333
- "ガ/ g a",
334
- "ギ/ g i",
335
- "グ/ g u",
336
- "ゲ/ g e",
337
- "ゴ/ g o",
338
- "ザ/ z a",
339
- "ジ/ j i",
340
- "ズ/ z u",
341
- "ゼ/ z e",
342
- "ゾ/ z o",
343
- "ダ/ d a",
344
- "ヂ/ j i",
345
- "ヅ/ z u",
346
- "デ/ d e",
347
- "ド/ d o",
348
- "バ/ b a",
349
- "ビ/ b i",
350
- "ブ/ b u",
351
- "ベ/ b e",
352
- "ボ/ b o",
353
- "パ/ p a",
354
- "ピ/ p i",
355
- "プ/ p u",
356
- "ペ/ p e",
357
- "ポ/ p o",
358
- "ヤ/ y a",
359
- "ユ/ y u",
360
- "ヨ/ y o",
361
- "ワ/ w a",
362
- "ヰ/ i",
363
- "ヱ/ e",
364
- "ヲ/ o",
365
- "ン/ N",
366
- "ッ/ q",
367
- "ヴ/ b u",
368
- "ー/:", #这个不起作用
369
- # Try converting broken text
370
- "ァ/ a",
371
- "ィ/ i",
372
- "ゥ/ u",
373
- "ェ/ e",
374
- "ォ/ o",
375
- "ヮ/ w a",
376
- "ォ/ o",
377
- # Symbols
378
- "、/ ,",
379
- "。/ .",
380
- "!/ !",
381
- "?/ ?",
382
- "・/ ,",
383
- ]
384
-
385
- _COLON_RX = re.compile(":+")
386
- _REJECT_RX = re.compile("[^ a-zA-Z:,.?]")
387
-
388
-
389
- def _makerulemap():
390
- l = [tuple(x.split("/")) for x in _CONVRULES]
391
- return tuple({k: v for k, v in l if len(k) == i} for i in (1, 2))
392
-
393
-
394
- _RULEMAP1, _RULEMAP2 = _makerulemap()
395
-
396
-
397
- def kata2phoneme(text: str) -> str:
398
- """Convert katakana text to phonemes."""
399
- text = text.strip()
400
- res = []
401
- while text:
402
- if len(text) >= 2:
403
- x = _RULEMAP2.get(text[:2])
404
- if x is not None:
405
- text = text[2:]
406
- res += x.split(" ")[1:]
407
- continue
408
- x = _RULEMAP1.get(text[0])
409
- if x is not None:
410
- text = text[1:]
411
- res += x.split(" ")[1:]
412
- continue
413
- res.append(text[0])
414
- text = text[1:]
415
- # res = _COLON_RX.sub(":", res)
416
- return res
417
-
418
-
419
- _KATAKANA = "".join(chr(ch) for ch in range(ord("ァ"), ord("ン") + 1))
420
- _HIRAGANA = "".join(chr(ch) for ch in range(ord("ぁ"), ord("ん") + 1))
421
- _HIRA2KATATRANS = str.maketrans(_HIRAGANA, _KATAKANA)
422
-
423
-
424
- def hira2kata(text: str) -> str:
425
- text = text.translate(_HIRA2KATATRANS)
426
- return text.replace("う゛", "ヴ")
427
-
428
-
429
- _SYMBOL_TOKENS = set(list("・、。?!"))
430
- _NO_YOMI_TOKENS = set(list("「」『』―()[][]"))
431
- _TAGGER = MeCab.Tagger()
432
-
433
-
434
- def text2kata(text: str) -> str:
435
- parsed = _TAGGER.parse(text)
436
- res = []
437
- for line in parsed.split("\n"):
438
- if line == "EOS":
439
- break
440
- parts = line.split("\t")
441
-
442
- word, yomi = parts[0], parts[1]
443
- if yomi:
444
- res.append(yomi)
445
- else:
446
- if word in _SYMBOL_TOKENS:
447
- res.append(word)
448
- elif word in ("っ", "ッ"):
449
- res.append("ッ")
450
- elif word in _NO_YOMI_TOKENS:
451
- pass
452
- else:
453
- res.append(word)
454
- return hira2kata("".join(res))
455
-
456
-
457
- def text2sep_kata(text: str) -> (list, list):
458
- parsed = _TAGGER.parse(text)
459
- res = []
460
- sep = []
461
- for line in parsed.split("\n"):
462
- if line == "EOS":
463
- break
464
- parts = line.split("\t")
465
-
466
- word, yomi = parts[0], parts[1]
467
- if yomi:
468
- res.append(yomi)
469
- else:
470
- if word in _SYMBOL_TOKENS:
471
- res.append(word)
472
- elif word in ("っ", "ッ"):
473
- res.append("ッ")
474
- elif word in _NO_YOMI_TOKENS:
475
- pass
476
- else:
477
- res.append(word)
478
- sep.append(word)
479
- return sep, [hira2kata(i) for i in res]
480
-
481
-
482
- _ALPHASYMBOL_YOMI = {
483
- "#": "シャープ",
484
- "%": "パーセント",
485
- "&": "アンド",
486
- "+": "プラス",
487
- "-": "マイナス",
488
- ":": "コロン",
489
- ";": "セミコロン",
490
- "<": "小なり",
491
- "=": "イコール",
492
- ">": "大なり",
493
- "@": "アット",
494
- "a": "エー",
495
- "b": "ビー",
496
- "c": "シー",
497
- "d": "ディー",
498
- "e": "イー",
499
- "f": "エフ",
500
- "g": "ジー",
501
- "h": "エイチ",
502
- "i": "アイ",
503
- "j": "ジェー",
504
- "k": "ケー",
505
- "l": "エル",
506
- "m": "エム",
507
- "n": "エヌ",
508
- "o": "オー",
509
- "p": "ピー",
510
- "q": "キュー",
511
- "r": "アール",
512
- "s": "エス",
513
- "t": "ティー",
514
- "u": "ユー",
515
- "v": "ブイ",
516
- "w": "ダブリュー",
517
- "x": "エックス",
518
- "y": "ワイ",
519
- "z": "ゼット",
520
- "α": "アルファ",
521
- "β": "ベータ",
522
- "γ": "ガンマ",
523
- "δ": "デルタ",
524
- "ε": "イプシロン",
525
- "ζ": "ゼータ",
526
- "η": "イータ",
527
- "θ": "シータ",
528
- "ι": "イオタ",
529
- "κ": "カッパ",
530
- "λ": "ラムダ",
531
- "μ": "ミュー",
532
- "ν": "ニュー",
533
- "ξ": "クサイ",
534
- "ο": "オミクロン",
535
- "π": "パイ",
536
- "ρ": "ロー",
537
- "σ": "シグマ",
538
- "τ": "タウ",
539
- "υ": "ウプシロン",
540
- "φ": "ファイ",
541
- "χ": "カイ",
542
- "ψ": "プサイ",
543
- "ω": "オメガ",
544
- }
545
-
546
-
547
- _NUMBER_WITH_SEPARATOR_RX = re.compile("[0-9]{1,3}(,[0-9]{3})+")
548
- _CURRENCY_MAP = {"$": "ドル", "¥": "円", "£": "ポンド", "€": "ユーロ"}
549
- _CURRENCY_RX = re.compile(r"([$¥£€])([0-9.]*[0-9])")
550
- _NUMBER_RX = re.compile(r"[0-9]+(\.[0-9]+)?")
551
-
552
-
553
- def japanese_convert_numbers_to_words(text: str) -> str:
554
- res = _NUMBER_WITH_SEPARATOR_RX.sub(lambda m: m[0].replace(",", ""), text)
555
- res = _CURRENCY_RX.sub(lambda m: m[2] + _CURRENCY_MAP.get(m[1], m[1]), res)
556
- res = _NUMBER_RX.sub(lambda m: num2words(m[0], lang="ja"), res)
557
- return res
558
-
559
-
560
- def japanese_convert_alpha_symbols_to_words(text: str) -> str:
561
- return "".join([_ALPHASYMBOL_YOMI.get(ch, ch) for ch in text.lower()])
562
-
563
-
564
- def japanese_text_to_phonemes(text: str) -> str:
565
- """Convert Japanese text to phonemes."""
566
- res = unicodedata.normalize("NFKC", text)
567
- res = japanese_convert_numbers_to_words(res)
568
- # res = japanese_convert_alpha_symbols_to_words(res)
569
- res = text2kata(res)
570
- res = kata2phoneme(res)
571
- return res
572
-
573
-
574
- def is_japanese_character(char):
575
- # Unicode ranges covering the Japanese writing system
576
- japanese_ranges = [
577
- (0x3040, 0x309F), # hiragana
578
- (0x30A0, 0x30FF), # katakana
579
- (0x4E00, 0x9FFF), # kanji (CJK Unified Ideographs)
580
- (0x3400, 0x4DBF), # CJK Extension A
581
- (0x20000, 0x2A6DF), # CJK Extension B
582
- # further CJK extension ranges can be added as needed
583
- ]
584
-
585
- # convert the character to its Unicode code point
586
- char_code = ord(char)
587
-
588
- # check whether the character falls within any Japanese range
589
- for start, end in japanese_ranges:
590
- if start <= char_code <= end:
591
- return True
592
-
593
- return False
594
-
595
-
596
- rep_map = {
597
- ":": ",",
598
- ";": ",",
599
- ",": ",",
600
- "。": ".",
601
- "!": "!",
602
- "?": "?",
603
- "\n": ".",
604
- "·": ",",
605
- "、": ",",
606
- "…": "...",
607
- }
608
-
609
-
610
- def replace_punctuation(text):
611
- pattern = re.compile("|".join(re.escape(p) for p in rep_map.keys()))
612
-
613
- replaced_text = pattern.sub(lambda x: rep_map[x.group()], text)
614
-
615
- replaced_text = re.sub(
616
- r"[^\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF\u3400-\u4DBF"
617
- + "".join(punctuation)
618
- + r"]+",
619
- "",
620
- replaced_text,
621
- )
622
-
623
- return replaced_text
624
-
625
-
626
- def text_normalize(text):
627
- res = unicodedata.normalize("NFKC", text)
628
- res = japanese_convert_numbers_to_words(res)
629
- # res = "".join([i for i in res if is_japanese_character(i)])
630
- res = replace_punctuation(res)
631
- return res
632
-
633
-
634
- def distribute_phone(n_phone, n_word):
635
- phones_per_word = [0] * n_word
636
- for task in range(n_phone):
637
- min_tasks = min(phones_per_word)
638
- min_index = phones_per_word.index(min_tasks)
639
- phones_per_word[min_index] += 1
640
- return phones_per_word
641
-
642
-
643
- tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
644
-
645
-
646
- def g2p(norm_text):
647
- sep_text, sep_kata = text2sep_kata(norm_text)
648
- sep_tokenized = [tokenizer.tokenize(i) for i in sep_text]
649
- sep_phonemes = [kata2phoneme(i) for i in sep_kata]
650
- # Error handling: words MeCab does not recognize propagate all the way here and crash; so far this only happens with extremely rare, obscure characters
651
- for i in sep_phonemes:
652
- for j in i:
653
- assert j in symbols, (sep_text, sep_kata, sep_phonemes)
654
-
655
- word2ph = []
656
- for token, phoneme in zip(sep_tokenized, sep_phonemes):
657
- phone_len = len(phoneme)
658
- word_len = len(token)
659
-
660
- aaa = distribute_phone(phone_len, word_len)
661
- word2ph += aaa
662
- phones = ["_"] + [j for i in sep_phonemes for j in i] + ["_"]
663
- tones = [0 for i in phones]
664
- word2ph = [1] + word2ph + [1]
665
- return phones, tones, word2ph
666
-
667
- if __name__ == "__main__":
668
- tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
669
- text = "だったら私、スズカさんと同じチームに入りたいです! スズカさんの走りを毎日近くで、なんなら真横から見ていたいので!"
670
- #print(_TAGGER.parse(text))
671
- # nodes = [{"surface": "こんにちは", "pos": "感動詞:*:*:*", "pron": "コンニチワ", "c_type": "*", "c_form": "*", "accent_type": 0, "accent_con_type": "-1", "chain_flag": -1}]
672
- nodes = [{"surface":"こんにちは","pron": "コンニチワ","pos": "感動詞:*:*:*",}]
673
- from text.japanese_bert import get_bert_feature
674
- import pyopenjtalk
675
- from marine.predict import Predictor
676
- from marine.utils.openjtalk_util import convert_njd_feature_to_marine_feature
677
- text = text_normalize(text)
678
- NJD_NODES = pyopenjtalk.run_frontend(text)
679
- predictor = Predictor()
680
- # important_info = [{"string":i["string"],"pron":i["pron"],"acc":i["acc"]}for i in pyopenjtalk.estimate_accent(NJD_NODES)]
681
- print(text)
682
-
683
- marine_feature = convert_njd_feature_to_marine_feature(NJD_NODES)
684
- results = predictor.predict([marine_feature])
685
- for mora,acc in zip(results["mora"][0],results["accent_status"][0]):
686
- print(f"{mora}:{acc}")
687
- # for i in pyopenjtalk.estimate_accent(NJD_NODES):
688
- # print(f"{i['string']}:{i['pron']}:{i['acc']}")
689
- # info = pyopenjtalk.extract_fullcontext(text,run_marine=True)
690
- # info_nomarine = pyopenjtalk.extract_fullcontext(text,run_marine=False)
691
- # # nodes = pyopenjtalk
692
- # # print(info)
693
- # for i,j in zip(info,info_nomarine):
694
- # print(i)
695
- # print(j)
696
- # print("\n")
697
- # predictor = Predictor()
698
- #print(pyopenjtalk.estimate_accent(text))
699
- # output = predictor.predict([nodes],accent_represent_mode="high_low")
700
- #print(output)
701
- # phones, tones, word2ph = g2p(text)
702
- # bert = get_bert_feature(text, word2ph)
703
-
704
- # print(phones, tones, word2ph, bert.shape)
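
One small but reusable piece of the deleted japanese.py is distribute_phone, which spreads n_phone phones as evenly as possible across n_word subword tokens to build word2ph. A standalone copy:

def distribute_phone(n_phone, n_word):
    # greedily assign each phone to the word that currently has the fewest
    phones_per_word = [0] * n_word
    for _ in range(n_phone):
        min_index = phones_per_word.index(min(phones_per_word))
        phones_per_word[min_index] += 1
    return phones_per_word

print(distribute_phone(7, 3))  # [3, 2, 2]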
 
text/japanese_bert.py DELETED
@@ -1,87 +0,0 @@
1
- import torch
2
- from transformers import AutoTokenizer, AutoModelForMaskedLM
3
- import sys
4
- import os
5
- from text.japanese import text2sep_kata
6
- tokenizer = AutoTokenizer.from_pretrained("./bert/bert-base-japanese-v3")
7
-
8
- models = dict()
9
-
10
-
11
- def get_bert_feature(text, word2ph, device=None):
12
- sep_text,_ = text2sep_kata(text)
13
- sep_tokens = [tokenizer.tokenize(t) for t in sep_text]
14
- sep_ids = [tokenizer.convert_tokens_to_ids(t) for t in sep_tokens]
15
- sep_ids = [2]+[item for sublist in sep_ids for item in sublist]+[3]
16
- return get_bert_feature_with_token(sep_ids, word2ph, device)
17
-
18
-
19
- # def get_bert_feature(text, word2ph, device=None):
20
- # if (
21
- # sys.platform == "darwin"
22
- # and torch.backends.mps.is_available()
23
- # and device == "cpu"
24
- # ):
25
- # device = "mps"
26
- # if not device:
27
- # device = "cuda"
28
- # if device not in models.keys():
29
- # models[device] = AutoModelForMaskedLM.from_pretrained(
30
- # "cl-tohoku/bert-base-japanese-v3"
31
- # ).to(device)
32
- # with torch.no_grad():
33
- # inputs = tokenizer(text, return_tensors="pt")
34
- # for i in inputs:
35
- # inputs[i] = inputs[i].to(device)
36
- # res = models[device](**inputs, output_hidden_states=True)
37
- # res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
38
- # assert inputs["input_ids"].shape[-1] == len(word2ph)
39
- # word2phone = word2ph
40
- # phone_level_feature = []
41
- # for i in range(len(word2phone)):
42
- # repeat_feature = res[i].repeat(word2phone[i], 1)
43
- # phone_level_feature.append(repeat_feature)
44
-
45
- # phone_level_feature = torch.cat(phone_level_feature, dim=0)
46
-
47
- # return phone_level_feature.T
48
-
49
- def get_bert_feature_with_token(tokens, word2ph, device=None):
50
- if (
51
- sys.platform == "darwin"
52
- and torch.backends.mps.is_available()
53
- and device == "cpu"
54
- ):
55
- device = "mps"
56
- if not device:
57
- device = "cuda"
58
- if device not in models.keys():
59
- models[device] = AutoModelForMaskedLM.from_pretrained(
60
- "./bert/bert-base-japanese-v3"
61
- ).to(device)
62
- with torch.no_grad():
63
- inputs = torch.tensor(tokens).to(device).unsqueeze(0)
64
- token_type_ids = torch.zeros_like(inputs).to(device)
65
- attention_mask = torch.ones_like(inputs).to(device)
66
- inputs = {"input_ids": inputs, "token_type_ids": token_type_ids, "attention_mask": attention_mask}
67
-
68
-
69
- # for i in inputs:
70
- # inputs[i] = inputs[i].to(device)
71
- res = models[device](**inputs, output_hidden_states=True)
72
- res = torch.cat(res["hidden_states"][-3:-2], -1)[0].cpu()
73
- assert inputs["input_ids"].shape[-1] == len(word2ph)
74
- word2phone = word2ph
75
- phone_level_feature = []
76
- for i in range(len(word2phone)):
77
- repeat_feature = res[i].repeat(word2phone[i], 1)
78
- phone_level_feature.append(repeat_feature)
79
-
80
- phone_level_feature = torch.cat(phone_level_feature, dim=0)
81
-
82
- return phone_level_feature.T
83
-
84
-
85
- if __name__ == "__main__":
86
- print(get_bert_feature("観覧車",[4,2]))
87
- pass
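
In the deleted get_bert_feature, the per-word subword IDs are flattened and wrapped with the CLS/SEP IDs (hard-coded as 2 and 3 for ./bert/bert-base-japanese-v3) before the masked-LM forward pass. A sketch of that flattening step with made-up subword IDs:

# Made-up subword IDs for illustration; 2 and 3 are the CLS/SEP IDs
# hard-coded in the deleted japanese_bert.py.
sep_ids = [[120, 121], [305], [77, 78, 79]]
token_ids = [2] + [tok for word in sep_ids for tok in word] + [3]
print(token_ids)  # [2, 120, 121, 305, 77, 78, 79, 3]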
 
text/opencpop-strict.txt DELETED
@@ -1,429 +0,0 @@
1
- a AA a
2
- ai AA ai
3
- an AA an
4
- ang AA ang
5
- ao AA ao
6
- ba b a
7
- bai b ai
8
- ban b an
9
- bang b ang
10
- bao b ao
11
- bei b ei
12
- ben b en
13
- beng b eng
14
- bi b i
15
- bian b ian
16
- biao b iao
17
- bie b ie
18
- bin b in
19
- bing b ing
20
- bo b o
21
- bu b u
22
- ca c a
23
- cai c ai
24
- can c an
25
- cang c ang
26
- cao c ao
27
- ce c e
28
- cei c ei
29
- cen c en
30
- ceng c eng
31
- cha ch a
32
- chai ch ai
33
- chan ch an
34
- chang ch ang
35
- chao ch ao
36
- che ch e
37
- chen ch en
38
- cheng ch eng
39
- chi ch ir
40
- chong ch ong
41
- chou ch ou
42
- chu ch u
43
- chua ch ua
44
- chuai ch uai
45
- chuan ch uan
46
- chuang ch uang
47
- chui ch ui
48
- chun ch un
49
- chuo ch uo
50
- ci c i0
51
- cong c ong
52
- cou c ou
53
- cu c u
54
- cuan c uan
55
- cui c ui
56
- cun c un
57
- cuo c uo
58
- da d a
59
- dai d ai
60
- dan d an
61
- dang d ang
62
- dao d ao
63
- de d e
64
- dei d ei
65
- den d en
66
- deng d eng
67
- di d i
68
- dia d ia
69
- dian d ian
70
- diao d iao
71
- die d ie
72
- ding d ing
73
- diu d iu
74
- dong d ong
75
- dou d ou
76
- du d u
77
- duan d uan
78
- dui d ui
79
- dun d un
80
- duo d uo
81
- e EE e
82
- ei EE ei
83
- en EE en
84
- eng EE eng
85
- er EE er
86
- fa f a
87
- fan f an
88
- fang f ang
89
- fei f ei
90
- fen f en
91
- feng f eng
92
- fo f o
93
- fou f ou
94
- fu f u
95
- ga g a
96
- gai g ai
97
- gan g an
98
- gang g ang
99
- gao g ao
100
- ge g e
101
- gei g ei
102
- gen g en
103
- geng g eng
104
- gong g ong
105
- gou g ou
106
- gu g u
107
- gua g ua
108
- guai g uai
109
- guan g uan
110
- guang g uang
111
- gui g ui
112
- gun g un
113
- guo g uo
114
- ha h a
115
- hai h ai
116
- han h an
117
- hang h ang
118
- hao h ao
119
- he h e
120
- hei h ei
121
- hen h en
122
- heng h eng
123
- hong h ong
124
- hou h ou
125
- hu h u
126
- hua h ua
127
- huai h uai
128
- huan h uan
129
- huang h uang
130
- hui h ui
131
- hun h un
132
- huo h uo
133
- ji j i
134
- jia j ia
135
- jian j ian
136
- jiang j iang
137
- jiao j iao
138
- jie j ie
139
- jin j in
140
- jing j ing
141
- jiong j iong
142
- jiu j iu
143
- ju j v
144
- jv j v
145
- juan j van
146
- jvan j van
147
- jue j ve
148
- jve j ve
149
- jun j vn
150
- jvn j vn
151
- ka k a
152
- kai k ai
153
- kan k an
154
- kang k ang
155
- kao k ao
156
- ke k e
157
- kei k ei
158
- ken k en
159
- keng k eng
160
- kong k ong
161
- kou k ou
162
- ku k u
163
- kua k ua
164
- kuai k uai
165
- kuan k uan
166
- kuang k uang
167
- kui k ui
168
- kun k un
169
- kuo k uo
170
- la l a
171
- lai l ai
172
- lan l an
173
- lang l ang
174
- lao l ao
175
- le l e
176
- lei l ei
177
- leng l eng
178
- li l i
179
- lia l ia
180
- lian l ian
181
- liang l iang
182
- liao l iao
183
- lie l ie
184
- lin l in
185
- ling l ing
186
- liu l iu
187
- lo l o
188
- long l ong
189
- lou l ou
190
- lu l u
191
- luan l uan
192
- lun l un
193
- luo l uo
194
- lv l v
195
- lve l ve
196
- ma m a
197
- mai m ai
198
- man m an
199
- mang m ang
200
- mao m ao
201
- me m e
202
- mei m ei
203
- men m en
204
- meng m eng
205
- mi m i
206
- mian m ian
207
- miao m iao
208
- mie m ie
209
- min m in
210
- ming m ing
211
- miu m iu
212
- mo m o
213
- mou m ou
214
- mu m u
215
- na n a
216
- nai n ai
217
- nan n an
218
- nang n ang
219
- nao n ao
220
- ne n e
221
- nei n ei
222
- nen n en
223
- neng n eng
224
- ni n i
225
- nian n ian
226
- niang n iang
227
- niao n iao
228
- nie n ie
229
- nin n in
230
- ning n ing
231
- niu n iu
232
- nong n ong
233
- nou n ou
234
- nu n u
235
- nuan n uan
236
- nun n un
237
- nuo n uo
238
- nv n v
239
- nve n ve
240
- o OO o
241
- ou OO ou
242
- pa p a
243
- pai p ai
244
- pan p an
245
- pang p ang
246
- pao p ao
247
- pei p ei
248
- pen p en
249
- peng p eng
250
- pi p i
251
- pian p ian
252
- piao p iao
253
- pie p ie
254
- pin p in
255
- ping p ing
256
- po p o
257
- pou p ou
258
- pu p u
259
- qi q i
260
- qia q ia
261
- qian q ian
262
- qiang q iang
263
- qiao q iao
264
- qie q ie
265
- qin q in
266
- qing q ing
267
- qiong q iong
268
- qiu q iu
269
- qu q v
270
- qv q v
271
- quan q van
272
- qvan q van
273
- que q ve
274
- qve q ve
275
- qun q vn
276
- qvn q vn
277
- ran r an
278
- rang r ang
279
- rao r ao
280
- re r e
281
- ren r en
282
- reng r eng
283
- ri r ir
284
- rong r ong
285
- rou r ou
286
- ru r u
287
- rua r ua
288
- ruan r uan
289
- rui r ui
290
- run r un
291
- ruo r uo
292
- sa s a
293
- sai s ai
294
- san s an
295
- sang s ang
296
- sao s ao
297
- se s e
298
- sen s en
299
- seng s eng
300
- sha sh a
301
- shai sh ai
302
- shan sh an
303
- shang sh ang
304
- shao sh ao
305
- she sh e
306
- shei sh ei
307
- shen sh en
308
- sheng sh eng
309
- shi sh ir
310
- shou sh ou
311
- shu sh u
312
- shua sh ua
313
- shuai sh uai
314
- shuan sh uan
315
- shuang sh uang
316
- shui sh ui
317
- shun sh un
318
- shuo sh uo
319
- si s i0
320
- song s ong
321
- sou s ou
322
- su s u
323
- suan s uan
324
- sui s ui
325
- sun s un
326
- suo s uo
327
- ta t a
328
- tai t ai
329
- tan t an
330
- tang t ang
331
- tao t ao
332
- te t e
333
- tei t ei
334
- teng t eng
335
- ti t i
336
- tian t ian
337
- tiao t iao
338
- tie t ie
339
- ting t ing
340
- tong t ong
341
- tou t ou
342
- tu t u
343
- tuan t uan
344
- tui t ui
345
- tun t un
346
- tuo t uo
347
- wa w a
348
- wai w ai
349
- wan w an
350
- wang w ang
351
- wei w ei
352
- wen w en
353
- weng w eng
354
- wo w o
355
- wu w u
356
- xi x i
357
- xia x ia
358
- xian x ian
359
- xiang x iang
360
- xiao x iao
361
- xie x ie
362
- xin x in
363
- xing x ing
364
- xiong x iong
365
- xiu x iu
366
- xu x v
367
- xv x v
368
- xuan x van
369
- xvan x van
370
- xue x ve
371
- xve x ve
372
- xun x vn
373
- xvn x vn
374
- ya y a
375
- yan y En
376
- yang y ang
377
- yao y ao
378
- ye y E
379
- yi y i
380
- yin y in
381
- ying y ing
382
- yo y o
383
- yong y ong
384
- you y ou
385
- yu y v
386
- yv y v
387
- yuan y van
388
- yvan y van
389
- yue y ve
390
- yve y ve
391
- yun y vn
392
- yvn y vn
393
- za z a
394
- zai z ai
395
- zan z an
396
- zang z ang
397
- zao z ao
398
- ze z e
399
- zei z ei
400
- zen z en
401
- zeng z eng
402
- zha zh a
403
- zhai zh ai
404
- zhan zh an
405
- zhang zh ang
406
- zhao zh ao
407
- zhe zh e
408
- zhei zh ei
409
- zhen zh en
410
- zheng zh eng
411
- zhi zh ir
412
- zhong zh ong
413
- zhou zh ou
414
- zhu zh u
415
- zhua zh ua
416
- zhuai zh uai
417
- zhuan zh uan
418
- zhuang zh uang
419
- zhui zh ui
420
- zhun zh un
421
- zhuo zh uo
422
- zi z i0
423
- zong z ong
424
- zou z ou
425
- zu z u
426
- zuan z uan
427
- zui z ui
428
- zun z un
429
- zuo z uo
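
The deleted chinese.py builds its pinyin_to_symbol_map from this file by splitting each line into the pinyin key and the phone string; the columns render as spaces here but are presumably tab-separated in the raw file, since chinese.py splits on "\t". A sketch of that parsing using an inline sample instead of the file:

# Inline sample standing in for text/opencpop-strict.txt.
sample = "zhong\tzh ong\nguo\tg uo\nai\tAA ai\n"
pinyin_to_symbol_map = {
    line.split("\t")[0]: line.strip().split("\t")[1]
    for line in sample.splitlines()
}
print(pinyin_to_symbol_map["zhong"].split(" "))  # ['zh', 'ong']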
 
text/symbols.py DELETED
@@ -1,188 +0,0 @@
1
- punctuation = ["!", "?", "…", ",", ".", "'", "-"]
2
- pu_symbols = punctuation + ["SP", "UNK"]
3
- pad = "_"
4
-
5
- # chinese
6
- zh_symbols = [
7
- "E",
8
- "En",
9
- "a",
10
- "ai",
11
- "an",
12
- "ang",
13
- "ao",
14
- "b",
15
- "c",
16
- "ch",
17
- "d",
18
- "e",
19
- "ei",
20
- "en",
21
- "eng",
22
- "er",
23
- "f",
24
- "g",
25
- "h",
26
- "i",
27
- "i0",
28
- "ia",
29
- "ian",
30
- "iang",
31
- "iao",
32
- "ie",
33
- "in",
34
- "ing",
35
- "iong",
36
- "ir",
37
- "iu",
38
- "j",
39
- "k",
40
- "l",
41
- "m",
42
- "n",
43
- "o",
44
- "ong",
45
- "ou",
46
- "p",
47
- "q",
48
- "r",
49
- "s",
50
- "sh",
51
- "t",
52
- "u",
53
- "ua",
54
- "uai",
55
- "uan",
56
- "uang",
57
- "ui",
58
- "un",
59
- "uo",
60
- "v",
61
- "van",
62
- "ve",
63
- "vn",
64
- "w",
65
- "x",
66
- "y",
67
- "z",
68
- "zh",
69
- "AA",
70
- "EE",
71
- "OO",
72
- ]
73
- num_zh_tones = 6
74
-
75
- # japanese
76
- ja_symbols = [
77
- "N",
78
- "a",
79
- "a:",
80
- "b",
81
- "by",
82
- "ch",
83
- "d",
84
- "dy",
85
- "e",
86
- "e:",
87
- "f",
88
- "g",
89
- "gy",
90
- "h",
91
- "hy",
92
- "i",
93
- "i:",
94
- "j",
95
- "k",
96
- "ky",
97
- "m",
98
- "my",
99
- "n",
100
- "ny",
101
- "o",
102
- "o:",
103
- "p",
104
- "py",
105
- "q",
106
- "r",
107
- "ry",
108
- "s",
109
- "sh",
110
- "t",
111
- "ts",
112
- "ty",
113
- "u",
114
- "u:",
115
- "w",
116
- "y",
117
- "z",
118
- "zy",
119
- # ":"
120
- ]
121
- num_ja_tones = 1
122
-
123
- # English
124
- en_symbols = [
125
- "aa",
126
- "ae",
127
- "ah",
128
- "ao",
129
- "aw",
130
- "ay",
131
- "b",
132
- "ch",
133
- "d",
134
- "dh",
135
- "eh",
136
- "er",
137
- "ey",
138
- "f",
139
- "g",
140
- "hh",
141
- "ih",
142
- "iy",
143
- "jh",
144
- "k",
145
- "l",
146
- "m",
147
- "n",
148
- "ng",
149
- "ow",
150
- "oy",
151
- "p",
152
- "r",
153
- "s",
154
- "sh",
155
- "t",
156
- "th",
157
- "uh",
158
- "uw",
159
- "V",
160
- "w",
161
- "y",
162
- "z",
163
- "zh",
164
- ]
165
- num_en_tones = 4
166
-
167
- # combine all symbols
168
- normal_symbols = sorted(set(zh_symbols + ja_symbols + en_symbols))
169
- symbols = [pad] + normal_symbols + pu_symbols
170
- sil_phonemes_ids = [symbols.index(i) for i in pu_symbols]
171
-
172
- # combine all tones
173
- num_tones = num_zh_tones + num_ja_tones + num_en_tones
174
-
175
- # language maps
176
- language_id_map = {"ZH": 0, "JP": 1, "EN": 2}
177
- num_languages = len(language_id_map.keys())
178
-
179
- language_tone_start_map = {
180
- "ZH": 0,
181
- "JP": num_zh_tones,
182
- "EN": num_zh_tones + num_ja_tones,
183
- }
184
-
185
- if __name__ == "__main__":
186
- a = set(zh_symbols)
187
- b = set(en_symbols)
188
- print(sorted(a & b))
 
text/tone_sandhi.py DELETED
@@ -1,769 +0,0 @@
1
- # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
- from typing import List
15
- from typing import Tuple
16
-
17
- import jieba
18
- from pypinyin import lazy_pinyin
19
- from pypinyin import Style
20
-
21
-
22
- class ToneSandhi:
23
- def __init__(self):
24
- self.must_neural_tone_words = {
25
- "麻烦",
26
- "麻利",
27
- "鸳鸯",
28
- "高粱",
29
- "骨头",
30
- "骆驼",
31
- "马虎",
32
- "首饰",
33
- "馒头",
34
- "馄饨",
35
- "风筝",
36
- "难为",
37
- "队伍",
38
- "阔气",
39
- "闺女",
40
- "门道",
41
- "锄头",
42
- "铺盖",
43
- "铃铛",
44
- "铁匠",
45
- "钥匙",
46
- "里脊",
47
- "里头",
48
- "部分",
49
- "那么",
50
- "道士",
51
- "造化",
52
- "迷糊",
53
- "连累",
54
- "这么",
55
- "这个",
56
- "运气",
57
- "过去",
58
- "软和",
59
- "转悠",
60
- "踏实",
61
- "跳蚤",
62
- "跟头",
63
- "趔趄",
64
- "财主",
65
- "豆腐",
66
- "讲究",
67
- "记性",
68
- "记号",
69
- "认识",
70
- "规矩",
71
- "见识",
72
- "裁缝",
73
- "补丁",
74
- "衣裳",
75
- "衣服",
76
- "衙门",
77
- "街坊",
78
- "行李",
79
- "行当",
80
- "蛤蟆",
81
- "蘑菇",
82
- "薄荷",
83
- "葫芦",
84
- "葡萄",
85
- "萝卜",
86
- "荸荠",
87
- "苗条",
88
- "苗头",
89
- "苍蝇",
90
- "芝麻",
91
- "舒服",
92
- "舒坦",
93
- "舌头",
94
- "自在",
95
- "膏药",
96
- "脾气",
97
- "脑袋",
98
- "脊梁",
99
- "能耐",
100
- "胳膊",
101
- "胭脂",
102
- "胡萝",
103
- "胡琴",
104
- "胡同",
105
- "聪明",
106
- "耽误",
107
- "耽搁",
108
- "耷拉",
109
- "耳朵",
110
- "老爷",
111
- "老实",
112
- "老婆",
113
- "老头",
114
- "老太",
115
- "翻腾",
116
- "罗嗦",
117
- "罐头",
118
- "编辑",
119
- "结实",
120
- "红火",
121
- "累赘",
122
- "糨糊",
123
- "糊涂",
124
- "精神",
125
- "粮食",
126
- "簸箕",
127
- "篱笆",
128
- "算计",
129
- "算盘",
130
- "答应",
131
- "笤帚",
132
- "笑语",
133
- "笑话",
134
- "窟窿",
135
- "窝囊",
136
- "窗户",
137
- "稳当",
138
- "稀罕",
139
- "称呼",
140
- "秧歌",
141
- "秀气",
142
- "秀才",
143
- "福气",
144
- "祖宗",
145
- "砚台",
146
- "码头",
147
- "石榴",
148
- "石头",
149
- "石匠",
150
- "知识",
151
- "眼睛",
152
- "眯缝",
153
- "眨巴",
154
- "眉毛",
155
- "相声",
156
- "盘算",
157
- "白净",
158
- "痢疾",
159
- "痛快",
160
- "疟疾",
161
- "疙瘩",
162
- "疏忽",
163
- "畜生",
164
- "生意",
165
- "甘蔗",
166
- "琵琶",
167
- "琢磨",
168
- "琉璃",
169
- "玻璃",
170
- "玫瑰",
171
- "玄乎",
172
- "狐狸",
173
- "状元",
174
- "特务",
175
- "牲口",
176
- "牙碜",
177
- "牌楼",
178
- "爽快",
179
- "爱人",
180
- "热闹",
181
- "烧饼",
182
- "烟筒",
183
- "烂糊",
184
- "点心",
185
- "炊帚",
186
- "灯笼",
187
- "火候",
188
- "漂亮",
189
- "滑溜",
190
- "溜达",
191
- "温和",
192
- "清楚",
193
- "消息",
194
- "浪头",
195
- "活泼",
196
- "比方",
197
- "正经",
198
- "欺负",
199
- "模糊",
200
- "槟榔",
201
- "棺材",
202
- "棒槌",
203
- "棉花",
204
- "核桃",
205
- "栅栏",
206
- "柴火",
207
- "架势",
208
- "枕头",
209
- "枇杷",
210
- "机灵",
211
- "本事",
212
- "木头",
213
- "木匠",
214
- "朋友",
215
- "月饼",
216
- "月亮",
217
- "暖和",
218
- "明白",
219
- "时候",
220
- "新鲜",
221
- "故事",
222
- "收拾",
223
- "收成",
224
- "提防",
225
- "挖苦",
226
- "挑剔",
227
- "指甲",
228
- "指头",
229
- "拾掇",
230
- "拳头",
231
- "拨弄",
232
- "招牌",
233
- "招呼",
234
- "抬举",
235
- "护士",
236
- "折腾",
237
- "扫帚",
238
- "打量",
239
- "打算",
240
- "打点",
241
- "打扮",
242
- "打听",
243
- "打发",
244
- "扎实",
245
- "扁担",
246
- "戒指",
247
- "懒得",
248
- "意识",
249
- "意思",
250
- "情形",
251
- "悟性",
252
- "怪物",
253
- "思量",
254
- "怎么",
255
- "念头",
256
- "念叨",
257
- "快活",
258
- "忙活",
259
- "志气",
260
- "心思",
261
- "得罪",
262
- "张罗",
263
- "弟兄",
264
- "开通",
265
- "应酬",
266
- "庄稼",
267
- "干事",
268
- "帮手",
269
- "帐篷",
270
- "希罕",
271
- "师父",
272
- "师傅",
273
- "巴结",
274
- "巴掌",
275
- "差事",
276
- "工夫",
277
- "岁数",
278
- "屁股",
279
- "尾巴",
280
- "少爷",
281
- "小气",
282
- "小伙",
283
- "将就",
284
- "对头",
285
- "对付",
286
- "寡妇",
287
- "家伙",
288
- "客气",
289
- "实在",
290
- "官司",
291
- "学问",
292
- "学生",
293
- "字号",
294
- "嫁妆",
295
- "媳妇",
296
- "媒人",
297
- "婆家",
298
- "娘家",
299
- "委屈",
300
- "姑娘",
301
- "姐夫",
302
- "妯娌",
303
- "妥当",
304
- "妖精",
305
- "奴才",
306
- "女婿",
307
- "头发",
308
- "太阳",
309
- "大爷",
310
- "大方",
311
- "大意",
312
- "大夫",
313
- "多少",
314
- "多么",
315
- "外甥",
316
- "壮实",
317
- "地道",
318
- "地方",
319
- "在乎",
320
- "困难",
321
- "嘴巴",
322
- "嘱咐",
323
- "嘟囔",
324
- "嘀咕",
325
- "喜欢",
326
- "喇嘛",
327
- "喇叭",
328
- "商量",
329
- "唾沫",
330
- "哑巴",
331
- "哈欠",
332
- "哆嗦",
333
- "咳嗽",
334
- "和尚",
335
- "告诉",
336
- "告示",
337
- "含糊",
338
- "吓唬",
339
- "后头",
340
- "名字",
341
- "名堂",
342
- "合同",
343
- "吆喝",
344
- "叫唤",
345
- "口袋",
346
- "厚道",
347
- "厉害",
348
- "千斤",
349
- "包袱",
350
- "包涵",
351
- "匀称",
352
- "勤快",
353
- "动静",
354
- "动弹",
355
- "功夫",
356
- "力气",
357
- "前头",
358
- "刺猬",
359
- "刺激",
360
- "别扭",
361
- "利落",
362
- "利索",
363
- "利害",
364
- "分析",
365
- "出息",
366
- "凑合",
367
- "凉快",
368
- "冷战",
369
- "冤枉",
370
- "冒失",
371
- "养活",
372
- "关系",
373
- "先生",
374
- "兄弟",
375
- "便宜",
376
- "使唤",
377
- "佩服",
378
- "作坊",
379
- "体面",
380
- "位置",
381
- "似的",
382
- "伙计",
383
- "休息",
384
- "什么",
385
- "人家",
386
- "亲戚",
387
- "亲家",
388
- "交情",
389
- "云彩",
390
- "事情",
391
- "买卖",
392
- "主意",
393
- "丫头",
394
- "丧气",
395
- "两口",
396
- "东西",
397
- "东家",
398
- "世故",
399
- "不由",
400
- "不在",
401
- "下水",
402
- "下巴",
403
- "上头",
404
- "上司",
405
- "丈夫",
406
- "丈人",
407
- "一辈",
408
- "那个",
409
- "菩萨",
410
- "父亲",
411
- "母亲",
412
- "咕噜",
413
- "邋遢",
414
- "费用",
415
- "冤家",
416
- "甜头",
417
- "介绍",
418
- "荒唐",
419
- "大人",
420
- "泥鳅",
421
- "幸福",
422
- "熟悉",
423
- "计划",
424
- "扑腾",
425
- "蜡烛",
426
- "姥爷",
427
- "照顾",
428
- "喉咙",
429
- "吉他",
430
- "弄堂",
431
- "蚂蚱",
432
- "凤凰",
433
- "拖沓",
434
- "寒碜",
435
- "糟蹋",
436
- "倒腾",
437
- "报复",
438
- "逻辑",
439
- "盘缠",
440
- "喽啰",
441
- "牢骚",
442
- "咖喱",
443
- "扫把",
444
- "惦记",
445
- }
446
- self.must_not_neural_tone_words = {
447
- "男子",
448
- "女子",
449
- "分子",
450
- "原子",
451
- "量子",
452
- "莲子",
453
- "石子",
454
- "瓜子",
455
- "电子",
456
- "人人",
457
- "虎虎",
458
- }
459
- self.punc = ":,;。?!“”‘’':,;.?!"
460
-
461
- # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
462
- # e.g.
463
- # word: "家里"
464
- # pos: "s"
465
- # finals: ['ia1', 'i3']
466
- def _neural_sandhi(self, word: str, pos: str, finals: List[str]) -> List[str]:
467
- # reduplication words for n. and v. e.g. 奶奶, 试试, 旺旺
468
- for j, item in enumerate(word):
469
- if (
470
- j - 1 >= 0
471
- and item == word[j - 1]
472
- and pos[0] in {"n", "v", "a"}
473
- and word not in self.must_not_neural_tone_words
474
- ):
475
- finals[j] = finals[j][:-1] + "5"
476
- ge_idx = word.find("个")
477
- if len(word) >= 1 and word[-1] in "吧呢啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶":
478
- finals[-1] = finals[-1][:-1] + "5"
479
- elif len(word) >= 1 and word[-1] in "的地得":
480
- finals[-1] = finals[-1][:-1] + "5"
481
- # e.g. 走了, 看着, 去过
482
- # elif len(word) == 1 and word in "了着过" and pos in {"ul", "uz", "ug"}:
483
- # finals[-1] = finals[-1][:-1] + "5"
484
- elif (
485
- len(word) > 1
486
- and word[-1] in "们子"
487
- and pos in {"r", "n"}
488
- and word not in self.must_not_neural_tone_words
489
- ):
490
- finals[-1] = finals[-1][:-1] + "5"
491
- # e.g. 桌上, 地下, 家里
492
- elif len(word) > 1 and word[-1] in "上下里" and pos in {"s", "l", "f"}:
493
- finals[-1] = finals[-1][:-1] + "5"
494
- # e.g. 上来, 下去
495
- elif len(word) > 1 and word[-1] in "来去" and word[-2] in "上下进出回过起开":
496
- finals[-1] = finals[-1][:-1] + "5"
497
- # "个" used as a measure word
498
- elif (
499
- ge_idx >= 1
500
- and (word[ge_idx - 1].isnumeric() or word[ge_idx - 1] in "几有两半多各整每做是")
501
- ) or word == "个":
502
- finals[ge_idx] = finals[ge_idx][:-1] + "5"
503
- else:
504
- if (
505
- word in self.must_neural_tone_words
506
- or word[-2:] in self.must_neural_tone_words
507
- ):
508
- finals[-1] = finals[-1][:-1] + "5"
509
-
510
- word_list = self._split_word(word)
511
- finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
512
- for i, word in enumerate(word_list):
513
- # conventional neural in Chinese
514
- if (
515
- word in self.must_neural_tone_words
516
- or word[-2:] in self.must_neural_tone_words
517
- ):
518
- finals_list[i][-1] = finals_list[i][-1][:-1] + "5"
519
- finals = sum(finals_list, [])
520
- return finals
521
-
522
- def _bu_sandhi(self, word: str, finals: List[str]) -> List[str]:
523
- # e.g. 看不懂
524
- if len(word) == 3 and word[1] == "不":
525
- finals[1] = finals[1][:-1] + "5"
526
- else:
527
- for i, char in enumerate(word):
528
- # "不" before tone4 should be bu2, e.g. 不怕
529
- if char == "不" and i + 1 < len(word) and finals[i + 1][-1] == "4":
530
- finals[i] = finals[i][:-1] + "2"
531
- return finals
532
-
533
- def _yi_sandhi(self, word: str, finals: List[str]) -> List[str]:
534
- # "一" in number sequences, e.g. 一零零, 二一零
535
- if word.find("一") != -1 and all(
536
- [item.isnumeric() for item in word if item != "一"]
537
- ):
538
- return finals
539
- # "一" between reduplication words should be yi5, e.g. 看一看
540
- elif len(word) == 3 and word[1] == "一" and word[0] == word[-1]:
541
- finals[1] = finals[1][:-1] + "5"
542
- # when "一" is ordinal word, it should be yi1
543
- elif word.startswith("第一"):
544
- finals[1] = finals[1][:-1] + "1"
545
- else:
546
- for i, char in enumerate(word):
547
- if char == "一" and i + 1 < len(word):
548
- # "一" before tone4 should be yi2, e.g. 一段
549
- if finals[i + 1][-1] == "4":
550
- finals[i] = finals[i][:-1] + "2"
551
- # "一" before non-tone4 should be yi4, e.g. 一天
552
- else:
553
- # "一" 后面如果是标点,还读一声
554
- if word[i + 1] not in self.punc:
555
- finals[i] = finals[i][:-1] + "4"
556
- return finals
557
-
558
- def _split_word(self, word: str) -> List[str]:
559
- word_list = jieba.cut_for_search(word)
560
- word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
561
- first_subword = word_list[0]
562
- first_begin_idx = word.find(first_subword)
563
- if first_begin_idx == 0:
564
- second_subword = word[len(first_subword) :]
565
- new_word_list = [first_subword, second_subword]
566
- else:
567
- second_subword = word[: -len(first_subword)]
568
- new_word_list = [second_subword, first_subword]
569
- return new_word_list
570
-
571
- def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
572
- if len(word) == 2 and self._all_tone_three(finals):
573
- finals[0] = finals[0][:-1] + "2"
574
- elif len(word) == 3:
575
- word_list = self._split_word(word)
576
- if self._all_tone_three(finals):
577
- # disyllabic + monosyllabic, e.g. 蒙古/包
578
- if len(word_list[0]) == 2:
579
- finals[0] = finals[0][:-1] + "2"
580
- finals[1] = finals[1][:-1] + "2"
581
- # monosyllabic + disyllabic, e.g. 纸/老虎
582
- elif len(word_list[0]) == 1:
583
- finals[1] = finals[1][:-1] + "2"
584
- else:
585
- finals_list = [finals[: len(word_list[0])], finals[len(word_list[0]) :]]
586
- if len(finals_list) == 2:
587
- for i, sub in enumerate(finals_list):
588
- # e.g. 所有/人
589
- if self._all_tone_three(sub) and len(sub) == 2:
590
- finals_list[i][0] = finals_list[i][0][:-1] + "2"
591
- # e.g. 好/喜欢
592
- elif (
593
- i == 1
594
- and not self._all_tone_three(sub)
595
- and finals_list[i][0][-1] == "3"
596
- and finals_list[0][-1][-1] == "3"
597
- ):
598
- finals_list[0][-1] = finals_list[0][-1][:-1] + "2"
599
- finals = sum(finals_list, [])
600
- # split a four-character idiom into two two-character words
601
- elif len(word) == 4:
602
- finals_list = [finals[:2], finals[2:]]
603
- finals = []
604
- for sub in finals_list:
605
- if self._all_tone_three(sub):
606
- sub[0] = sub[0][:-1] + "2"
607
- finals += sub
608
-
609
- return finals
610
-
611
- def _all_tone_three(self, finals: List[str]) -> bool:
612
- return all(x[-1] == "3" for x in finals)
613
-
614
- # merge "不" and the word behind it
615
- # if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
616
- def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
617
- new_seg = []
618
- last_word = ""
619
- for word, pos in seg:
620
- if last_word == "不":
621
- word = last_word + word
622
- if word != "不":
623
- new_seg.append((word, pos))
624
- last_word = word[:]
625
- if last_word == "不":
626
- new_seg.append((last_word, "d"))
627
- last_word = ""
628
- return new_seg
629
-
630
- # function 1: merge "一" and reduplication words in it's left and right, e.g. "听","一","听" ->"听一听"
631
- # function 2: merge single "一" and the word behind it
632
- # if don't merge, "一" sometimes appears alone according to jieba, which may occur sandhi error
633
- # e.g.
634
- # input seg: [('听', 'v'), ('一', 'm'), ('听', 'v')]
635
- # output seg: [['听一听', 'v']]
636
- def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
637
- new_seg = []
638
- # function 1
639
- for i, (word, pos) in enumerate(seg):
640
- if (
641
- i - 1 >= 0
642
- and word == "一"
643
- and i + 1 < len(seg)
644
- and seg[i - 1][0] == seg[i + 1][0]
645
- and seg[i - 1][1] == "v"
646
- ):
647
- new_seg[i - 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
648
- else:
649
- if (
650
- i - 2 >= 0
651
- and seg[i - 1][0] == "一"
652
- and seg[i - 2][0] == word
653
- and pos == "v"
654
- ):
655
- continue
656
- else:
657
- new_seg.append([word, pos])
658
- seg = new_seg
659
- new_seg = []
660
- # function 2
661
- for i, (word, pos) in enumerate(seg):
662
- if new_seg and new_seg[-1][0] == "一":
663
- new_seg[-1][0] = new_seg[-1][0] + word
664
- else:
665
- new_seg.append([word, pos])
666
- return new_seg
667
-
668
- # the first and the second words are all_tone_three
669
- def _merge_continuous_three_tones(
670
- self, seg: List[Tuple[str, str]]
671
- ) -> List[Tuple[str, str]]:
672
- new_seg = []
673
- sub_finals_list = [
674
- lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
675
- for (word, pos) in seg
676
- ]
677
- assert len(sub_finals_list) == len(seg)
678
- merge_last = [False] * len(seg)
679
- for i, (word, pos) in enumerate(seg):
680
- if (
681
- i - 1 >= 0
682
- and self._all_tone_three(sub_finals_list[i - 1])
683
- and self._all_tone_three(sub_finals_list[i])
684
- and not merge_last[i - 1]
685
- ):
686
- # if the last word is a reduplication, do not merge, because reduplications need _neural_sandhi
687
- if (
688
- not self._is_reduplication(seg[i - 1][0])
689
- and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
690
- ):
691
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
692
- merge_last[i] = True
693
- else:
694
- new_seg.append([word, pos])
695
- else:
696
- new_seg.append([word, pos])
697
-
698
- return new_seg
699
-
700
- def _is_reduplication(self, word: str) -> bool:
701
- return len(word) == 2 and word[0] == word[1]
702
-
703
- # the last char of first word and the first char of second word is tone_three
704
- def _merge_continuous_three_tones_2(
705
- self, seg: List[Tuple[str, str]]
706
- ) -> List[Tuple[str, str]]:
707
- new_seg = []
708
- sub_finals_list = [
709
- lazy_pinyin(word, neutral_tone_with_five=True, style=Style.FINALS_TONE3)
710
- for (word, pos) in seg
711
- ]
712
- assert len(sub_finals_list) == len(seg)
713
- merge_last = [False] * len(seg)
714
- for i, (word, pos) in enumerate(seg):
715
- if (
716
- i - 1 >= 0
717
- and sub_finals_list[i - 1][-1][-1] == "3"
718
- and sub_finals_list[i][0][-1] == "3"
719
- and not merge_last[i - 1]
720
- ):
721
- # if the last word is a reduplication, do not merge, because reduplications need _neural_sandhi
722
- if (
723
- not self._is_reduplication(seg[i - 1][0])
724
- and len(seg[i - 1][0]) + len(seg[i][0]) <= 3
725
- ):
726
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
727
- merge_last[i] = True
728
- else:
729
- new_seg.append([word, pos])
730
- else:
731
- new_seg.append([word, pos])
732
- return new_seg
733
-
734
- def _merge_er(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
735
- new_seg = []
736
- for i, (word, pos) in enumerate(seg):
737
- if i - 1 >= 0 and word == "儿" and seg[i - 1][0] != "#":
738
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
739
- else:
740
- new_seg.append([word, pos])
741
- return new_seg
742
-
743
- def _merge_reduplication(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
744
- new_seg = []
745
- for i, (word, pos) in enumerate(seg):
746
- if new_seg and word == new_seg[-1][0]:
747
- new_seg[-1][0] = new_seg[-1][0] + seg[i][0]
748
- else:
749
- new_seg.append([word, pos])
750
- return new_seg
751
-
752
- def pre_merge_for_modify(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
753
- seg = self._merge_bu(seg)
754
- try:
755
- seg = self._merge_yi(seg)
756
- except:
757
- print("_merge_yi failed")
758
- seg = self._merge_reduplication(seg)
759
- seg = self._merge_continuous_three_tones(seg)
760
- seg = self._merge_continuous_three_tones_2(seg)
761
- seg = self._merge_er(seg)
762
- return seg
763
-
764
- def modified_tone(self, word: str, pos: str, finals: List[str]) -> List[str]:
765
- finals = self._bu_sandhi(word, finals)
766
- finals = self._yi_sandhi(word, finals)
767
- finals = self._neural_sandhi(word, pos, finals)
768
- finals = self._three_sandhi(word, finals)
769
- return finals
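
Most of the deleted ToneSandhi class is rule-based Mandarin tone sandhi. The simplest rule, from _three_sandhi, is that when both syllables of a two-syllable word carry tone 3, the first changes to tone 2. A toy version of just that rule (the real method also handles three- and four-character words):

def two_syllable_third_tone_sandhi(finals):
    # finals like ["i3", "ao3"] (pypinyin FINALS_TONE3 style); the tone is the last char
    if len(finals) == 2 and all(f[-1] == "3" for f in finals):
        finals[0] = finals[0][:-1] + "2"
    return finals

print(two_syllable_third_tone_sandhi(["i3", "ao3"]))  # ['i2', 'ao3'], e.g. 你好
print(two_syllable_third_tone_sandhi(["i3", "an1"]))  # unchanged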