Spaces:
Running
Running
| import json | |
| import re | |
| import warnings | |
| from pathlib import Path | |
| from kanjiconv import KanjiConv | |
| from pypinyin import lazy_pinyin | |
| from .resources.pinyin_dict import PINYIN_DICT | |
# Module-wide kanji converter; to_kana() calls its to_hiragana() method.
kanji_to_kana = KanjiConv()
# Small kana mapped to the full-size vowel they carry. to_kana() uses
# membership in this table to keep a small kana attached to the character
# that precedes it.
yoon_map = {
    "ぁ": "あ",
    "ぃ": "い",
    "ぅ": "う",
    "ぇ": "え",
    "ぉ": "お",
    "ゃ": "あ",
    "ゅ": "う",
    "ょ": "お",
    "ゎ": "あ",
}
# ACE_phonemes: load every phoneme plan once at import time and keep the
# first plan whose language is "zh".
# JSON is read as UTF-8 explicitly so the result does not depend on the
# platform's locale encoding.
# NOTE: if no plan has language "zh", ace_phonemes_zh_plan stays undefined.
with open(
    Path(__file__).parent / "resources" / "all_plans.json", "r", encoding="utf-8"
) as f:
    ace_phonemes_all_plans = json.load(f)
for plan in ace_phonemes_all_plans["plans"]:
    if plan["language"] == "zh":
        ace_phonemes_zh_plan = plan
        break
def preprocess_text(text: str, language: str) -> list[str]:
    """Strip spaces from *text* and split it into per-language units.

    Args:
        text: Raw input text; every " " is removed before conversion.
        language: "mandarin" (handled by to_pinyin) or "japanese"
            (handled by to_kana).

    Returns:
        The list produced by the language-specific converter.

    Raises:
        ValueError: If *language* is neither "mandarin" nor "japanese".
    """
    compact = text.replace(" ", "")
    if language == "mandarin":
        return to_pinyin(compact)
    if language == "japanese":
        return to_kana(compact)
    raise ValueError("Other languages are not supported")
def to_pinyin(text: str) -> list[str]:
    """Convert *text* to a flat list of pinyin syllables and special markers.

    Each chunk returned by lazy_pinyin() is kept as-is, except chunks that
    start with "S", "A" or "-": those are scanned for the markers "-", "AP"
    and "SP", and only the markers found are kept (one list item each).
    """
    tokens = []
    for chunk in lazy_pinyin(text):
        if chunk[0] in ("S", "A", "-"):
            # Marker run such as "APSP-": emit each marker separately.
            tokens.extend(re.findall(r"-|AP|SP", chunk))
        else:
            tokens.append(chunk)
    return tokens
# Hiragana grouped by the vowel they end in: plain, voiced, semi-voiced and
# small kana. Keys are the vowels, values are every kana carrying that vowel.
_VOWEL_ROWS = {
    "あ": "あかさたなはまやらわがざだばぱぁゃゎ",
    "い": "いきしちにひみりぎじぢびぴぃ",
    "う": "うくすつぬふむゆるぐずづぶぷゔぅゅ",
    "え": "えけせてねへめれげぜでべぺぇ",
    "お": "おこそとのほもよろをごぞどぼぽぉょ",
}

# Flattened lookup: kana -> the vowel that a following "ー" should become.
_KANA_VOWELS = {
    kana: vowel for vowel, row in _VOWEL_ROWS.items() for kana in row
}


def replace_chouonpu(hiragana_text: str) -> str:
    """Replace each long-vowel mark "ー" with the vowel it prolongs.

    The mark is rewritten to the vowel of the character emitted just before
    it, so "らーめん" becomes "らあめん" and "きょー" becomes "きょお".
    Consecutive marks each repeat the same vowel.

    Args:
        hiragana_text: Text in hiragana, possibly containing "ー".

    Returns:
        The text with every non-leading "ー" replaced. A leading "ー" has
        nothing to prolong and is kept. A preceding character with no known
        vowel (for example "ん") is simply repeated.
    """
    out = []
    for char in hiragana_text:
        if char == "ー" and out:
            # Look at what was emitted, not at the input, so that a run of
            # marks keeps extending the already-resolved vowel.
            prev_char = out[-1]
            out.append(_KANA_VOWELS.get(prev_char, prev_char))
        else:
            out.append(char)
    return "".join(out)
def _split_kana_units(word: str) -> list[str]:
    """Split *word* into kana units, keeping kana + small-kana pairs whole."""
    units = []
    pos = 0
    while pos < len(word):
        # A following small kana (a key of yoon_map) belongs to this character.
        has_small = pos + 1 < len(word) and word[pos + 1] in yoon_map
        width = 2 if has_small else 1
        units.append(word[pos : pos + width])
        pos += width
    return units


def to_kana(text: str) -> list[str]:
    """Convert Japanese *text* into a flat list of hiragana units.

    Spaces are removed, the text is converted with
    kanji_to_kana.to_hiragana(), long-vowel marks are expanded by
    replace_chouonpu(), and the result is split on " " into words. Each word
    is then cut into single kana, except that a kana followed by a small
    kana listed in yoon_map is emitted as one two-character unit.

    Words that end in such a pair, and empty words, are handled without
    indexing past the end of the word.

    Args:
        text: Japanese input text.

    Returns:
        The kana units of all words, in order.
    """
    hiragana_text = kanji_to_kana.to_hiragana(text.replace(" ", ""))
    words = replace_chouonpu(hiragana_text).split(" ")
    kana_units = []
    for word in words:
        kana_units.extend(_split_kana_units(word))
    return kana_units
def kana_to_phonemes_openjtalk(kana: str) -> list[str]:
    """Convert *kana* to a list of phonemes with pyopenjtalk.g2p().

    The characters of *kana* are joined with single spaces before being
    passed to g2p() (called with kana=False), and the returned string is
    split on " ".

    Raises:
        ValueError: If g2p() emits a warning whose message contains
            "No phoneme".
    """
    import pyopenjtalk

    # add space between each character
    spaced = " ".join(kana)
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # phones is a str object separated by space
        phones = pyopenjtalk.g2p(spaced, kana=False)
    for item in caught:
        if "No phoneme" in str(item.message):
            raise ValueError(f"No phoneme found for {spaced}. {item.message}")
    return phones.split(" ")
def pinyin_to_phonemes_opencpop(pinyin: str) -> list[str]:
    """Look up the phonemes for *pinyin* in the "zh" ACE plan.

    The syllable is lower-cased, then looked up in the plan's "dict". If it
    is absent there but present in "syllable_alias", the alias target is
    looked up in "dict" instead.

    Raises:
        ValueError: If the syllable is in neither "dict" nor
            "syllable_alias".
    """
    syllable = pinyin.lower()
    phoneme_table = ace_phonemes_zh_plan["dict"]
    if syllable in phoneme_table:
        return phoneme_table[syllable]
    aliases = ace_phonemes_zh_plan["syllable_alias"]
    if syllable in aliases:
        return phoneme_table[aliases[syllable]]
    raise ValueError(f"{syllable} not registered in Opencpop phoneme dict")
def pinyin_to_phonemes_ace(pinyin: str) -> list[str]:
    """Look up the phonemes for *pinyin* in PINYIN_DICT.

    The syllable is lower-cased before the lookup.

    Raises:
        ValueError: If the lower-cased syllable is not a key of PINYIN_DICT.
    """
    syllable = pinyin.lower()
    if syllable not in PINYIN_DICT:
        raise ValueError(f"{syllable} not registered in ACE phoneme dict")
    return PINYIN_DICT[syllable]