formosan-f5-tts / ipa /__init__.py
txya900619's picture
feat: rename EXTRA_G2P to EXTRA_FORMOSAN_G2P and update load_g2p function to handle lists, enhance text processing
3872616
raw
history blame
1.79 kB
import csv
from io import BytesIO
import requests
from omegaconf import OmegaConf
EXTRA_FORMOSAN_G2P = {
"z": "z",
"o": "o",
"h": "h",
"g": "g",
"y": "j",
"w": "w",
"c": "ʦ",
"u": "u",
"f": "f",
"v": "v",
"j": "ɟ",
"b": "b",
"q": "q",
"e": "e",
"l": "l",
"d": "d",
}
def gh_download(repo, path):
headers = {
"Accept": "application/vnd.github.raw+json",
}
url = f"https://api.github.com/repos/{repo}/contents/{path}"
response = requests.get(url, headers=headers)
if response.status_code != 200:
raise Exception(f"Failed to download {path} from {repo}, response: {response}")
response.encoding = "utf-8-sig"
return response.text
def load_g2p(g2p_string):
g2p = dict()
csv_reader = csv.DictReader(g2p_string.split("\n"))
for row in csv_reader:
language = row["Language"]
dialect = row["Dialect"]
if dialect == "-":
lang_tag = f"{language}"
else:
lang_tag = f"{language}_{dialect}"
for key in row:
if key in ["Language", "Dialect"]:
continue
if row[key] == "-":
continue
g2p[lang_tag] = g2p.get(lang_tag, {})
g2p[lang_tag][key] = row[key].split(",")
for g, p in EXTRA_FORMOSAN_G2P.items():
if g not in g2p[lang_tag]:
g2p[lang_tag][g] = p
for lang_tag in g2p:
# 按照 key 的字元長度排序
g2p[lang_tag] = dict(
sorted(g2p[lang_tag].items(), key=lambda x: len(x[0]), reverse=True)
)
return g2p
OmegaConf.register_new_resolver("gh_download", gh_download)
OmegaConf.register_new_resolver("load_g2p", load_g2p)