Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,793 Bytes
0c074b9 3872616 0c074b9 3872616 0c074b9 3872616 0c074b9 3872616 0c074b9 3872616 0c074b9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import csv
from io import BytesIO
import requests
from omegaconf import OmegaConf
# Fallback grapheme -> phoneme pairs. load_g2p() adds each pair to every
# language table that has no entry of its own for that grapheme.
# Entry order is kept as-is: all keys have length 1, and load_g2p() orders
# tables with a stable sort, so this order carries through to the result.
EXTRA_FORMOSAN_G2P = {
    "z": "z",
    "o": "o",
    "h": "h",
    "g": "g",
    "y": "j",  # phoneme symbol differs from the letter
    "w": "w",
    "c": "ʦ",  # phoneme symbol differs from the letter
    "u": "u",
    "f": "f",
    "v": "v",
    "j": "ɟ",  # phoneme symbol differs from the letter
    "b": "b",
    "q": "q",
    "e": "e",
    "l": "l",
    "d": "d",
}
def gh_download(repo, path, timeout=30):
    """Download one file from a GitHub repository and return it as text.

    The request goes to the GitHub REST "contents" endpoint and asks for the
    ``application/vnd.github.raw+json`` media type, i.e. the raw file body.

    Args:
        repo: Repository identifier interpolated into the API URL after
            ``/repos/`` (GitHub expects ``owner/name``).
        path: Path of the file inside the repository.
        timeout: Seconds to wait on the server before giving up. ``requests``
            applies no timeout by default, so without this a stalled
            connection would block the caller indefinitely.

    Returns:
        The response body decoded as UTF-8, with a leading byte-order mark
        removed if one is present.

    Raises:
        Exception: If the server answers with any status other than 200.
        requests.exceptions.RequestException: If the request itself fails,
            including when ``timeout`` expires.
    """
    headers = {
        "Accept": "application/vnd.github.raw+json",
    }
    url = f"https://api.github.com/repos/{repo}/contents/{path}"

    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download {path} from {repo}, response: {response}")

    # "utf-8-sig" decodes as UTF-8 and drops a leading BOM, so the first
    # column header of a downloaded CSV is not polluted by "\ufeff".
    response.encoding = "utf-8-sig"
    return response.text
def load_g2p(g2p_string):
    """Parse a grapheme-to-phoneme CSV into one lookup table per language.

    The CSV needs ``Language`` and ``Dialect`` columns. Every other column
    header is used as a grapheme, and its cell is split on ``","`` to give
    that grapheme's phonemes. A cell of ``"-"`` means "no entry for this
    grapheme"; a dialect of ``"-"`` means the row has no dialect.

    Args:
        g2p_string: The whole CSV document as one string.

    Returns:
        A dict keyed by ``"<Language>"`` or ``"<Language>_<Dialect>"``. Each
        value is a dict from grapheme to phonemes, ordered from the longest
        grapheme to the shortest. Graphemes taken from the CSV map to a list
        of strings; graphemes filled in from ``EXTRA_FORMOSAN_G2P`` keep that
        table's value as-is.

    Raises:
        KeyError: If a data row has no ``Language`` or ``Dialect`` column.
    """
    g2p = {}

    for row in csv.DictReader(g2p_string.split("\n")):
        language = row["Language"]
        dialect = row["Dialect"]
        lang_tag = f"{language}" if dialect == "-" else f"{language}_{dialect}"

        # Create the table before reading any cell. A row whose grapheme
        # cells are all "-" used to leave no table behind, and the fallback
        # loop below then failed with KeyError.
        table = g2p.setdefault(lang_tag, {})

        for key, value in row.items():
            if key in ("Language", "Dialect") or value == "-":
                continue
            table[key] = value.split(",")

        # Fill in fallback graphemes the CSV row did not define itself.
        for g, p in EXTRA_FORMOSAN_G2P.items():
            table.setdefault(g, p)

    # Order each table by grapheme length, longest first. The sort is stable,
    # so graphemes of equal length keep their insertion order.
    # NOTE(review): presumably so multi-character graphemes are tried before
    # shorter ones during matching; confirm against the consumer.
    return {
        lang_tag: dict(
            sorted(table.items(), key=lambda item: len(item[0]), reverse=True)
        )
        for lang_tag, table in g2p.items()
    }
# Register both helpers as OmegaConf resolvers under their own names.
# This runs at import time, so importing the module performs the registration.
OmegaConf.register_new_resolver(name="gh_download", resolver=gh_download)
OmegaConf.register_new_resolver(name="load_g2p", resolver=load_g2p)
|