Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
3872616
1
Parent(s):
14f96db
feat: rename EXTRA_G2P to EXTRA_FORMOSAN_G2P and update load_g2p function to handle lists, enhance text processing
Browse files- ipa/__init__.py +11 -4
- ipa/ipa.py +116 -35
ipa/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from io import BytesIO
|
|
4 |
import requests
|
5 |
from omegaconf import OmegaConf
|
6 |
|
7 |
-
|
8 |
"z": "z",
|
9 |
"o": "o",
|
10 |
"h": "h",
|
@@ -19,7 +19,8 @@ EXTRA_G2P = {
|
|
19 |
"b": "b",
|
20 |
"q": "q",
|
21 |
"e": "e",
|
22 |
-
"
|
|
|
23 |
}
|
24 |
|
25 |
|
@@ -59,12 +60,18 @@ def load_g2p(g2p_string):
|
|
59 |
continue
|
60 |
|
61 |
g2p[lang_tag] = g2p.get(lang_tag, {})
|
62 |
-
g2p[lang_tag][key] = row[key].split(",")
|
63 |
|
64 |
-
for g, p in
|
65 |
if g not in g2p[lang_tag]:
|
66 |
g2p[lang_tag][g] = p
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
return g2p
|
69 |
|
70 |
|
|
|
4 |
import requests
|
5 |
from omegaconf import OmegaConf
|
6 |
|
7 |
+
EXTRA_FORMOSAN_G2P = {
|
8 |
"z": "z",
|
9 |
"o": "o",
|
10 |
"h": "h",
|
|
|
19 |
"b": "b",
|
20 |
"q": "q",
|
21 |
"e": "e",
|
22 |
+
"l": "l",
|
23 |
+
"d": "d",
|
24 |
}
|
25 |
|
26 |
|
|
|
60 |
continue
|
61 |
|
62 |
g2p[lang_tag] = g2p.get(lang_tag, {})
|
63 |
+
g2p[lang_tag][key] = row[key].split(",")
|
64 |
|
65 |
+
for g, p in EXTRA_FORMOSAN_G2P.items():
|
66 |
if g not in g2p[lang_tag]:
|
67 |
g2p[lang_tag][g] = p
|
68 |
|
69 |
+
for lang_tag in g2p:
|
70 |
+
# 按照 key 的字元長度排序
|
71 |
+
g2p[lang_tag] = dict(
|
72 |
+
sorted(g2p[lang_tag].items(), key=lambda x: len(x[0]), reverse=True)
|
73 |
+
)
|
74 |
+
|
75 |
return g2p
|
76 |
|
77 |
|
ipa/ipa.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
import re
|
|
|
2 |
|
3 |
import gradio as gr
|
4 |
from omegaconf import OmegaConf
|
@@ -7,52 +8,132 @@ g2p_config = OmegaConf.load("configs/g2p.yaml")
|
|
7 |
g2p_object = OmegaConf.to_object(g2p_config)["g2p"]
|
8 |
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
def text_to_ipa(
|
11 |
text: str, language: str, ignore_punctuation=False, ipa_with_ng=False
|
12 |
) -> str:
|
13 |
-
text = text
|
14 |
-
text = text.replace("'", "’")
|
15 |
text = re.sub(r"\s+", " ", text) # remove extra spaces
|
16 |
-
words = text.split() # change in future
|
17 |
-
|
18 |
-
print(f"text: {words}")
|
19 |
-
|
20 |
-
ipa = []
|
21 |
-
unknown_chars = set()
|
22 |
-
extended_g2p = {**g2p_object[language], ",": ",", ".": ".", "?": "?", "!": "!"}
|
23 |
-
extended_g2p_sorted_keys = sorted(extended_g2p.keys(), key=len, reverse=True)
|
24 |
-
for word in words:
|
25 |
-
unknown_char = word
|
26 |
-
converted_word = word
|
27 |
-
for key in extended_g2p_sorted_keys:
|
28 |
-
unknown_char = unknown_char.replace(key, "")
|
29 |
-
converted_word = converted_word.replace(key, extended_g2p[key])
|
30 |
-
|
31 |
-
if len(unknown_char) > 0: # If there are unknown characters
|
32 |
-
unknown_chars.update(set(unknown_char))
|
33 |
-
continue
|
34 |
|
35 |
-
|
36 |
|
37 |
if len(unknown_chars) > 0:
|
38 |
raise gr.Error(
|
39 |
f"Unknown characters: {', '.join(unknown_chars)}. Please remove them and try again."
|
40 |
)
|
41 |
|
42 |
-
ipa = (
|
43 |
-
" ".join(ipa)
|
44 |
-
.replace("ʦ", "t͡s")
|
45 |
-
.replace("ʨ", "t͡ɕ")
|
46 |
-
.replace("R", "ʀ")
|
47 |
-
.replace("ʤ", "d͡ʒ")
|
48 |
-
# .replace("g", "ɡ")
|
49 |
-
)
|
50 |
-
|
51 |
-
if ignore_punctuation:
|
52 |
-
ipa = re.sub(r"[.?!,]", "", ipa)
|
53 |
-
|
54 |
-
if ipa_with_ng:
|
55 |
-
ipa = ipa.replace("ŋ", "nɡ")
|
56 |
|
57 |
print(f"ipa: {ipa}")
|
58 |
return ipa
|
|
|
1 |
import re
|
2 |
+
from typing import Optional, Tuple
|
3 |
|
4 |
import gradio as gr
|
5 |
from omegaconf import OmegaConf
|
|
|
8 |
g2p_object = OmegaConf.to_object(g2p_config)["g2p"]
|
9 |
|
10 |
|
11 |
+
def lower_formosan_text(raw_text: str, language: str) -> str:
    """Lowercase *raw_text* character by character, with per-language exceptions.

    Surrounding whitespace is stripped first. Each remaining character is
    lowercased individually, except:

    * ``"賽夏"``: a capital ``S`` is kept as-is unless it is the very first
      character of the stripped text, where it is lowercased.
    * ``"噶瑪蘭"``: a capital ``R`` is always kept as-is.

    Any other ``language`` value lowercases every character.

    Args:
        raw_text: Text to normalise.
        language: Language name selecting the casing rule.

    Returns:
        The stripped, case-normalised text.
    """
    stripped = raw_text.strip()

    if language == "賽夏":
        # Only a text-initial "S" is lowercased; every later "S" is preserved.
        return "".join(
            char if char == "S" and index != 0 else char.lower()
            for index, char in enumerate(stripped)
        )

    if language == "噶瑪蘭":
        return "".join(char if char == "R" else char.lower() for char in stripped)

    return "".join(char.lower() for char in stripped)
|
33 |
+
|
34 |
+
|
35 |
+
def replace_to_list(text: str, g2p: dict) -> Tuple[list, set]:
    """Segment *text* with greedy longest-key matching against *g2p*.

    The text is scanned left to right. At each position the longest key of
    ``g2p`` that matches there is consumed and the first element of its
    mapped value (``g2p[key][0]``) is appended to the result. Characters that
    no key matches are collected as out-of-vocabulary (OOV); consecutive OOV
    characters are emitted as a single result item.

    Args:
        text: The string to segment.
        g2p: Mapping from grapheme keys to indexable values. Empty keys are
            ignored.

    Returns:
        A ``(segments, oovs)`` tuple: the list of replaced/unmatched segments
        in text order, and the set of individual characters no key matched.
    """
    # Try longer keys first so a multi-character key always wins over its own
    # prefix, whatever order the caller's dict is in. sorted() is stable, so a
    # mapping already ordered longest-first is scanned in its original order.
    candidates = sorted((key for key in g2p if key), key=len, reverse=True)

    result = []
    pending = []  # run of unmatched characters not yet flushed to result
    oovs = set()

    i = 0
    while i < len(text):
        matched = next((key for key in candidates if text.startswith(key, i)), None)

        if matched is None:
            # No key matches here: record the character as out-of-vocabulary.
            pending.append(text[i])
            oovs.add(text[i])
            i += 1
            continue

        # Flush any unmatched run before appending the replacement.
        if pending:
            result.append("".join(pending))
            pending.clear()

        result.append(g2p[matched][0])
        i += len(matched)

    # Do not lose a trailing run of unmatched characters.
    if pending:
        result.append("".join(pending))

    return result, oovs
|
92 |
+
|
93 |
+
|
94 |
+
def convert_to_ipa(
    text: str, g2p: dict, end_punctuations: Optional[list] = None
) -> Tuple[Optional[str], list]:
    """Convert whitespace-separated words of *text* through *g2p*.

    Each word has at most one trailing punctuation mark split off, is
    segmented with ``replace_to_list``, and is re-joined with that
    punctuation mark appended. Words containing characters that no key
    matches are dropped and their unmatched characters are collected.

    Args:
        text: Input text; words are separated by whitespace.
        g2p: Grapheme mapping passed through to ``replace_to_list``.
        end_punctuations: Characters treated as word-final punctuation.
            Defaults to ``["!", "?", ".", ";", ","]``.

    Returns:
        ``(ipa, [])`` with the converted words joined by single spaces on
        success, or ``(None, oovs)`` with the sorted unmatched characters
        when any word failed. ``(None, [])`` is returned when *text*
        contains no words at all.
    """
    # Build the default per call instead of sharing one mutable list object
    # across every invocation.
    if end_punctuations is None:
        end_punctuations = ["!", "?", ".", ";", ","]

    converted_words = []
    unknown = set()

    for word in text.split():
        # str.split() never yields empty strings, so word[-1] is safe.
        ending_punct = ""
        if word[-1] in end_punctuations:
            ending_punct = word[-1]
            word = word[:-1]

        ipa_list, oovs = replace_to_list(word, g2p)
        if oovs:
            unknown.update(oovs)
            continue

        converted_words.append("".join(ipa_list) + ending_punct)

    if unknown or not converted_words:
        return None, sorted(unknown)

    return " ".join(converted_words), []
|
120 |
+
|
121 |
+
|
122 |
def text_to_ipa(
    text: str, language: str, ignore_punctuation=False, ipa_with_ng=False
) -> str:
    """Convert *text* in the given *language* to an IPA string.

    The text is case-normalised with ``lower_formosan_text``, runs of
    whitespace are collapsed to single spaces, and the result is converted
    with ``convert_to_ipa`` using ``g2p_object[language]``. Finally the
    ligature affricate symbols are expanded to tie-bar sequences.

    Args:
        text: Input text.
        language: Key into ``g2p_object`` selecting the grapheme mapping.
        ignore_punctuation: Accepted for interface compatibility.
        ipa_with_ng: Accepted for interface compatibility.

    Returns:
        The IPA transcription, or an empty string when *text* contains no
        words.

    Raises:
        gr.Error: If the text contains characters the mapping does not cover.
    """
    # NOTE(review): ignore_punctuation and ipa_with_ng are not read anywhere
    # in this body - confirm whether that is intended.
    text = lower_formosan_text(text, language)
    # text = text.replace("'", "’")
    text = re.sub(r"\s+", " ", text)  # remove extra spaces

    ipa, unknown_chars = convert_to_ipa(text, g2p_object[language])

    if len(unknown_chars) > 0:
        raise gr.Error(
            f"Unknown characters: {', '.join(unknown_chars)}. Please remove them and try again."
        )

    # convert_to_ipa returns None without unknown characters when the text has
    # no words (empty or whitespace-only input); treat that as an empty result
    # instead of calling .replace on None.
    if ipa is None:
        ipa = ""

    ipa = ipa.replace("ʦ", "t͡s").replace("ʨ", "t͡ɕ").replace("ʤ", "d͡ʒ")

    print(f"ipa: {ipa}")
    return ipa
|