Spaces:
Running
on
Zero
Running
on
Zero
feat: rename EXTRA_G2P to EXTRA_FORMOSAN_G2P and update load_g2p function to handle lists, enhance text processing
3872616
import re | |
from typing import Optional, Tuple | |
import gradio as gr | |
from omegaconf import OmegaConf | |
g2p_config = OmegaConf.load("configs/g2p.yaml") | |
g2p_object = OmegaConf.to_object(g2p_config)["g2p"] | |
def lower_formosan_text(raw_text: str, language: str) -> str: | |
text = list(raw_text.strip()) | |
if language == "賽夏": | |
for i, char in enumerate(text): | |
if char == "S": | |
if i == 0: | |
text[i] = char.lower() | |
else: | |
text[i] = char.lower() | |
elif language == "噶瑪蘭": | |
for i, char in enumerate(text): | |
if char == "R": | |
text[i] = char | |
else: | |
text[i] = char.lower() | |
else: | |
for i, char in enumerate(text): | |
text[i] = char.lower() | |
text = "".join(text) | |
return text | |
def replace_to_list(text: str, g2p: dict) -> Tuple[list, set]: | |
# 創建標記陣列,記錄哪些位置已被處理 | |
marked = [False] * len(text) | |
# 創建結果列表和臨時緩衝區 | |
result = [] | |
buffer = "" | |
oovs = set() | |
# 處理文本 | |
i = 0 | |
while i < len(text): | |
# 如果當前位置已經被處理過,跳過 | |
if marked[i]: | |
i += 1 | |
continue | |
# 尋找匹配的 key | |
found_key = None | |
found_pos = -1 | |
for key in g2p: | |
# 檢查當前位置是否匹配 key | |
if i + len(key) <= len(text) and text[i : i + len(key)] == key: | |
# 檢查這個範圍是否已有部分被處理過 | |
if not any(marked[i : i + len(key)]): | |
found_key = key | |
found_pos = i | |
break | |
# 如果找到匹配的 key | |
if found_key: | |
# 先保存緩衝區中的內容(如果有) | |
if buffer: | |
result.append(buffer) | |
buffer = "" | |
# 添加替換後的值到結果列表 | |
result.append(g2p[found_key][0]) | |
# 標記已處理的位置 | |
for j in range(found_pos, found_pos + len(found_key)): | |
marked[j] = True | |
# 移到下一個未處理的位置 | |
i = found_pos + len(found_key) | |
else: | |
# 沒有匹配的 key,添加到緩衝區 | |
buffer += text[i] | |
oovs.add(text[i]) | |
i += 1 | |
# 不要忘記添加最後的緩衝區內容 | |
if buffer: | |
result.append(buffer) | |
return result, oovs | |
def convert_to_ipa( | |
text: str, g2p: dict, end_punctuations: list = ["!", "?", ".", ";", ","] | |
) -> Tuple[Optional[str], list]: | |
result_list = [] | |
oovs_to_ipa = set() | |
for word in text.split(): | |
ending_punct = "" | |
if word and word[-1] in end_punctuations: | |
ending_punct = word[-1] | |
word = word[:-1] | |
ipa_list, oovs = replace_to_list(word, g2p) | |
if len(oovs): | |
oovs_to_ipa.update(oovs) | |
continue | |
ipa_string = "".join(ipa_list) + ending_punct | |
result_list.append(ipa_string) | |
if len(oovs_to_ipa) or len(result_list) == 0: | |
return None, sorted(oovs_to_ipa) | |
result = " ".join(result_list) | |
return result, [] | |
def text_to_ipa( | |
text: str, language: str, ignore_punctuation=False, ipa_with_ng=False | |
) -> str: | |
text = lower_formosan_text(text, language) | |
# text = text.replace("'", "’") | |
text = re.sub(r"\s+", " ", text) # remove extra spaces | |
ipa, unknown_chars = convert_to_ipa(text, g2p_object[language]) | |
if len(unknown_chars) > 0: | |
raise gr.Error( | |
f"Unknown characters: {', '.join(unknown_chars)}. Please remove them and try again." | |
) | |
ipa = ipa.replace("ʦ", "t͡s").replace("ʨ", "t͡ɕ").replace("ʤ", "d͡ʒ") | |
print(f"ipa: {ipa}") | |
return ipa | |