txya900619 committed on
Commit
3872616
·
1 Parent(s): 14f96db

feat: rename EXTRA_G2P to EXTRA_FORMOSAN_G2P and update load_g2p function to handle lists, enhance text processing

Browse files
Files changed (2) hide show
  1. ipa/__init__.py +11 -4
  2. ipa/ipa.py +116 -35
ipa/__init__.py CHANGED
@@ -4,7 +4,7 @@ from io import BytesIO
4
  import requests
5
  from omegaconf import OmegaConf
6
 
7
- EXTRA_G2P = {
8
  "z": "z",
9
  "o": "o",
10
  "h": "h",
@@ -19,7 +19,8 @@ EXTRA_G2P = {
19
  "b": "b",
20
  "q": "q",
21
  "e": "e",
22
- ",": ",",
 
23
  }
24
 
25
 
@@ -59,12 +60,18 @@ def load_g2p(g2p_string):
59
  continue
60
 
61
  g2p[lang_tag] = g2p.get(lang_tag, {})
62
- g2p[lang_tag][key] = row[key].split(",")[0]
63
 
64
- for g, p in EXTRA_G2P.items():
65
  if g not in g2p[lang_tag]:
66
  g2p[lang_tag][g] = p
67
 
 
 
 
 
 
 
68
  return g2p
69
 
70
 
 
4
  import requests
5
  from omegaconf import OmegaConf
6
 
7
+ EXTRA_FORMOSAN_G2P = {
8
  "z": "z",
9
  "o": "o",
10
  "h": "h",
 
19
  "b": "b",
20
  "q": "q",
21
  "e": "e",
22
+ "l": "l",
23
+ "d": "d",
24
  }
25
 
26
 
 
60
  continue
61
 
62
  g2p[lang_tag] = g2p.get(lang_tag, {})
63
+ g2p[lang_tag][key] = row[key].split(",")
64
 
65
+ for g, p in EXTRA_FORMOSAN_G2P.items():
66
  if g not in g2p[lang_tag]:
67
  g2p[lang_tag][g] = p
68
 
69
+ for lang_tag in g2p:
70
+ # 按照 key 的字元長度排序
71
+ g2p[lang_tag] = dict(
72
+ sorted(g2p[lang_tag].items(), key=lambda x: len(x[0]), reverse=True)
73
+ )
74
+
75
  return g2p
76
 
77
 
ipa/ipa.py CHANGED
@@ -1,4 +1,5 @@
1
  import re
 
2
 
3
  import gradio as gr
4
  from omegaconf import OmegaConf
@@ -7,52 +8,132 @@ g2p_config = OmegaConf.load("configs/g2p.yaml")
7
  g2p_object = OmegaConf.to_object(g2p_config)["g2p"]
8
 
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def text_to_ipa(
11
  text: str, language: str, ignore_punctuation=False, ipa_with_ng=False
12
  ) -> str:
13
- text = text.lower()
14
- text = text.replace("'", "’")
15
  text = re.sub(r"\s+", " ", text) # remove extra spaces
16
- words = text.split() # change in future
17
-
18
- print(f"text: {words}")
19
-
20
- ipa = []
21
- unknown_chars = set()
22
- extended_g2p = {**g2p_object[language], ",": ",", ".": ".", "?": "?", "!": "!"}
23
- extended_g2p_sorted_keys = sorted(extended_g2p.keys(), key=len, reverse=True)
24
- for word in words:
25
- unknown_char = word
26
- converted_word = word
27
- for key in extended_g2p_sorted_keys:
28
- unknown_char = unknown_char.replace(key, "")
29
- converted_word = converted_word.replace(key, extended_g2p[key])
30
-
31
- if len(unknown_char) > 0: # If there are unknown characters
32
- unknown_chars.update(set(unknown_char))
33
- continue
34
 
35
- ipa.append(converted_word)
36
 
37
  if len(unknown_chars) > 0:
38
  raise gr.Error(
39
  f"Unknown characters: {', '.join(unknown_chars)}. Please remove them and try again."
40
  )
41
 
42
- ipa = (
43
- " ".join(ipa)
44
- .replace("ʦ", "t͡s")
45
- .replace("ʨ", "t͡ɕ")
46
- .replace("R", "ʀ")
47
- .replace("ʤ", "d͡ʒ")
48
- # .replace("g", "ɡ")
49
- )
50
-
51
- if ignore_punctuation:
52
- ipa = re.sub(r"[.?!,]", "", ipa)
53
-
54
- if ipa_with_ng:
55
- ipa = ipa.replace("ŋ", "nɡ")
56
 
57
  print(f"ipa: {ipa}")
58
  return ipa
 
1
  import re
2
+ from typing import Optional, Tuple
3
 
4
  import gradio as gr
5
  from omegaconf import OmegaConf
 
8
  g2p_object = OmegaConf.to_object(g2p_config)["g2p"]
9
 
10
 
11
def lower_formosan_text(raw_text: str, language: str) -> str:
    """Lowercase ``raw_text`` while keeping language-specific capital letters.

    Surrounding whitespace is stripped first, then every character is
    lowercased individually, with two exceptions:

    * ``language == "賽夏"``: a capital ``S`` is kept as-is unless it is the
      very first character of the stripped text, in which case it is lowered.
    * ``language == "噶瑪蘭"``: a capital ``R`` is always kept as-is.

    NOTE(review): presumably ``S`` / ``R`` are distinct letters from ``s`` /
    ``r`` in those orthographies -- confirm against the g2p tables.

    Args:
        raw_text: Input text in the given language.
        language: Language name used to select the casing rule.

    Returns:
        The stripped, case-normalized text.
    """
    stripped = raw_text.strip()

    if language == "賽夏":
        # A leading "S" is lowered like any other letter; any later "S"
        # keeps its case.
        return "".join(
            ch if ch == "S" and idx != 0 else ch.lower()
            for idx, ch in enumerate(stripped)
        )

    if language == "噶瑪蘭":
        return "".join(ch if ch == "R" else ch.lower() for ch in stripped)

    # Lowercase per character (not ``stripped.lower()``) so the result matches
    # character-wise lowering exactly.
    return "".join(ch.lower() for ch in stripped)
33
+
34
+
35
def replace_to_list(text: str, g2p: dict) -> Tuple[list, set]:
    """Split ``text`` into graphemes found in ``g2p`` and map each to a phoneme.

    Scanning left to right, the longest key of ``g2p`` that matches at the
    current position is consumed, so the result does not depend on the
    iteration order of ``g2p``. A character at which no key matches is
    out-of-vocabulary (OOV); consecutive OOV characters are kept verbatim as
    one item of the result.

    Args:
        text: The string to convert.
        g2p: Mapping from grapheme to phoneme(s). A value may be a sequence of
            candidate phonemes (the first one is used) or a plain string
            (used as a whole rather than truncated to its first character).

    Returns:
        A tuple ``(pieces, oovs)`` where ``pieces`` is the list of phonemes
        and verbatim OOV runs in input order, and ``oovs`` is the set of
        individual OOV characters encountered.
    """
    # Keys longer than this cannot match; empty keys never consume input and
    # are therefore never tried.
    longest_key = max((len(key) for key in g2p), default=0)

    pieces = []
    pending_oov = []  # current run of unmatched characters
    oovs = set()

    pos = 0
    text_len = len(text)
    while pos < text_len:
        # Try candidate substrings from longest to shortest (greedy match).
        matched_key = None
        for size in range(min(longest_key, text_len - pos), 0, -1):
            candidate = text[pos : pos + size]
            if candidate in g2p:
                matched_key = candidate
                break

        if matched_key is None:
            pending_oov.append(text[pos])
            oovs.add(text[pos])
            pos += 1
            continue

        # Flush the unmatched run before emitting the converted grapheme so
        # that the output order follows the input order.
        if pending_oov:
            pieces.append("".join(pending_oov))
            pending_oov.clear()

        phonemes = g2p[matched_key]
        pieces.append(phonemes if isinstance(phonemes, str) else phonemes[0])
        pos += len(matched_key)

    if pending_oov:
        pieces.append("".join(pending_oov))

    return pieces, oovs
92
+
93
+
94
def convert_to_ipa(
    text: str, g2p: dict, end_punctuations: Optional[list] = None
) -> Tuple[Optional[str], list]:
    """Convert whitespace-separated words of ``text`` to an IPA string.

    Each word has at most one trailing punctuation mark split off, is
    converted with ``replace_to_list``, and gets the punctuation mark
    re-attached afterwards.

    Args:
        text: Text to convert; words are separated by whitespace.
        g2p: Grapheme-to-phoneme mapping passed to ``replace_to_list``.
        end_punctuations: Characters treated as word-final punctuation.
            Defaults to ``["!", "?", ".", ";", ","]``.

    Returns:
        ``(ipa, [])`` with the converted words joined by single spaces on
        success. ``(None, oovs)`` when any word contained out-of-vocabulary
        characters (``oovs`` is their sorted list) or when ``text`` contained
        no words at all (``oovs`` is then empty).
    """
    # Built per call instead of using a mutable list as the default value.
    if end_punctuations is None:
        end_punctuations = ["!", "?", ".", ";", ","]

    converted_words = []
    unknown = set()

    for word in text.split():
        trailing = ""
        # str.split() without arguments never yields empty strings, so
        # indexing the last character is safe.
        if word[-1] in end_punctuations:
            trailing, word = word[-1], word[:-1]

        pieces, oovs = replace_to_list(word, g2p)
        if oovs:
            # Keep scanning so that every unknown character is reported.
            unknown.update(oovs)
            continue

        converted_words.append("".join(pieces) + trailing)

    if unknown or not converted_words:
        return None, sorted(unknown)

    return " ".join(converted_words), []
120
+
121
+
122
def text_to_ipa(
    text: str, language: str, ignore_punctuation=False, ipa_with_ng=False
) -> str:
    """Convert ``text`` written in ``language`` to an IPA transcription.

    The text is case-normalized with ``lower_formosan_text``, whitespace runs
    are collapsed, and the words are converted with ``convert_to_ipa`` using
    ``g2p_object[language]``. Affricate ligatures in the result are expanded
    to tie-bar sequences.

    Args:
        text: Input text.
        language: Key into ``g2p_object`` selecting the g2p table.
        ignore_punctuation: If true, remove ``. ? ! , ;`` from the result.
        ipa_with_ng: If true, write ``ŋ`` as ``nɡ`` in the result.

    Returns:
        The IPA string; empty when ``text`` contains no words.

    Raises:
        gr.Error: If ``text`` contains characters missing from the g2p table.
    """
    text = lower_formosan_text(text, language)
    text = re.sub(r"\s+", " ", text)  # remove extra spaces

    ipa, unknown_chars = convert_to_ipa(text, g2p_object[language])

    if unknown_chars:
        raise gr.Error(
            f"Unknown characters: {', '.join(unknown_chars)}. Please remove them and try again."
        )

    if ipa is None:
        # convert_to_ipa returns (None, []) when the text has no words
        # (empty or whitespace-only input); treat that as an empty result
        # instead of failing on None below.
        ipa = ""

    ipa = ipa.replace("ʦ", "t͡s").replace("ʨ", "t͡ɕ").replace("ʤ", "d͡ʒ")

    # Honor the two flags of the signature; without these steps they would be
    # accepted but have no effect.
    if ignore_punctuation:
        ipa = re.sub(r"[.?!,;]", "", ipa)

    if ipa_with_ng:
        ipa = ipa.replace("ŋ", "nɡ")

    print(f"ipa: {ipa}")
    return ipa