Use simple tokenizer (#34)
* Fix a couple of cases
* Update formatter.py
---------
Co-authored-by: Yurii Paniv <[email protected]>
- tests/test_formatter.py +27 -7
- ukrainian_tts/formatter.py +30 -6
tests/test_formatter.py
CHANGED
@@ -1,7 +1,6 @@
 from ukrainian_tts.formatter import preprocess_text
 import pytest
 
-
 @pytest.mark.parametrize(
     "text,expected",
     [
@@ -21,22 +20,43 @@ import pytest
         ),
         (
             "10000$, 15000 корупціонерів",
-            "десять тисяч
-        ),
+            "десять тисяч доларів, п'ятнадцять тисяч корупціонерів",
+        ),
+        (
+            "10000 $, 15000 корупціонерів",
+            "десять тисяч доларів, п'ятнадцять тисяч корупціонерів",
+        ),
         (
             "$10000, 15000 корупціонерів",
-            "
-        ),
+            "десять тисяч доларів, п'ятнадцять тисяч корупціонерів",
+        ),
         (
             "10000$ у еквіваленті борщових заправок",
             "десять тисяч доларів у еквіваленті борщових заправок",
         ),
+        ("10-30-40-50-5-9-5", "десять-тридцять-сорок-п'ятдесят-п'ять-дев'ять-п'ять"),
+    ],
+)
+def test_formatter(text, expected):
+    assert preprocess_text(text) == expected
+
+# Purpose of these tests: to keep a clearly separate list of known issues
+# in the conversion. Once fixed, these cases should move to test_formatter.
+# We still want to make sure that no changes happen there, as any regression
+# is bad, or interesting.
+@pytest.mark.parametrize(
+    "text,expected",
+    [
+        # Should be два долара
+        (
+            "2 $, 15000 корупціонерів",
+            "два доларів, п'ятнадцять тисяч корупціонерів",
+        ),
         # this is a wrong case; it should be "це дев'ятнадцятирічне вино."
         # Implementing this requires proper parsing of words into a token stream,
         # which requires reworking the current approach.
         ("це 19-річне вино.", "це дев'ятнадцять-річне вино."),
-        ("10-30-40-50-5-9-5", "десять-тридцять-сорок-п'ятдесят-п'ять-дев'ять-п'ять"),
     ],
 )
-def test_formatter(text, expected):
+def test_planned_formatter_issues(text, expected):
     assert preprocess_text(text) == expected
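For a quick local check, the newly covered spacing case can be exercised directly; a minimal sketch, assuming the package is importable from the repo root:

from ukrainian_tts.formatter import preprocess_text

# One of the new cases: a space between the amount and the currency symbol.
print(preprocess_text("10000 $, 15000 корупціонерів"))
# -> десять тисяч доларів, п'ятнадцять тисяч корупціонерів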
ukrainian_tts/formatter.py
CHANGED
@@ -29,6 +29,28 @@ def replace_currency_with_words(text, currency, num_form):
     text = text.replace("€", CURRENCY[currency][num_form])
     return text
 
+def find_any_char(text: str, find: str, start: int):
+    result = -1
+    for c in find:
+        index = text.find(c, start)
+        if (index >= 0) and (result > index or result == -1):
+            result = index
+
+    return result
+
+# Have to check if I can use https://github.com/lang-uk/tokenize-uk
+def simple_tokenizer(text: str):
+    start = 0
+    index = find_any_char(text, " ,", start)
+    while (index >= 0):
+        word = text[start:index]
+        yield word
+        separator = text[index]
+        yield separator
+        start = index + 1
+        index = find_any_char(text, " ,", start)
+
+    yield text[start:]
 
 def preprocess_text(text):
     text = text.lower()
@@ -89,9 +111,12 @@ def preprocess_text(text):
             cleaned_part = part
 
             for part_currency in currencies:
-                cleaned_part = cleaned_part.replace(
-                    part_currency, f" {part_currency} "
-                ).strip()  # TODO: replace with regex
+                if cleaned_part[0] == part_currency:
+                    cleaned_part = cleaned_part[1:] + " " + part_currency
+                else:
+                    cleaned_part = cleaned_part.replace(
+                        part_currency, f" {part_currency} "
+                    ).strip()  # TODO: replace with regex
 
             part = " ".join(
                 [
@@ -119,9 +144,8 @@ def preprocess_text(text):
             result.append(part)
         return "-".join(result)
 
-    # print([detect_num_and_convert(word) for word in text.split(" ")])
-    text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
-
+    # print([detect_num_and_convert(word) for word in simple_tokenizer(text)])
+    text = "".join([detect_num_and_convert(word) for word in simple_tokenizer(text)])
     text = replace_currency_with_words(text, currency, num_form)
 
     # fallback numbers
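For context, a standalone sketch of what the new helpers do; the currencies list below is an illustrative assumption (the real symbols live in the module's CURRENCY table):

from ukrainian_tts.formatter import simple_tokenizer

text = "10000$, 15000 корупціонерів"
tokens = list(simple_tokenizer(text))
# Words and separators alternate; an empty "word" appears between adjacent
# separators, e.g. the "," followed by " ":
# ['10000$', ',', '', ' ', '15000', ' ', 'корупціонерів']
assert "".join(tokens) == text  # lossless round-trip, unlike text.split(" ")

# The new prefix branch moves a leading currency symbol behind the number,
# so "$10000" is normalized to "10000 $" before number conversion:
currencies = ["$", "€"]  # assumption for illustration
cleaned_part = "$10000"
for part_currency in currencies:
    if cleaned_part[0] == part_currency:
        cleaned_part = cleaned_part[1:] + " " + part_currency
    else:
        cleaned_part = cleaned_part.replace(part_currency, f" {part_currency} ").strip()
print(cleaned_part)  # -> 10000 $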