Spaces:
Running
Running
Yurii Paniv
committed on
Commit
·
cf5d96f
1
Parent(s):
7b54849
Format code
Browse files
- tests/test_formatter.py +25 -21
- ukrainian_tts/formatter.py +15 -12
tests/test_formatter.py
CHANGED
@@ -1,26 +1,30 @@
|
|
1 |
from ukrainian_tts.formatter import preprocess_text
|
2 |
import pytest
|
3 |
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
"
|
13 |
-
"
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
|
|
|
|
|
|
|
|
25 |
def test_formatter(text, expected):
|
26 |
assert preprocess_text(text) == expected
|
|
|
1 |
from ukrainian_tts.formatter import preprocess_text
|
2 |
import pytest
|
3 |
|
4 |
+
|
5 |
+
@pytest.mark.parametrize(
|
6 |
+
"text,expected",
|
7 |
+
[
|
8 |
+
("Quality of life update", "кваліті оф ліфе юпдате"),
|
9 |
+
("Він украв 20000000 $", "він украв двадцять мільйонів доларів"),
|
10 |
+
("Він украв 20000000", "він украв двадцять мільйонів"),
|
11 |
+
("Він украв 1 $", "він украв один долар"),
|
12 |
+
("Він украв 2 $", "він украв два долари"),
|
13 |
+
("Він украв 2 ₴", "він украв дві гривні"),
|
14 |
+
(
|
15 |
+
"111 000 000 000 доларів державного боргу.",
|
16 |
+
"сто одинадцять мільярдів доларів державного боргу.",
|
17 |
+
),
|
18 |
+
(
|
19 |
+
"11100000001 доларів державного боргу.",
|
20 |
+
"одинадцять мільярдів сто мільйонів один доларів державного боргу.",
|
21 |
+
),
|
22 |
+
# This case is wrong: the output should be "це дев'ятнадцяти-річне вино."
|
23 |
+
# Implementing this requires proper parsing of words into a token stream,
|
24 |
+
# which requires reworking the current approach.
|
25 |
+
("це 19-річне вино.", "це дев'ятнадцять-річне вино."),
|
26 |
+
("10-30-40-50-5-9-5", "десять-тридцять-сорок-п'ятдесят-п'ять-дев'ять-п'ять"),
|
27 |
+
],
|
28 |
+
)
|
29 |
def test_formatter(text, expected):
|
30 |
assert preprocess_text(text) == expected
|
ukrainian_tts/formatter.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
from num2words import num2words
|
2 |
import re
|
3 |
|
|
|
4 |
def number_form(number):
|
5 |
if number[-1] == "1":
|
6 |
return 0
|
@@ -9,27 +10,29 @@ def number_form(number):
|
|
9 |
else:
|
10 |
return 2
|
11 |
|
|
|
12 |
CURRENCY = {
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
}
|
17 |
|
|
|
18 |
def preprocess_text(text):
|
19 |
text = text.lower()
|
20 |
# currencies
|
21 |
if "$" in text:
|
22 |
currency = "USD"
|
23 |
-
gender =
|
24 |
elif "₴" in text:
|
25 |
currency = "UAH"
|
26 |
-
gender =
|
27 |
elif "€" in text:
|
28 |
currency = "EUR"
|
29 |
-
gender =
|
30 |
else:
|
31 |
currency = ""
|
32 |
-
gender =
|
33 |
|
34 |
num_form = 0
|
35 |
# replace apostrophe
|
@@ -73,13 +76,13 @@ def preprocess_text(text):
|
|
73 |
|
74 |
# print([detect_num_and_convert(word) for word in text.split(" ")])
|
75 |
text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
|
76 |
-
if
|
77 |
text = text.replace("$", CURRENCY[currency][num_form])
|
78 |
-
|
79 |
-
if
|
80 |
text = text.replace("₴", CURRENCY[currency][num_form])
|
81 |
-
|
82 |
-
if
|
83 |
text = text.replace("€", CURRENCY[currency][num_form])
|
84 |
|
85 |
# fallback numbers
|
|
|
1 |
from num2words import num2words
|
2 |
import re
|
3 |
|
4 |
+
|
5 |
def number_form(number):
|
6 |
if number[-1] == "1":
|
7 |
return 0
|
|
|
10 |
else:
|
11 |
return 2
|
12 |
|
13 |
+
|
14 |
CURRENCY = {
|
15 |
+
"USD": ("долар", "долари", "доларів"),
|
16 |
+
"UAH": ("гривня", "гривні", "гривень"),
|
17 |
+
"EUR": ("євро", "євро", "євро"),
|
18 |
}
|
19 |
|
20 |
+
|
21 |
def preprocess_text(text):
|
22 |
text = text.lower()
|
23 |
# currencies
|
24 |
if "$" in text:
|
25 |
currency = "USD"
|
26 |
+
gender = "masculine"
|
27 |
elif "₴" in text:
|
28 |
currency = "UAH"
|
29 |
+
gender = "feminine"
|
30 |
elif "€" in text:
|
31 |
currency = "EUR"
|
32 |
+
gender = "masculine"
|
33 |
else:
|
34 |
currency = ""
|
35 |
+
gender = "masculine"
|
36 |
|
37 |
num_form = 0
|
38 |
# replace apostrophe
|
|
|
76 |
|
77 |
# print([detect_num_and_convert(word) for word in text.split(" ")])
|
78 |
text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
|
79 |
+
if currency == "USD":
|
80 |
text = text.replace("$", CURRENCY[currency][num_form])
|
81 |
+
|
82 |
+
if currency == "UAH":
|
83 |
text = text.replace("₴", CURRENCY[currency][num_form])
|
84 |
+
|
85 |
+
if currency == "EUR":
|
86 |
text = text.replace("€", CURRENCY[currency][num_form])
|
87 |
|
88 |
# fallback numbers
|