Spaces:
Running
Running
Yurii Paniv
committed on
Commit
·
d663a5a
1
Parent(s):
953871c
Fix formatter
Browse files- tests/test_formatter.py +12 -3
- ukrainian_tts/formatter.py +28 -10
tests/test_formatter.py
CHANGED
@@ -19,9 +19,18 @@ import pytest
|
|
19 |
"11100000001 доларів державного боргу.",
|
20 |
"одинадцять мільярдів сто мільйонів один доларів державного боргу.",
|
21 |
),
|
22 |
-
(
|
23 |
-
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
# this is wrong case, should be "це дев'ятнадцятирічне вино."
|
26 |
# Implementing this requires proper parsing of words into the token stream,
|
27 |
# which requires reworking the current approach.
|
|
|
19 |
"11100000001 доларів державного боргу.",
|
20 |
"одинадцять мільярдів сто мільйонів один доларів державного боргу.",
|
21 |
),
|
22 |
+
(
|
23 |
+
"10000$, 15000 корупціонерів",
|
24 |
+
"десять тисяч доларів , п'ятнадцять тисяч корупціонерів",
|
25 |
+
), # TODO: fix space before comma
|
26 |
+
(
|
27 |
+
"$10000, 15000 корупціонерів",
|
28 |
+
"доларів десять тисяч, п'ятнадцять тисяч корупціонерів",
|
29 |
+
), # fix order
|
30 |
+
(
|
31 |
+
"10000$ у еквіваленті борщових заправок",
|
32 |
+
"десять тисяч доларів у еквіваленті борщових заправок",
|
33 |
+
),
|
34 |
# this is wrong case, should be "це дев'ятнадцятирічне вино."
|
35 |
# Implementing this requires proper parsing of words into the token stream,
|
36 |
# which requires reworking the current approach.
|
ukrainian_tts/formatter.py
CHANGED
@@ -29,6 +29,7 @@ def replace_currency_with_words(text, currency, num_form):
|
|
29 |
text = text.replace("€", CURRENCY[currency][num_form])
|
30 |
return text
|
31 |
|
|
|
32 |
def preprocess_text(text):
|
33 |
text = text.lower()
|
34 |
# currencies
|
@@ -75,23 +76,40 @@ def preprocess_text(text):
|
|
75 |
nonlocal num_form
|
76 |
parts = word.split("-") # for handling complex words
|
77 |
for part in parts:
|
78 |
-
is_number = all(map(lambda x: x in numbers, part)) or (
|
79 |
-
|
|
|
|
|
|
|
|
|
|
|
80 |
if is_number or is_currency:
|
81 |
try:
|
82 |
if is_currency:
|
83 |
cleaned_part = part
|
84 |
-
|
85 |
-
for part_currency in currencies:
|
86 |
-
cleaned_part = cleaned_part.replace(part_currency, f" {part_currency} ").strip() # TODO: replace with regex
|
87 |
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
ends_with_comma = part.endswith(",")
|
92 |
if ends_with_comma or ends_with_dot:
|
93 |
part = part[:-1]
|
94 |
-
part = " ".join(
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
num_form = number_form(part)
|
97 |
result.append(num2words(part.strip(), lang="uk", gender=gender))
|
@@ -122,7 +140,7 @@ def preprocess_text(text):
|
|
122 |
"qu": "кв",
|
123 |
"ch": "ч",
|
124 |
"sh": "ш",
|
125 |
-
"шч": "щ",
|
126 |
"ph": "ф",
|
127 |
"kh": "х",
|
128 |
"yo": "йо",
|
|
|
29 |
text = text.replace("€", CURRENCY[currency][num_form])
|
30 |
return text
|
31 |
|
32 |
+
|
33 |
def preprocess_text(text):
|
34 |
text = text.lower()
|
35 |
# currencies
|
|
|
76 |
nonlocal num_form
|
77 |
parts = word.split("-") # for handling complex words
|
78 |
for part in parts:
|
79 |
+
is_number = all(map(lambda x: x in numbers, part)) or (
|
80 |
+
any(map(lambda x: x in numbers, part))
|
81 |
+
and any(map(lambda x: x in splits, part))
|
82 |
+
)
|
83 |
+
is_currency = any(map(lambda x: x in currencies, part)) and any(
|
84 |
+
map(lambda x: x in numbers, part)
|
85 |
+
) # contains both number and currency symbol
|
86 |
if is_number or is_currency:
|
87 |
try:
|
88 |
if is_currency:
|
89 |
cleaned_part = part
|
|
|
|
|
|
|
90 |
|
91 |
+
for part_currency in currencies:
|
92 |
+
cleaned_part = cleaned_part.replace(
|
93 |
+
part_currency, f" {part_currency} "
|
94 |
+
).strip() # TODO: replace with regex
|
95 |
+
|
96 |
+
part = " ".join(
|
97 |
+
[
|
98 |
+
detect_num_and_convert(part_word)
|
99 |
+
for part_word in cleaned_part.split(" ")
|
100 |
+
]
|
101 |
+
)
|
102 |
+
|
103 |
+
ends_with_dot = part.endswith(".") # ugly
|
104 |
ends_with_comma = part.endswith(",")
|
105 |
if ends_with_comma or ends_with_dot:
|
106 |
part = part[:-1]
|
107 |
+
part = " ".join(
|
108 |
+
[
|
109 |
+
detect_num_and_convert(part_word)
|
110 |
+
for part_word in part.split(" ")
|
111 |
+
]
|
112 |
+
) + ("." if ends_with_dot else ",")
|
113 |
|
114 |
num_form = number_form(part)
|
115 |
result.append(num2words(part.strip(), lang="uk", gender=gender))
|
|
|
140 |
"qu": "кв",
|
141 |
"ch": "ч",
|
142 |
"sh": "ш",
|
143 |
+
"шч": "щ", # after previous cases
|
144 |
"ph": "ф",
|
145 |
"kh": "х",
|
146 |
"yo": "йо",
|