# Spaces: Running
import re

from num2words import num2words
def number_form(number):
    """Return the index of the Ukrainian plural form that follows a number.

    The index selects an entry from a ``(singular, paucal, plural)`` tuple
    such as the values of ``CURRENCY``:

    * ``0`` - the number ends in 1 (but not in 11): "21 долар"
    * ``1`` - the number ends in 2, 3 or 4 (but not in 12-14): "22 долари"
    * ``2`` - everything else, including 11-14: "11 доларів", "5 доларів"

    Args:
        number: The number written as a string; only its last two
            characters are inspected.

    Returns:
        ``0``, ``1`` or ``2``.

    Raises:
        IndexError: If ``number`` is an empty string.
    """
    last_digit = number[-1]  # raises IndexError on "", exactly as before
    tens_digit = number[-2:-1]  # "" for one-character input

    # 11-14 (and 111, 212, ...) take the plural form even though they end in
    # 1-4; checking the tens digit first covers all of 10-19 at once.
    if tens_digit == "1":
        return 2
    if last_digit == "1":
        return 0
    if last_digit in ("2", "3", "4"):
        return 1
    return 2
# Currency code -> noun forms, indexed by the value returned by number_form():
# (after 1, after 2-4, after everything else).
CURRENCY = {
    "USD": ("долар", "долари", "доларів"),
    "UAH": ("гривня", "гривні", "гривень"),
    "EUR": ("євро", "євро", "євро"),
}

# Currency code -> the symbol that stands for it in the text.
_CURRENCY_SYMBOLS = {
    "USD": "$",
    "UAH": "₴",
    "EUR": "€",
}


def replace_currency_with_words(text, currency, num_form):
    """Replace a currency symbol in ``text`` with its spelled-out noun.

    Args:
        text: The text to process.
        currency: Currency code, one of the keys of ``CURRENCY``. Any other
            value (for example ``""``) leaves ``text`` untouched.
        num_form: Index of the noun form to use (0, 1 or 2).

    Returns:
        ``text`` with every occurrence of the symbol belonging to
        ``currency`` replaced; symbols of other currencies are left alone.

    Raises:
        IndexError: If ``currency`` is known and ``num_form`` is not a valid
            index into its forms tuple.
    """
    symbol = _CURRENCY_SYMBOLS.get(currency)
    if symbol is None:
        return text
    return text.replace(symbol, CURRENCY[currency][num_form])
def find_any_char(text: str, find: str, start: int) -> int:
    """Return the lowest index at or after ``start`` of any character of ``find``.

    Args:
        text: The string to search in.
        find: The characters to look for; each one is searched separately.
        start: Index at which the search begins.

    Returns:
        The smallest index where one of the characters occurs, or ``-1`` when
        none of them occurs (including when ``find`` is empty).
    """
    positions = (text.find(char, start) for char in find)
    # str.find reports a miss as -1; drop those before taking the minimum.
    return min((pos for pos in positions if pos >= 0), default=-1)
# A token separator is a single space or a single comma.
_TOKEN_SEPARATOR = re.compile(r"[ ,]")


# TODO: check whether https://github.com/lang-uk/tokenize-uk can be used instead.
def simple_tokenizer(text: str):
    """Lazily split ``text`` on spaces and commas, keeping the separators.

    Words and separators are yielded alternately, always starting and ending
    with a word; a word is the empty string when two separators are adjacent
    or when the text starts or ends with a separator. Joining the yielded
    tokens with ``""`` therefore reproduces ``text`` exactly.

    Args:
        text: The string to tokenize.

    Yields:
        The words and the single-character separators, in order.
    """
    start = 0
    for separator in _TOKEN_SEPARATOR.finditer(text):
        yield text[start:separator.start()]
        yield separator.group()
        start = separator.end()
    # Whatever follows the last separator (possibly "") is the final word.
    yield text[start:]
# Currency symbols in detection priority order:
# (symbol, currency code, grammatical gender passed to num2words).
_CURRENCY_BY_SYMBOL = (
    ("$", "USD", "masculine"),
    ("₴", "UAH", "feminine"),
    ("€", "EUR", "masculine"),
)

# Typographic variants folded to plain ASCII punctuation.
_PUNCTUATION_TABLE = str.maketrans(
    {
        # apostrophes
        "`": "'",
        "ʼ": "'",
        "…": "...",
        # quotes
        "”": '"',
        "“": '"',
        # NOTE(review): U+2019 is presumably also typed as an apostrophe in
        # Ukrainian text (п’ять); mapping it to a double quote is kept from
        # the original - confirm whether "'" was intended.
        "’": '"',
        "‘": '"',
        "«": '"',
        "»": '"',
        # dashes
        "–": "-",
        "—": "-",
        "―": "-",
    }
)

# Digit-by-digit spelling for digits that survived number conversion.
_DIGIT_FALLBACK_TABLE = str.maketrans(
    {
        "1": "один ",
        "2": "два ",
        "3": "три ",
        "4": "чотири ",
        "5": "п'ять ",
        "6": "шість ",
        "7": "сім ",
        "8": "вісім ",
        "9": "дев'ять ",
        "0": "нуль ",
    }
)

# Brute-force Latin -> Cyrillic transliteration. Order matters: digraphs must
# be replaced before the single letters they consist of.
_ENGLISH_TO_UKRAINIAN = {
    "qu": "кв",
    "ch": "ч",
    "sh": "ш",
    "шч": "щ",  # after previous cases
    "ph": "ф",
    "kh": "х",
    "yo": "йо",
    "yu": "ю",
    "ya": "я",
    "ye": "є",
    "yi": "ї",
    "zh": "ж",
    "ts": "ц",
    "th": "т",
    "a": "а",
    "b": "б",
    "c": "ц",
    "d": "д",
    "e": "е",
    "f": "ф",
    "g": "ґ",
    "h": "г",
    "i": "і",
    "j": "дж",
    "k": "к",
    "l": "л",
    "m": "м",
    "n": "н",
    "o": "о",
    "p": "п",
    "q": "кв",
    "r": "р",
    "s": "с",
    "t": "т",
    "u": "ю",
    "v": "в",
    "w": "в",
    "x": "кс",
    "y": "і",
    "z": "з",
}


def preprocess_text(text):
    """Normalize text so that it contains only spelled-out Ukrainian words.

    Steps, in order:

    1. Lower-case the text.
    2. Detect the currency from the first of ``$``, ``₴``, ``€`` present;
       it decides the grammatical gender passed to ``num2words``.
    3. Fold typographic apostrophes, quotes, dashes and the ellipsis to ASCII.
    4. Join digit groups separated by whitespace ("10 000" -> "10000").
    5. Tokenize with ``simple_tokenizer`` and spell out numbers with
       ``num2words``; tokens that cannot be converted are kept as they are.
    6. Replace the currency symbol with the noun form chosen by
       ``number_form`` for the last number that was converted.
    7. Spell any remaining digits one by one.
    8. Transliterate Latin letters to Cyrillic.

    Args:
        text: The raw input string.

    Returns:
        The normalized string.
    """
    text = text.lower()

    # currencies
    currency, gender = "", "masculine"
    for symbol, code, symbol_gender in _CURRENCY_BY_SYMBOL:
        if symbol in text:
            currency, gender = code, symbol_gender
            break
    # One plural form is tracked for the whole text: the last successfully
    # converted number decides the form of every currency noun.
    num_form = 0

    text = text.translate(_PUNCTUATION_TABLE)

    # numbers
    text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)

    numbers = "0123456789"
    splits = ",."
    currencies = "$₴€"

    def detect_num_and_convert(word):
        """Spell out the numeric pieces of one token; leave the rest as is."""
        nonlocal num_form
        result = []
        for part in word.split("-"):  # for handling complex words
            has_digit = any(char in numbers for char in part)
            is_number = all(char in numbers for char in part) or (
                has_digit and any(char in splits for char in part)
            )
            # contains both number and currency symbol
            is_currency = has_digit and any(char in currencies for char in part)

            # "" passes the all-digits test vacuously; keep it untouched
            # instead of relying on an exception further down.
            if not part or not (is_number or is_currency):
                result.append(part)
                continue

            try:
                if is_currency:
                    # Move the symbol behind the number and isolate it with
                    # spaces: "$100" and "100$" both become "100 $".
                    cleaned_part = part
                    for part_currency in currencies:
                        if cleaned_part[0] == part_currency:
                            cleaned_part = cleaned_part[1:] + " " + part_currency
                        else:
                            cleaned_part = cleaned_part.replace(
                                part_currency, f" {part_currency} "
                            ).strip()  # TODO: replace with regex
                    part = " ".join(
                        detect_num_and_convert(part_word)
                        for part_word in cleaned_part.split(" ")
                    )

                ends_with_dot = part.endswith(".")  # ugly
                ends_with_comma = part.endswith(",")
                if ends_with_comma or ends_with_dot:
                    part = " ".join(
                        detect_num_and_convert(part_word)
                        for part_word in part[:-1].split(" ")
                    ) + ("." if ends_with_dot else ",")

                # For parts rewritten above (already words plus a symbol or
                # punctuation) num2words raises and the except branch keeps
                # them as they are.
                words = num2words(part.strip(), lang="uk", gender=gender)
                # Update the plural form only after a real number was
                # converted; previously it was overwritten with the form of
                # text like "один $", which always selected the plural noun.
                num_form = number_form(part)
                result.append(words)
            except Exception:
                # Best effort: anything that cannot be converted stays as is.
                result.append(part)
        return "-".join(result)

    text = "".join(detect_num_and_convert(word) for word in simple_tokenizer(text))
    text = replace_currency_with_words(text, currency, num_form)

    # fallback numbers
    text = text.translate(_DIGIT_FALLBACK_TABLE)

    # Speak the English alphabet using brute-force transliteration. The text
    # is already lower-cased, so no separate uppercase pass is needed.
    for english_char, english_value in _ENGLISH_TO_UKRAINIAN.items():
        text = text.replace(english_char, english_value)
    return text