File size: 5,517 Bytes
15ce174 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
# Adapted from the source code of UDPipe 2
# https://github.com/ufal/udpipe/blob/82a9bd82ae7e947897304177e0390b3f191b01cb/udpipe2_dataset.py
from typing import List, Tuple, Union
class _InvalidLemmaRuleError(ValueError, AssertionError):
    """Raised when the edit part of a lemma rule is structurally malformed.

    Inherits from both ``ValueError`` (the type used by the other format
    checks in ``RuleProcessor.apply_lemma_rule``) and ``AssertionError`` (the
    type these checks raised when they were ``assert`` statements), so
    handlers written against either type keep working.
    """


class RuleProcessor:
    """Generate and apply lemma rules that turn a word form into its lemma.

    A rule is a string ``"<casing>;<edit>"``:

    * ``<casing>`` is a ``¦``-separated list of items such as ``↑0`` or
      ``↓-2``: a case marker (``↑`` upper, ``↓`` lower) followed by the
      offset from which that case applies. Negative offsets count from the
      end of the lemma.
    * ``<edit>`` is either ``a<lemma>`` (use ``<lemma>`` verbatim, ignoring
      the form) or ``d<prefix script>¦<suffix script>``. A script is a
      sequence of ``→`` (copy one form character), ``-`` (skip one form
      character) and ``+c`` (emit the literal character ``c``). The part of
      the form between the prefix and the suffix is copied unchanged.
    """

    def __init__(self, verbose: bool = True):
        """Store the verbosity flag.

        Args:
            verbose: When true, ``apply_lemma_rule`` prints a message each
                time a rule cannot be applied and the form is used instead.
        """
        self.verbose = verbose

    @staticmethod
    def gen_lemma_rule(form: str, lemma: str, allow_copy: bool) -> str:
        """Build the rule that maps ``form`` to ``lemma``.

        Args:
            form: Surface form; it is lowercased before comparison.
            lemma: Target lemma; its casing is encoded separately and the
                edit part is computed on the lowercased lemma.
            allow_copy: Passed through to ``min_edit_script``; enables the
                ``→`` (copy) operation in the prefix/suffix scripts.

        Returns:
            The rule string ``"<casing>;<edit>"``.
        """
        form = form.lower()

        # Casing: emit one item every time the case changes along the lemma.
        half = len(lemma) // 2
        previous_case = None
        casing_items = []
        for i, c in enumerate(lemma):
            if not c.islower() and not c.isupper():
                # Caseless characters (digits, punctuation, ...) inherit the
                # case of the preceding character so that they do not break
                # a span in the casing rule; at position 0 they count as
                # lowercase.
                case = "↓" if previous_case is None else previous_case
            else:
                case = "↑" if c.lower() != c else "↓"
            if case != previous_case:
                # Offsets in the second half are stored relative to the end.
                position = i if i <= half else i - len(lemma)
                casing_items.append("{}{}".format(case, position))
            previous_case = case
        lemma_casing = "¦".join(casing_items)

        lemma = lemma.lower()

        # Find the longest common substring of form and lemma; the first
        # longest match in (lemma position, form position) order wins.
        best, best_form, best_lemma = 0, 0, 0
        for lemma_start in range(len(lemma)):
            for form_start in range(len(form)):
                cpl = 0
                while (
                    form_start + cpl < len(form)
                    and lemma_start + cpl < len(lemma)
                    and form[form_start + cpl] == lemma[lemma_start + cpl]
                ):
                    cpl += 1
                if cpl > best:
                    best, best_form, best_lemma = cpl, form_start, lemma_start

        if not best:
            # Nothing in common: store the lemma itself ("absolute" rule).
            return lemma_casing + ";a" + lemma

        # Otherwise describe how to rewrite what precedes and what follows
        # the common substring.
        return lemma_casing + ";d{}¦{}".format(
            min_edit_script(form[:best_form], lemma[:best_lemma], allow_copy),
            min_edit_script(
                form[best_form + best :], lemma[best_lemma + best :], allow_copy
            ),
        )

    def apply_lemma_rule(self, form: str, lemma_rule: str) -> str:
        """Apply ``lemma_rule`` to ``form`` and return the resulting lemma.

        If the edit scripts run past the available characters, the lowercased
        form is used as the lemma (and a message is printed when
        ``self.verbose`` is true). Casing items are applied afterwards.

        Args:
            form: Surface form to lemmatize.
            lemma_rule: Rule string in the format described on the class.

        Returns:
            The lemma produced by the rule.

        Raises:
            ValueError: If ``;`` is missing, if a non-absolute rule has no
                ``¦``, if it does not contain exactly one ``¦`` separator, or
                if a script contains an unknown operation. The last two
                cases raise a subclass that is also an ``AssertionError``.
        """
        if ";" not in lemma_rule:
            raise ValueError("Invalid rule format: ';' not in rule")
        casing, rule = lemma_rule.split(";", 1)

        if rule.startswith("a"):
            lemma = rule[1:]
        else:
            if "¦" not in rule:
                raise ValueError("Invalid rule format: '¦' not in rule")
            form = form.lower()
            rules = rule[1:].split("¦")
            # Explicit raises instead of `assert`: assertions are removed
            # under `python -O`, which would let malformed rules through.
            if len(rules) != 2:
                raise _InvalidLemmaRuleError(
                    "Invalid rule format: expected exactly one '¦' separator"
                )

            # Validate both scripts and count how many form characters each
            # of them consumes.
            rule_sources = []
            for script in rules:
                source, i = 0, 0
                while i < len(script):
                    if script[i] == "→" or script[i] == "-":
                        source += 1
                    elif script[i] == "+":
                        i += 1  # skip the literal character being inserted
                    else:
                        raise _InvalidLemmaRuleError(
                            "Invalid rule format: unexpected edit operation "
                            "{!r}".format(script[i])
                        )
                    i += 1
                rule_sources.append(source)

            try:
                lemma = ""
                for part, script in enumerate(rules):
                    # The prefix script reads from the start of the form, the
                    # suffix script from the last `rule_sources[1]` characters.
                    # NOTE(review): when the suffix script consumes more
                    # characters than the form has, this offset is negative
                    # and indexes from the end instead of raising.
                    offset = 0 if part == 0 else len(form) - rule_sources[1]
                    j = 0
                    while j < len(script):
                        op = script[j]
                        if op == "→":
                            lemma += form[offset]
                            offset += 1
                        elif op == "-":
                            offset += 1
                        else:
                            # Only "+" can reach here: the validation pass
                            # above rejected every other operation.
                            lemma += script[j + 1]
                            j += 1
                        j += 1
                    if part == 0:
                        # Copy the untouched middle of the form.
                        lemma += form[rule_sources[0] : len(form) - rule_sources[1]]
            except IndexError as e:
                # String indexing is the only operation above that can fail;
                # it means the rule does not fit this form, so fall back.
                if self.verbose:
                    print(
                        f"Caught an error: `{type(e).__name__}` with form: `{form}` and rule: `{lemma_rule}`, message: `{e}`"
                    )
                lemma = form

        for casing_rule in casing.split("¦"):
            # An empty lemma generates an empty casing part, and "↓0" is a
            # no-op because the lemma is lowercase at this point.
            if not casing_rule or casing_rule == "↓0":
                continue
            case, offset = casing_rule[0], int(casing_rule[1:])
            tail = lemma[offset:]
            lemma = lemma[:offset] + (tail.upper() if case == "↑" else tail.lower())
        return lemma
def min_edit_script(source: str, target: str, allow_copy: bool) -> str:
    """Return an edit script that rewrites ``source`` into ``target``.

    The script is a string of operations: ``→`` copies one source character
    (only produced when ``allow_copy`` is true), ``-`` drops one source
    character and ``+c`` emits the character ``c``. Copies cost nothing;
    every deletion or insertion costs one.

    Args:
        source: String to be rewritten.
        target: String to produce.
        allow_copy: Whether matching characters may be copied with ``→``.

    Returns:
        The script stored in the final cell of the dynamic-programming table.
    """
    n_source, n_target = len(source), len(target)
    # Larger than any reachable cost, so every cell gets overwritten.
    unreachable = n_source + n_target + 1

    # table[r][c] = (cost, script) for source[:r] -> target[:c]
    table: List[List[Tuple[int, Union[None, str]]]] = [
        [(unreachable, None) for _ in range(n_target + 1)]
        for _ in range(n_source + 1)
    ]
    table[0][0] = (0, "")

    for row in range(n_source + 1):
        for col in range(n_target + 1):
            if row == 0 and col == 0:
                continue

            cost, script = table[row][col]

            # Copy: free, allowed only when both characters are equal.
            if allow_copy and row > 0 and col > 0 and source[row - 1] == target[col - 1]:
                diag_cost, diag_script = table[row - 1][col - 1]
                if diag_cost < cost:
                    cost, script = diag_cost, diag_script + "→"

            # Delete: taken whenever the cell above is strictly cheaper than
            # the current candidate (compared before adding the step cost).
            if row > 0:
                up_cost, up_script = table[row - 1][col]
                if up_cost < cost:
                    cost, script = up_cost + 1, up_script + "-"

            # Insert: same comparison against the cell to the left.
            if col > 0:
                left_cost, left_script = table[row][col - 1]
                if left_cost < cost:
                    cost, script = left_cost + 1, left_script + "+" + target[col - 1]

            table[row][col] = (cost, script)

    return table[-1][-1][1]
|