Add demo files
- demo.py +103 -0
- requirements.txt +3 -0
- rule_processor.py +150 -0
- utils.py +7 -0
- vabamorf_lemmatizer.py +61 -0
demo.py
ADDED
@@ -0,0 +1,103 @@
+from gliner import GLiNER
+import gradio as gr
+
+from rule_processor import RuleProcessor
+from vabamorf_lemmatizer import Lemmatizer
+from utils import sentence_to_spans
+
+
+examples = [
+    "4. koha tõenäsus on täpselt 0, seda sõltumata lisakoha tulekust või mittetulekust.",
+    "WordPressi puhul tasub see sokutada oma kujundusteema kataloogi ning kui lisada functions.php-sse järgmised kaks rida peakski kõik toimima:",
+]
+
+rule_processor = RuleProcessor()
+model = GLiNER.from_pretrained("tartuNLP/glilem-vabamorf-disambiguator")
+lemmatizer = Lemmatizer(
+    disambiguate=False, use_context=False, proper_name=True, separate_punctuation=True
+)
+
+
+def process_text(text):
+    lemmas, tokens = lemmatizer(text, return_tokens=True)
+    lemmas = [list(set(el)) for el in lemmas]
+    tokens = [el[0] for el in tokens]
+    # serves as input for GLiNER to remain consistent with Vabamorf tokenization
+    processed_text = " ".join(tokens)
+    labels = []
+    # contains the token id for each span
+    span_to_token_id = sentence_to_spans(tokens)
+    # produce a transformation rule for each lemma candidate
+    for token, lemma_list in zip(tokens, lemmas):
+        for lemma in lemma_list:
+            labels.append(
+                rule_processor.gen_lemma_rule(form=token, lemma=lemma, allow_copy=True)
+            )
+    # we only consider unique rules
+    labels = list(set(labels))
+    predicted_entities = model.predict_entities(
+        text=processed_text, labels=labels, flat_ner=True, threshold=0.5
+    )
+
+    predictions = tokens.copy()
+    for entity in predicted_entities:
+        cur_start = entity["start"]
+        cur_end = entity["end"]
+        token = processed_text[cur_start:cur_end]
+        if f"{cur_start}-{cur_end}" in span_to_token_id:
+            token_id = span_to_token_id[f"{cur_start}-{cur_end}"]
+            token = tokens[token_id]
+            # if there are multiple lemma candidates, apply the highest-scoring rule
+            if len(lemmas[token_id]) > 1:
+                result = rule_processor.apply_lemma_rule(token, entity["label"])
+            # otherwise, we trust the Vabamorf lemma
+            else:
+                result = lemmas[token_id][0]
+            predictions[token_id] = result
+    # store labels to highlight changed word forms
+    lemma_labels = []
+    for pred, token in zip(predictions, tokens):
+        lemma_labels.append(pred != token)
+    # expected input format for the HighlightedText component
+    processed_entities = {
+        "text": processed_text,
+        "entities": [
+            {
+                "entity": entity["label"],
+                "word": entity["text"],
+                "start": entity["start"],
+                "end": entity["end"],
+                "score": entity["score"],
+            }
+            for entity in predicted_entities
+        ],
+    }
+    processed_lemmas = [(pred, label) for pred, label in zip(predictions, lemma_labels)]
+
+    return processed_entities, processed_lemmas
+
+
+if __name__ == "__main__":
+    with gr.Blocks() as demo:
+        input_text = gr.Textbox(
+            label="Text input", placeholder="Enter your text in Estonian here"
+        )
+        label_output = gr.HighlightedText(label="Predicted Transformation Rules")
+        lemma_output = gr.HighlightedText(label="Predicted Lemmas")
+        submit_btn = gr.Button("Submit")
+        input_text.submit(
+            fn=process_text, inputs=input_text, outputs=[label_output, lemma_output]
+        )
+        submit_btn.click(
+            fn=process_text, inputs=input_text, outputs=[label_output, lemma_output]
+        )
+        examples = gr.Examples(
+            examples,
+            fn=process_text,
+            inputs=input_text,
+            outputs=[label_output, lemma_output],
+            cache_examples=False,
+        )
+        theme = gr.themes.Base()
+    demo.launch()
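For a quick check without launching the UI, `process_text` can also be imported and called directly; a minimal sketch (the import runs the model and tagger setup above, which may take a while on first run):

from demo import process_text, examples

entities, lemma_pairs = process_text(examples[0])
print(entities["entities"])                       # predicted rule spans with scores
print([lemma for lemma, changed in lemma_pairs])  # one lemma per Vabamorf token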
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+gradio>=5.9.1
+gliner~=0.2.8
+estnltk==1.7.3
rule_processor.py
ADDED
@@ -0,0 +1,150 @@
+# Adapted from the source code of UDPipe 2
+# https://github.com/ufal/udpipe/blob/82a9bd82ae7e947897304177e0390b3f191b01cb/udpipe2_dataset.py
+
+
+from typing import List, Tuple, Union
+
+
+class RuleProcessor:
+
+    def __init__(self, verbose: bool = True):
+        self.verbose = verbose
+
+    @staticmethod
+    def gen_lemma_rule(form: str, lemma: str, allow_copy: bool) -> str:
+        form = form.lower()
+
+        previous_case = -1
+        lemma_casing = ""
+        for i, c in enumerate(lemma):
+            # prevent non-alphabetic characters from breaking spans in casing rules
+            if not c.islower() and not c.isupper():
+                if previous_case == -1:
+                    case = "↓"
+                else:
+                    case = previous_case
+            else:
+                case = "↑" if c.lower() != c else "↓"
+            if case != previous_case:
+                lemma_casing += "{}{}{}".format(
+                    "¦" if lemma_casing else "",
+                    case,
+                    i if i <= len(lemma) // 2 else i - len(lemma),
+                )
+            previous_case = case
+        lemma = lemma.lower()
+
+        best, best_form, best_lemma = 0, 0, 0
+        for l in range(len(lemma)):
+            for f in range(len(form)):
+                cpl = 0
+                while (
+                    f + cpl < len(form)
+                    and l + cpl < len(lemma)
+                    and form[f + cpl] == lemma[l + cpl]
+                ):
+                    cpl += 1
+                if cpl > best:
+                    best = cpl
+                    best_form = f
+                    best_lemma = l
+
+        rule = lemma_casing + ";"
+        if not best:
+            rule += "a" + lemma
+        else:
+            rule += "d{}¦{}".format(
+                min_edit_script(form[:best_form], lemma[:best_lemma], allow_copy),
+                min_edit_script(
+                    form[best_form + best :], lemma[best_lemma + best :], allow_copy
+                ),
+            )
+        return rule
+
+    def apply_lemma_rule(self, form: str, lemma_rule: str) -> str:
+        if ";" not in lemma_rule:
+            raise ValueError("Invalid rule format: ';' not in rule")
+        casing, rule = lemma_rule.split(";", 1)
+        if rule.startswith("a"):
+            lemma = rule[1:]
+        else:
+            if "¦" not in rule:
+                raise ValueError("Invalid rule format: '¦' not in rule")
+            form = form.lower()
+            rules, rule_sources = rule[1:].split("¦"), []
+            assert len(rules) == 2
+            for rule in rules:
+                source, i = 0, 0
+                while i < len(rule):
+                    if rule[i] == "→" or rule[i] == "-":
+                        source += 1
+                    else:
+                        assert rule[i] == "+"
+                        i += 1
+                    i += 1
+                rule_sources.append(source)
+
+            try:
+                lemma, form_offset = "", 0
+                for i in range(2):
+                    j, offset = 0, (0 if i == 0 else len(form) - rule_sources[1])
+                    while j < len(rules[i]):
+                        if rules[i][j] == "→":
+                            lemma += form[offset]
+                            offset += 1
+                        elif rules[i][j] == "-":
+                            offset += 1
+                        else:
+                            assert rules[i][j] == "+"
+                            lemma += rules[i][j + 1]
+                            j += 1
+                        j += 1
+                    if i == 0:
+                        lemma += form[rule_sources[0] : len(form) - rule_sources[1]]
+            except Exception as e:
+                if self.verbose:
+                    print(
+                        f"Caught an error: `{type(e).__name__}` with form: `{form}` and rule: `{lemma_rule}`, message: `{e}`"
+                    )
+                lemma = form
+
+        for rule in casing.split("¦"):
+            # the lemma is lowercased initially
+            if rule == "↓0":
+                continue
+            # an empty lemma might generate an empty casing rule
+            if not rule:
+                continue
+            case, offset = rule[0], int(rule[1:])
+            lemma = lemma[:offset] + (
+                lemma[offset:].upper() if case == "↑" else lemma[offset:].lower()
+            )
+
+        return lemma
+
+
+def min_edit_script(source: str, target: str, allow_copy: bool) -> str:
+    a: List[List[Tuple[int, Union[None, str]]]] = [
+        [(len(source) + len(target) + 1, None)] * (len(target) + 1)
+        for _ in range(len(source) + 1)
+    ]
+
+    for i in range(0, len(source) + 1):
+        for j in range(0, len(target) + 1):
+            if i == 0 and j == 0:
+                a[i][j] = (0, "")
+            else:
+                if (
+                    allow_copy
+                    and i
+                    and j
+                    and source[i - 1] == target[j - 1]
+                    and a[i - 1][j - 1][0] < a[i][j][0]
+                ):
+                    a[i][j] = (a[i - 1][j - 1][0], a[i - 1][j - 1][1] + "→")
+                if i and a[i - 1][j][0] < a[i][j][0]:
+                    a[i][j] = (a[i - 1][j][0] + 1, a[i - 1][j][1] + "-")
+                if j and a[i][j - 1][0] < a[i][j][0]:
+                    a[i][j] = (a[i][j - 1][0] + 1, a[i][j - 1][1] + "+" + target[j - 1])
+    return a[-1][-1][1]
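As a round-trip illustration (the form/lemma pair below is only an example), a rule generated from a (form, lemma) pair should recover the lemma when applied back to the same surface form; these rule strings are what the demo feeds to GLiNER as labels:

rp = RuleProcessor()
rule = rp.gen_lemma_rule(form="koha", lemma="koht", allow_copy=True)
print(rule)                               # encoded casing + edit script, e.g. "↓0;d¦-+t"
print(rp.apply_lemma_rule("koha", rule))  # "koht"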
utils.py
ADDED
@@ -0,0 +1,7 @@
+def sentence_to_spans(tokenized_sentence: list[str]) -> dict[str, int]:
+    # maps "start-end" character spans in " ".join(tokenized_sentence) to token indices
+    span_to_token_id_ = dict()
+    start = 0
+    for index, token_ in enumerate(tokenized_sentence):
+        span_to_token_id_[f"{start + index}-{start + index + len(token_)}"] = index
+        start += len(token_)
+    return span_to_token_id_
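A small illustration of the span keys (hypothetical tokens; the offsets index into the space-joined token string, which is how demo.py uses them):

tokens = ["India", "köök", ":"]
spans = sentence_to_spans(tokens)
print(spans)                   # {'0-5': 0, '6-10': 1, '11-12': 2}
print(" ".join(tokens)[6:10])  # köök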
vabamorf_lemmatizer.py
ADDED
@@ -0,0 +1,61 @@
+from estnltk import Text
+from estnltk.taggers import (
+    VabamorfTagger,
+    WhiteSpaceTokensTagger,
+    PretokenizedTextCompoundTokensTagger,
+    TokensTagger,
+)
+
+
+class Lemmatizer:
+
+    def __init__(
+        self,
+        disambiguate: bool = False,
+        use_context: bool = False,
+        proper_name: bool = True,
+        guess: bool = False,
+        separate_punctuation: bool = False,
+    ):
+        self.disambiguate = disambiguate
+        self.use_context = use_context
+        self.proper_name = proper_name
+        self.guess = guess
+        self.tagger = VabamorfTagger(
+            compound=False,
+            disambiguate=self.disambiguate,
+            guess=self.guess,
+            slang_lex=False,
+            phonetic=False,
+            use_postanalysis=True,
+            use_reorderer=True,
+            propername=self.proper_name,
+            predisambiguate=self.use_context,
+            postdisambiguate=self.use_context,
+        )
+        self.separate_punctuation = separate_punctuation
+        if self.separate_punctuation:
+            self.tokens_tagger = TokensTagger()
+        else:
+            self.tokens_tagger = WhiteSpaceTokensTagger()
+        self.compound_token_tagger = PretokenizedTextCompoundTokensTagger()
+
+    def __call__(self, text: str, return_tokens: bool = False) -> list[list[str]]:
+        text = Text(text)
+        self.tokens_tagger.tag(text)
+        self.compound_token_tagger.tag(text)
+        text.tag_layer(self.tagger.input_layers)
+        self.tagger.tag(text)
+        if return_tokens:
+            return list(text["morph_analysis"].lemma), list(
+                text["morph_analysis"].normalized_text
+            )
+        return list(text["morph_analysis"].lemma)
+
+
+if __name__ == "__main__":
+    sample = "India köök: riisi-dhal, köögivilju ja roti-papad?"
+    lemmatizer = Lemmatizer(
+        proper_name=True, use_context=True, disambiguate=True, separate_punctuation=True
+    )
+    print(lemmatizer(sample))