adorkin committed (verified)
Commit 15ce174 · 1 Parent(s): 9042e03

Add demo files

Files changed (5)
  1. demo.py +103 -0
  2. requirements.txt +3 -0
  3. rule_processor.py +150 -0
  4. utils.py +7 -0
  5. vabamorf_lemmatizer.py +61 -0
demo.py ADDED
@@ -0,0 +1,103 @@
+ from gliner import GLiNER
+ import gradio as gr
+
+ from rule_processor import RuleProcessor
+ from vabamorf_lemmatizer import Lemmatizer
+ from utils import sentence_to_spans
+
+
+ examples = [
+     "4. koha tõenäsus on täpselt 0, seda sõltumata lisakoha tulekust või mittetulekust.",
+     "WordPressi puhul tasub see sokutada oma kujundusteema kataloogi ning kui lisada functions.php-sse järgmised kaks rida peakski kõik toimima:",
+ ]
+
+ rule_processor = RuleProcessor()
+ model = GLiNER.from_pretrained("tartuNLP/glilem-vabamorf-disambiguator")
+ lemmatizer = Lemmatizer(
+     disambiguate=False, use_context=False, proper_name=True, separate_punctuation=True
+ )
+
+
+ def process_text(text):
+     lemmas, tokens = lemmatizer(text, return_tokens=True)
+     lemmas = [list(set(el)) for el in lemmas]
+     tokens = [el[0] for el in tokens]
+     # serves as input for GLiNER to remain consistent with Vabamorf tokenization
+     processed_text = " ".join(tokens)
+     labels = []
+     # contains the token id for each span
+     span_to_token_id = sentence_to_spans(tokens)
+     # produce a transformation rule for each lemma candidate
+     for token, lemma_list in zip(tokens, lemmas):
+         for lemma in lemma_list:
+             labels.append(
+                 rule_processor.gen_lemma_rule(form=token, lemma=lemma, allow_copy=True)
+             )
+     # we only consider unique rules
+     labels = list(set(labels))
+     predicted_entities = model.predict_entities(
+         text=processed_text, labels=labels, flat_ner=True, threshold=0.5
+     )
+
+     predictions = tokens.copy()
+     for entity in predicted_entities:
+         cur_start = entity["start"]
+         cur_end = entity["end"]
+         token = processed_text[cur_start:cur_end]
+         if f"{cur_start}-{cur_end}" in span_to_token_id:
+             token_id = span_to_token_id[f"{cur_start}-{cur_end}"]
+             token = tokens[token_id]
+             # if there are multiple lemma candidates, apply the highest scoring rule
+             if len(lemmas[token_id]) > 1:
+                 result = rule_processor.apply_lemma_rule(token, entity["label"])
+             # otherwise, we trust the Vabamorf lemma
+             else:
+                 result = lemmas[token_id][0]
+             predictions[token_id] = result
+     # store labels to highlight changed word forms
+     lemma_labels = []
+     for pred, token in zip(predictions, tokens):
+         lemma_labels.append(pred != token)
+     # expected input format for HighlightedText component
+     processed_entities = {
+         "text": processed_text,
+         "entities": [
+             {
+                 "entity": entity["label"],
+                 "word": entity["text"],
+                 "start": entity["start"],
+                 "end": entity["end"],
+                 "score": entity["score"],
+             }
+             for entity in predicted_entities
+         ],
+     }
+     processed_lemmas = [(pred, label) for pred, label in zip(predictions, lemma_labels)]
+
+     return processed_entities, processed_lemmas
+
+
+ if __name__ == "__main__":
+
+     with gr.Blocks() as demo:
+         input_text = gr.Textbox(
+             label="Text input", placeholder="Enter your text in Estonian here"
+         )
+         label_output = gr.HighlightedText(label="Predicted Transformation Rules")
+         lemma_output = gr.HighlightedText(label="Predicted Lemmas")
+         submit_btn = gr.Button("Submit")
+         input_text.submit(
+             fn=process_text, inputs=input_text, outputs=[label_output, lemma_output]
+         )
+         submit_btn.click(
+             fn=process_text, inputs=input_text, outputs=[label_output, lemma_output]
+         )
+         examples = gr.Examples(
+             examples,
+             fn=process_text,
+             inputs=input_text,
+             outputs=[label_output, lemma_output],
+             cache_examples=False,
+         )
+         theme = gr.themes.Base()
+     demo.launch()
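
For local testing without the Gradio UI, the pipeline above can also be driven directly from Python. A minimal sketch, assuming the model checkpoint downloads successfully; the input sentence is a hypothetical Estonian example:

from demo import process_text  # module-level code loads the GLiNER model and Vabamorf

entities, lemmas = process_text("Koerad jooksevad pargis.")
# `lemmas` pairs each predicted lemma with a flag marking whether it differs from the surface token
print([lemma for lemma, changed in lemmas])
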
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ gradio>=5.9.1
+ gliner~=0.2.8
+ estnltk==1.7.3
rule_processor.py ADDED
@@ -0,0 +1,150 @@
+ # Adapted from the source code of UDPipe 2
+ # https://github.com/ufal/udpipe/blob/82a9bd82ae7e947897304177e0390b3f191b01cb/udpipe2_dataset.py
+
+
+ from typing import List, Tuple, Union
+
+
+ class RuleProcessor:
+
+     def __init__(self, verbose: bool = True):
+         self.verbose = verbose
+
+     @staticmethod
+     def gen_lemma_rule(form: str, lemma: str, allow_copy: bool) -> str:
+         form = form.lower()
+
+         # change back to original maybe
+         previous_case = -1
+         lemma_casing = ""
+         for i, c in enumerate(lemma):
+             # prevent non-alphabetic characters from breaking spans in casing rules
+             if not c.islower() and not c.isupper():  # wrong condition?
+                 if previous_case == -1:
+                     case = "↓"
+                 else:
+                     case = previous_case
+             else:
+                 case = "↑" if c.lower() != c else "↓"
+             if case != previous_case:
+                 lemma_casing += "{}{}{}".format(
+                     "¦" if lemma_casing else "",
+                     case,
+                     i if i <= len(lemma) // 2 else i - len(lemma),
+                 )
+             previous_case = case
+         lemma = lemma.lower()
+
+         best, best_form, best_lemma = 0, 0, 0
+         for l in range(len(lemma)):
+             for f in range(len(form)):
+                 cpl = 0
+                 while (
+                     f + cpl < len(form)
+                     and l + cpl < len(lemma)
+                     and form[f + cpl] == lemma[l + cpl]
+                 ):
+                     cpl += 1
+                 if cpl > best:
+                     best = cpl
+                     best_form = f
+                     best_lemma = l
+
+         rule = lemma_casing + ";"
+         if not best:
+             rule += "a" + lemma
+         else:
+             rule += "d{}¦{}".format(
+                 min_edit_script(form[:best_form], lemma[:best_lemma], allow_copy),
+                 min_edit_script(
+                     form[best_form + best :], lemma[best_lemma + best :], allow_copy
+                 ),
+             )
+         return rule
+
+     def apply_lemma_rule(self, form: str, lemma_rule: str) -> str:
+         if ";" not in lemma_rule:
+             raise ValueError("Invalid rule format: ';' not in rule")
+         casing, rule = lemma_rule.split(";", 1)
+         if rule.startswith("a"):
+             lemma = rule[1:]
+         else:
+             if "¦" not in rule:
+                 raise ValueError("Invalid rule format: '¦' not in rule")
+             form = form.lower()
+             rules, rule_sources = rule[1:].split("¦"), []
+             assert len(rules) == 2
+             for rule in rules:
+                 source, i = 0, 0
+                 while i < len(rule):
+                     if rule[i] == "→" or rule[i] == "-":
+                         source += 1
+                     else:
+                         assert rule[i] == "+"
+                         i += 1
+                     i += 1
+                 rule_sources.append(source)
+
+             try:
+                 lemma, form_offset = "", 0
+                 for i in range(2):
+                     j, offset = 0, (0 if i == 0 else len(form) - rule_sources[1])
+                     while j < len(rules[i]):
+                         if rules[i][j] == "→":
+                             lemma += form[offset]
+                             offset += 1
+                         elif rules[i][j] == "-":
+                             offset += 1
+                         else:
+                             assert rules[i][j] == "+"
+                             lemma += rules[i][j + 1]
+                             j += 1
+                         j += 1
+                     if i == 0:
+                         lemma += form[rule_sources[0] : len(form) - rule_sources[1]]
+             except Exception as e:
+                 if self.verbose:
+                     print(
+                         f"Caught an error: `{type(e).__name__}` with form: `{form}` and rule: `{lemma_rule}`, message: `{e}`"
+                     )
+                 lemma = form
+
+         for rule in casing.split("¦"):
+             # The lemma is lowercased initially
+             if rule == "↓0":
+                 continue
+             # Empty lemma might generate empty casing rule
+             if not rule:
+                 continue
+             case, offset = rule[0], int(rule[1:])
+             lemma = lemma[:offset] + (
+                 lemma[offset:].upper() if case == "↑" else lemma[offset:].lower()
+             )
+
+         return lemma
+
+
+ def min_edit_script(source: str, target: str, allow_copy: bool) -> str:
+     a: List[List[Tuple[int, Union[None, str]]]] = [
+         [(len(source) + len(target) + 1, None)] * (len(target) + 1)
+         for _ in range(len(source) + 1)
+     ]
+
+     for i in range(0, len(source) + 1):
+         for j in range(0, len(target) + 1):
+             if i == 0 and j == 0:
+                 a[i][j] = (0, "")
+             else:
+                 if (
+                     allow_copy
+                     and i
+                     and j
+                     and source[i - 1] == target[j - 1]
+                     and a[i - 1][j - 1][0] < a[i][j][0]
+                 ):
+                     a[i][j] = (a[i - 1][j - 1][0], a[i - 1][j - 1][1] + "→")
+                 if i and a[i - 1][j][0] < a[i][j][0]:
+                     a[i][j] = (a[i - 1][j][0] + 1, a[i - 1][j][1] + "-")
+                 if j and a[i][j - 1][0] < a[i][j][0]:
+                     a[i][j] = (a[i][j - 1][0] + 1, a[i][j - 1][1] + "+" + target[j - 1])
+     return a[-1][-1][1]
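
As a quick illustration of the rule format, here is a minimal round-trip sketch with a hypothetical Estonian form/lemma pair; the rule string shown is what this encoding produces (keep the shared stem, delete the three-character suffix):

from rule_processor import RuleProcessor

rp = RuleProcessor()
rule = rp.gen_lemma_rule(form="koerale", lemma="koer", allow_copy=True)
print(rule)                                  # ↓0;d¦---
print(rp.apply_lemma_rule("koerale", rule))  # koer
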
utils.py ADDED
@@ -0,0 +1,7 @@
+ def sentence_to_spans(tokenized_sentence: list[str]) -> dict[str, int]:
+     span_to_token_id_ = dict()
+     start = 0
+     for index, token_ in enumerate(tokenized_sentence):
+         span_to_token_id_[f"{start + index}-{start + index + len(token_)}"] = index
+         start += len(token_)
+     return span_to_token_id_
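
A small sketch of the mapping this helper produces: the keys are character spans in the space-joined token string that demo.py passes to GLiNER, and the values are token indices (toy tokens chosen for illustration):

from utils import sentence_to_spans

tokens = ["Tere", ",", "maailm"]
print(" ".join(tokens))           # Tere , maailm
print(sentence_to_spans(tokens))  # {'0-4': 0, '5-6': 1, '7-13': 2}
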
vabamorf_lemmatizer.py ADDED
@@ -0,0 +1,61 @@
+ from estnltk import Text
+ from estnltk.taggers import (
+     VabamorfTagger,
+     WhiteSpaceTokensTagger,
+     PretokenizedTextCompoundTokensTagger,
+     TokensTagger,
+ )
+
+
+ class Lemmatizer:
+
+     def __init__(
+         self,
+         disambiguate: bool = False,
+         use_context: bool = False,
+         proper_name: bool = True,
+         guess: bool = False,
+         separate_punctuation: bool = False,
+     ):
+         self.disambiguate = disambiguate
+         self.use_context = use_context
+         self.proper_name = proper_name
+         self.guess = guess
+         self.tagger = VabamorfTagger(
+             compound=False,
+             disambiguate=self.disambiguate,
+             guess=self.guess,
+             slang_lex=False,
+             phonetic=False,
+             use_postanalysis=True,
+             use_reorderer=True,
+             propername=self.proper_name,
+             predisambiguate=self.use_context,
+             postdisambiguate=self.use_context,
+         )
+         self.separate_punctuation = separate_punctuation
+         if self.separate_punctuation:
+             self.tokens_tagger = TokensTagger()
+         else:
+             self.tokens_tagger = WhiteSpaceTokensTagger()
+         self.compound_token_tagger = PretokenizedTextCompoundTokensTagger()
+
+     def __call__(self, text: str, return_tokens: bool = False) -> list[list[str]]:
+         text = Text(text)
+         self.tokens_tagger.tag(text)
+         self.compound_token_tagger.tag(text)
+         text.tag_layer(self.tagger.input_layers)
+         self.tagger.tag(text)
+         if return_tokens:
+             return list(text["morph_analysis"].lemma), list(
+                 text["morph_analysis"].normalized_text
+             )
+         return list(text["morph_analysis"].lemma)
+
+
+ if __name__ == "__main__":
+     sample = "India köök: riisi-dhal, köögivilju ja roti-papad?"
+     lemmatizer = Lemmatizer(
+         proper_name=True, use_context=True, disambiguate=True, separate_punctuation=True
+     )
+     print(lemmatizer(sample))