Add demo files
- demo.py +103 -0
- requirements.txt +3 -0
- rule_processor.py +150 -0
- utils.py +7 -0
- vabamorf_lemmatizer.py +61 -0
demo.py
ADDED
@@ -0,0 +1,103 @@
+from gliner import GLiNER
+import gradio as gr
+
+from rule_processor import RuleProcessor
+from vabamorf_lemmatizer import Lemmatizer
+from utils import sentence_to_spans
+
+
+examples = [
+    "4. koha tõenäsus on täpselt 0, seda sõltumata lisakoha tulekust või mittetulekust.",
+    "WordPressi puhul tasub see sokutada oma kujundusteema kataloogi ning kui lisada functions.php-sse järgmised kaks rida peakski kõik toimima:",
+]
+
+rule_processor = RuleProcessor()
+model = GLiNER.from_pretrained("tartuNLP/glilem-vabamorf-disambiguator")
+lemmatizer = Lemmatizer(
+    disambiguate=False, use_context=False, proper_name=True, separate_punctuation=True
+)
+
+
+def process_text(text):
+    lemmas, tokens = lemmatizer(text, return_tokens=True)
+    lemmas = [list(set(el)) for el in lemmas]
+    tokens = [el[0] for el in tokens]
+    # serves as input for GLiNER to remain consistent with Vabamorf tokenization
+    processed_text = " ".join(tokens)
+    labels = []
+    # contains the token id for each span
+    span_to_token_id = sentence_to_spans(tokens)
+    # produce a transformation rule for each lemma candidate
+    for token, lemma_list in zip(tokens, lemmas):
+        for lemma in lemma_list:
+            labels.append(
+                rule_processor.gen_lemma_rule(form=token, lemma=lemma, allow_copy=True)
+            )
+    # we only consider unique rules
+    labels = list(set(labels))
+    predicted_entities = model.predict_entities(
+        text=processed_text, labels=labels, flat_ner=True, threshold=0.5
+    )
+
+    predictions = tokens.copy()
+    for entity in predicted_entities:
+        cur_start = entity["start"]
+        cur_end = entity["end"]
+        token = processed_text[cur_start:cur_end]
+        if f"{cur_start}-{cur_end}" in span_to_token_id:
+            token_id = span_to_token_id[f"{cur_start}-{cur_end}"]
+            token = tokens[token_id]
+            # if there are multiple lemma candidates, apply the highest-scoring rule
+            if len(lemmas[token_id]) > 1:
+                result = rule_processor.apply_lemma_rule(token, entity["label"])
+            # otherwise, we trust the Vabamorf lemma
+            else:
+                result = lemmas[token_id][0]
+            predictions[token_id] = result
+    # store labels to highlight changed word forms
+    lemma_labels = []
+    for pred, token in zip(predictions, tokens):
+        lemma_labels.append(pred != token)
+    # expected input format for the HighlightedText component
+    processed_entities = {
+        "text": processed_text,
+        "entities": [
+            {
+                "entity": entity["label"],
+                "word": entity["text"],
+                "start": entity["start"],
+                "end": entity["end"],
+                "score": entity["score"],
+            }
+            for entity in predicted_entities
+        ],
+    }
+    processed_lemmas = [(pred, label) for pred, label in zip(predictions, lemma_labels)]
+
+    return processed_entities, processed_lemmas
+
+
+if __name__ == "__main__":
+    with gr.Blocks() as demo:
+        input_text = gr.Textbox(
+            label="Text input", placeholder="Enter your text in Estonian here"
+        )
+        label_output = gr.HighlightedText(label="Predicted Transformation Rules")
+        lemma_output = gr.HighlightedText(label="Predicted Lemmas")
+        submit_btn = gr.Button("Submit")
+        input_text.submit(
+            fn=process_text, inputs=input_text, outputs=[label_output, lemma_output]
+        )
+        submit_btn.click(
+            fn=process_text, inputs=input_text, outputs=[label_output, lemma_output]
+        )
+        examples = gr.Examples(
+            examples,
+            fn=process_text,
+            inputs=input_text,
+            outputs=[label_output, lemma_output],
+            cache_examples=False,
+        )
+        theme = gr.themes.Base()
+    demo.launch()
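For a quick check without launching the UI, `process_text` can also be imported and called directly; a minimal sketch (the import runs the model and tagger setup above, which may take a while on first run):

from demo import process_text, examples

entities, lemma_pairs = process_text(examples[0])
print(entities["entities"])                       # predicted rule spans with scores
print([lemma for lemma, changed in lemma_pairs])  # one lemma per Vabamorf token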
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+gradio>=5.9.1
+gliner~=0.2.8
+estnltk==1.7.3
rule_processor.py
ADDED
@@ -0,0 +1,150 @@
+# Adapted from the source code of UDPipe 2
+# https://github.com/ufal/udpipe/blob/82a9bd82ae7e947897304177e0390b3f191b01cb/udpipe2_dataset.py
+
+
+from typing import List, Tuple, Union
+
+
+class RuleProcessor:
+
+    def __init__(self, verbose: bool = True):
+        self.verbose = verbose
+
+    @staticmethod
+    def gen_lemma_rule(form: str, lemma: str, allow_copy: bool) -> str:
+        form = form.lower()
+
+        previous_case = -1
+        lemma_casing = ""
+        for i, c in enumerate(lemma):
+            # prevent non-alphabetic characters from breaking spans in casing rules
+            if not c.islower() and not c.isupper():
+                if previous_case == -1:
+                    case = "↓"
+                else:
+                    case = previous_case
+            else:
+                case = "↑" if c.lower() != c else "↓"
+            if case != previous_case:
+                lemma_casing += "{}{}{}".format(
+                    "¦" if lemma_casing else "",
+                    case,
+                    i if i <= len(lemma) // 2 else i - len(lemma),
+                )
+            previous_case = case
+        lemma = lemma.lower()
+
+        best, best_form, best_lemma = 0, 0, 0
+        for l in range(len(lemma)):
+            for f in range(len(form)):
+                cpl = 0
+                while (
+                    f + cpl < len(form)
+                    and l + cpl < len(lemma)
+                    and form[f + cpl] == lemma[l + cpl]
+                ):
+                    cpl += 1
+                if cpl > best:
+                    best = cpl
+                    best_form = f
+                    best_lemma = l
+
+        rule = lemma_casing + ";"
+        if not best:
+            rule += "a" + lemma
+        else:
+            rule += "d{}¦{}".format(
+                min_edit_script(form[:best_form], lemma[:best_lemma], allow_copy),
+                min_edit_script(
+                    form[best_form + best :], lemma[best_lemma + best :], allow_copy
+                ),
+            )
+        return rule
+
+    def apply_lemma_rule(self, form: str, lemma_rule: str) -> str:
+        if ";" not in lemma_rule:
+            raise ValueError("Invalid rule format: ';' not in rule")
+        casing, rule = lemma_rule.split(";", 1)
+        if rule.startswith("a"):
+            lemma = rule[1:]
+        else:
+            if "¦" not in rule:
+                raise ValueError("Invalid rule format: '¦' not in rule")
+            form = form.lower()
+            rules, rule_sources = rule[1:].split("¦"), []
+            assert len(rules) == 2
+            for rule in rules:
+                source, i = 0, 0
+                while i < len(rule):
+                    if rule[i] == "→" or rule[i] == "-":
+                        source += 1
+                    else:
+                        assert rule[i] == "+"
+                        i += 1
+                    i += 1
+                rule_sources.append(source)
+
+            try:
+                lemma, form_offset = "", 0
+                for i in range(2):
+                    j, offset = 0, (0 if i == 0 else len(form) - rule_sources[1])
+                    while j < len(rules[i]):
+                        if rules[i][j] == "→":
+                            lemma += form[offset]
+                            offset += 1
+                        elif rules[i][j] == "-":
+                            offset += 1
+                        else:
+                            assert rules[i][j] == "+"
+                            lemma += rules[i][j + 1]
+                            j += 1
+                        j += 1
+                    if i == 0:
+                        lemma += form[rule_sources[0] : len(form) - rule_sources[1]]
+            except Exception as e:
+                if self.verbose:
+                    print(
+                        f"Caught an error: `{type(e).__name__}` with form: `{form}` and rule: `{lemma_rule}`, message: `{e}`"
+                    )
+                lemma = form
+
+        for rule in casing.split("¦"):
+            # the lemma is lowercased initially
+            if rule == "↓0":
+                continue
+            # an empty lemma might generate an empty casing rule
+            if not rule:
+                continue
+            case, offset = rule[0], int(rule[1:])
+            lemma = lemma[:offset] + (
+                lemma[offset:].upper() if case == "↑" else lemma[offset:].lower()
+            )
+
+        return lemma
+
+
+def min_edit_script(source: str, target: str, allow_copy: bool) -> str:
+    a: List[List[Tuple[int, Union[None, str]]]] = [
+        [(len(source) + len(target) + 1, None)] * (len(target) + 1)
+        for _ in range(len(source) + 1)
+    ]
+
+    for i in range(0, len(source) + 1):
+        for j in range(0, len(target) + 1):
+            if i == 0 and j == 0:
+                a[i][j] = (0, "")
+            else:
+                if (
+                    allow_copy
+                    and i
+                    and j
+                    and source[i - 1] == target[j - 1]
+                    and a[i - 1][j - 1][0] < a[i][j][0]
+                ):
+                    a[i][j] = (a[i - 1][j - 1][0], a[i - 1][j - 1][1] + "→")
+                if i and a[i - 1][j][0] < a[i][j][0]:
+                    a[i][j] = (a[i - 1][j][0] + 1, a[i - 1][j][1] + "-")
+                if j and a[i][j - 1][0] < a[i][j][0]:
+                    a[i][j] = (a[i][j - 1][0] + 1, a[i][j - 1][1] + "+" + target[j - 1])
+    return a[-1][-1][1]
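As a round-trip illustration (the form/lemma pair below is only an example), a rule generated from a (form, lemma) pair should recover the lemma when applied back to the same surface form; these rule strings are what the demo feeds to GLiNER as labels:

rp = RuleProcessor()
rule = rp.gen_lemma_rule(form="koha", lemma="koht", allow_copy=True)
print(rule)                               # encoded casing + edit script, e.g. "↓0;d¦-+t"
print(rp.apply_lemma_rule("koha", rule))  # "koht"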
utils.py
ADDED
@@ -0,0 +1,7 @@
+def sentence_to_spans(tokenized_sentence: list[str]) -> dict[str, int]:
+    # maps "start-end" character spans in " ".join(tokenized_sentence) to token indices
+    span_to_token_id_ = dict()
+    start = 0
+    for index, token_ in enumerate(tokenized_sentence):
+        span_to_token_id_[f"{start + index}-{start + index + len(token_)}"] = index
+        start += len(token_)
+    return span_to_token_id_
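A small illustration of the span keys (hypothetical tokens; the offsets index into the space-joined token string, which is how demo.py uses them):

tokens = ["India", "köök", ":"]
spans = sentence_to_spans(tokens)
print(spans)                   # {'0-5': 0, '6-10': 1, '11-12': 2}
print(" ".join(tokens)[6:10])  # köök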
vabamorf_lemmatizer.py
ADDED
@@ -0,0 +1,61 @@
+from estnltk import Text
+from estnltk.taggers import (
+    VabamorfTagger,
+    WhiteSpaceTokensTagger,
+    PretokenizedTextCompoundTokensTagger,
+    TokensTagger,
+)
+
+
+class Lemmatizer:
+
+    def __init__(
+        self,
+        disambiguate: bool = False,
+        use_context: bool = False,
+        proper_name: bool = True,
+        guess: bool = False,
+        separate_punctuation: bool = False,
+    ):
+        self.disambiguate = disambiguate
+        self.use_context = use_context
+        self.proper_name = proper_name
+        self.guess = guess
+        self.tagger = VabamorfTagger(
+            compound=False,
+            disambiguate=self.disambiguate,
+            guess=self.guess,
+            slang_lex=False,
+            phonetic=False,
+            use_postanalysis=True,
+            use_reorderer=True,
+            propername=self.proper_name,
+            predisambiguate=self.use_context,
+            postdisambiguate=self.use_context,
+        )
+        self.separate_punctuation = separate_punctuation
+        if self.separate_punctuation:
+            self.tokens_tagger = TokensTagger()
+        else:
+            self.tokens_tagger = WhiteSpaceTokensTagger()
+        self.compound_token_tagger = PretokenizedTextCompoundTokensTagger()
+
+    def __call__(self, text: str, return_tokens: bool = False) -> list[list[str]]:
+        text = Text(text)
+        self.tokens_tagger.tag(text)
+        self.compound_token_tagger.tag(text)
+        text.tag_layer(self.tagger.input_layers)
+        self.tagger.tag(text)
+        if return_tokens:
+            return list(text["morph_analysis"].lemma), list(
+                text["morph_analysis"].normalized_text
+            )
+        return list(text["morph_analysis"].lemma)
+
+
+if __name__ == "__main__":
+    sample = "India köök: riisi-dhal, köögivilju ja roti-papad?"
+    lemmatizer = Lemmatizer(
+        proper_name=True, use_context=True, disambiguate=True, separate_punctuation=True
+    )
+    print(lemmatizer(sample))