from estnltk import Text
from estnltk.taggers import (
    VabamorfTagger,
    WhiteSpaceTokensTagger,
    PretokenizedTextCompoundTokensTagger,
    TokensTagger,
)

class Lemmatizer:
    def __init__(
        self,
        disambiguate: bool = False,
        use_context: bool = False,
        proper_name: bool = True,
        guess: bool = False,
        separate_punctuation: bool = False,
    ):
        self.disambiguate = disambiguate
        self.use_context = use_context
        self.proper_name = proper_name
        self.guess = guess
        # Vabamorf performs the morphological analysis; context-based
        # pre-/post-disambiguation is only enabled when use_context is set.
        self.tagger = VabamorfTagger(
            compound=False,
            disambiguate=self.disambiguate,
            guess=self.guess,
            slang_lex=False,
            phonetic=False,
            use_postanalysis=True,
            use_reorderer=True,
            propername=self.proper_name,
            predisambiguate=self.use_context,
            postdisambiguate=self.use_context,
        )
        self.separate_punctuation = separate_punctuation
        if self.separate_punctuation:
            # EstNLTK's default tokenizer, which splits off punctuation.
            self.tokens_tagger = TokensTagger()
        else:
            # Keeps the input's whitespace tokenization intact.
            self.tokens_tagger = WhiteSpaceTokensTagger()
        # Provides the compound_tokens layer required by VabamorfTagger
        # without merging any of the existing tokens.
        self.compound_token_tagger = PretokenizedTextCompoundTokensTagger()

    def __call__(
        self, text: str, return_tokens: bool = False
    ) -> list[list[str]] | tuple[list[list[str]], list[list[str]]]:
        text = Text(text)
        self.tokens_tagger.tag(text)
        self.compound_token_tagger.tag(text)
        # Create the words/sentences layers that VabamorfTagger expects.
        text.tag_layer(self.tagger.input_layers)
        self.tagger.tag(text)
        if return_tokens:
            return list(text["morph_analysis"].lemma), list(
                text["morph_analysis"].normalized_text
            )
        return list(text["morph_analysis"].lemma)

if __name__ == "__main__":
    sample = "India köök: riisi-dhal, köögivilju ja roti-papad?"
    lemmatizer = Lemmatizer(
        proper_name=True, use_context=True, disambiguate=True, separate_punctuation=True
    )
    print(lemmatizer(sample))
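    # Not in the original script, but the return_tokens flag defined above
    # also returns the normalized surface tokens alongside the lemma
    # candidates for each word; a quick illustration:
    print(lemmatizer(sample, return_tokens=True))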