from estnltk import Text
from estnltk.taggers import (
    VabamorfTagger,
    WhiteSpaceTokensTagger,
    PretokenizedTextCompoundTokensTagger,
    TokensTagger,
)


class Lemmatizer:
    """Lemmatizes Estonian text with EstNLTK's Vabamorf morphological analyser."""

    def __init__(
        self,
        disambiguate: bool = False,
        use_context: bool = False,
        proper_name: bool = True,
        guess: bool = False,
        separate_punctuation: bool = False,
    ):
        self.disambiguate = disambiguate
        self.use_context = use_context
        self.proper_name = proper_name
        self.guess = guess
        # Vabamorf analysis without compound-word markup or phonetic marks;
        # context-based pre-/post-disambiguation is controlled by `use_context`.
        self.tagger = VabamorfTagger(
            compound=False,
            disambiguate=self.disambiguate,
            guess=self.guess,
            slang_lex=False,
            phonetic=False,
            use_postanalysis=True,
            use_reorderer=True,
            propername=self.proper_name,
            predisambiguate=self.use_context,
            postdisambiguate=self.use_context,
        )
        self.separate_punctuation = separate_punctuation
        if self.separate_punctuation:
            # Default tokenizer: punctuation becomes separate tokens.
            self.tokens_tagger = TokensTagger()
        else:
            # Whitespace tokenizer: keeps whitespace-delimited tokens intact.
            self.tokens_tagger = WhiteSpaceTokensTagger()
        # Compound-tokens layer that preserves the tokenization produced above.
        self.compound_token_tagger = PretokenizedTextCompoundTokensTagger()

    def __call__(self, text: str, return_tokens: bool = False) -> list[list[str]]:
        """Return the lemma candidates for each token of `text`.

        With `return_tokens=True`, a tuple of (lemma candidates, normalized
        token texts) is returned instead.
        """
        text = Text(text)
        # Tokenize with the configured tokens tagger and add the
        # compound_tokens layer so the tokenization is kept as-is.
        self.tokens_tagger.tag(text)
        self.compound_token_tagger.tag(text)
        # Build the remaining input layers VabamorfTagger requires
        # (words, sentences) on top of the token layers, then analyse.
        text.tag_layer(self.tagger.input_layers)
        self.tagger.tag(text)
        if return_tokens:
            return list(text["morph_analysis"].lemma), list(
                text["morph_analysis"].normalized_text
            )
        return list(text["morph_analysis"].lemma)


if __name__ == "__main__":
    sample = "India köök: riisi-dhal, köögivilju ja roti-papad?"
    lemmatizer = Lemmatizer(
        proper_name=True, use_context=True, disambiguate=True, separate_punctuation=True
    )
    print(lemmatizer(sample))
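    # Example sketch: with return_tokens=True the call should also return the
    # tokens' normalized surface texts alongside the lemma candidates.
    lemmas, tokens = lemmatizer(sample, return_tokens=True)
    print(list(zip(tokens, lemmas)))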