from typing import Union

from estnltk import Text
from estnltk.taggers import (
    VabamorfTagger,
    WhiteSpaceTokensTagger,
    PretokenizedTextCompoundTokensTagger,
    TokensTagger,
)


class Lemmatizer:
    """Callable wrapper that runs EstNLTK's Vabamorf analysis and returns lemmas.

    The instance owns three taggers that are applied in order on every call:
    a tokens tagger, a compound-tokens tagger for pretokenized text, and a
    ``VabamorfTagger`` configured from the constructor arguments.
    """

    def __init__(
        self,
        disambiguate: bool = False,
        use_context: bool = False,
        proper_name: bool = True,
        guess: bool = False,
        separate_punctuation: bool = False,
    ):
        """Build the taggers used by :meth:`__call__`.

        Args:
            disambiguate: Forwarded to ``VabamorfTagger(disambiguate=...)``.
            use_context: Forwarded to both ``predisambiguate`` and
                ``postdisambiguate`` of ``VabamorfTagger``.
            proper_name: Forwarded to ``VabamorfTagger(propername=...)``.
            guess: Forwarded to ``VabamorfTagger(guess=...)``.
            separate_punctuation: When true, tokens are produced by
                ``TokensTagger``; otherwise by ``WhiteSpaceTokensTagger``
                (presumably a plain whitespace split that leaves punctuation
                attached to words -- confirm against the EstNLTK docs).
        """
        self.disambiguate = disambiguate
        self.use_context = use_context
        self.proper_name = proper_name
        self.guess = guess
        self.separate_punctuation = separate_punctuation

        # The remaining VabamorfTagger switches are fixed for every instance.
        self.tagger = VabamorfTagger(
            compound=False,
            disambiguate=self.disambiguate,
            guess=self.guess,
            slang_lex=False,
            phonetic=False,
            use_postanalysis=True,
            use_reorderer=True,
            propername=self.proper_name,
            predisambiguate=self.use_context,
            postdisambiguate=self.use_context,
        )

        if self.separate_punctuation:
            self.tokens_tagger = TokensTagger()
        else:
            self.tokens_tagger = WhiteSpaceTokensTagger()
        self.compound_token_tagger = PretokenizedTextCompoundTokensTagger()

    def __call__(
        self, text: str, return_tokens: bool = False
    ) -> Union[list[list[str]], tuple[list[list[str]], list[list[str]]]]:
        """Analyse *text* and return the lemmas from its ``morph_analysis`` layer.

        Args:
            text: Raw input string to analyse.
            return_tokens: When true, also return the ``normalized_text``
                attribute of the ``morph_analysis`` layer.

        Returns:
            ``list(layer.lemma)`` when ``return_tokens`` is false; otherwise
            the tuple ``(list(layer.lemma), list(layer.normalized_text))``.
            Each entry presumably groups the alternative analyses of one
            word -- confirm against the EstNLTK layer documentation.
        """
        # Keep the raw string and the EstNLTK document under separate names
        # instead of rebinding the ``text`` parameter to a different type.
        document = Text(text)

        self.tokens_tagger.tag(document)
        self.compound_token_tagger.tag(document)
        # Create whatever input layers the morphological tagger still needs
        # before running it.
        document.tag_layer(self.tagger.input_layers)
        self.tagger.tag(document)

        morph_layer = document["morph_analysis"]
        lemmas = list(morph_layer.lemma)
        if return_tokens:
            return lemmas, list(morph_layer.normalized_text)
        return lemmas


if __name__ == "__main__":
    # Small manual demo: lemmatize one Estonian sentence and print the result.
    sample = "India köök: riisi-dhal, köögivilju ja roti-papad?"
    lemmatizer = Lemmatizer(
        proper_name=True,
        use_context=True,
        disambiguate=True,
        separate_punctuation=True,
    )
    print(lemmatizer(sample))