Spaces:
Running
Running
| import hazm | |
| import typing | |
# Shared hazm components, built once at import time and reused by the
# functions in this module.
normalizer = hazm.Normalizer()
sent_tokenizer = hazm.SentenceTokenizer()
word_tokenizer = hazm.WordTokenizer()

# The model path is already a string literal, so it is passed as-is.
tagger = hazm.POSTagger(model="gyroing/PersianTextCorrection_Hazm/pos_tagger.model")
def preprocess_text(text: str) -> str:
    """Normalize text with hazm and return it as one space-joined string.

    The text is normalized, split into sentences, and every sentence is split
    into words. Each sentence's words are passed through ``fix_words`` and
    re-joined with single spaces; the processed sentences are then joined
    with single spaces as well.

    Args:
        text: Raw input text.

    Returns:
        The processed text as a single string (not a nested list of words).
    """
    normalized = normalizer.normalize(text)
    return " ".join(
        " ".join(fix_words(word_tokenizer.tokenize(sentence)))
        for sentence in sent_tokenizer.tokenize(normalized)
    )
def fix_words(words: typing.List[str]) -> typing.List[str]:
    """Append a kasra to the words whose POS tag ends in ``"Z"``.

    The words are tagged with the module-level ``tagger``. For every
    ``(word, tag)`` pair whose tag ends in ``"Z"`` (presumably hazm's Ezafe
    tags -- confirm against the tagger model) and whose word does not already
    end with the kasra diacritic, a kasra is appended. If such a word ends in
    "ه" that is not preceded by "ا", a "ی" is inserted before the kasra.

    Empty tokens are passed through unchanged.

    Args:
        words: Tokens of a single sentence.

    Returns:
        A new list holding one (possibly extended) word per pair returned by
        the tagger, in the same order.
    """
    kasra = "ِ"
    fixed_words = []
    for word, pos in tagger.tag(words):
        # endswith() is safe on empty strings, unlike indexing with [-1].
        if word and pos.endswith("Z") and not word.endswith(kasra):
            # A slice instead of word[-2]: for a one-character word it yields
            # "" rather than raising IndexError.
            if word.endswith("ه") and word[-2:-1] != "ا":
                word += "ی"
            word += kasra
        fixed_words.append(word)
    return fixed_words