Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -74,9 +74,11 @@ stop_words.update('ask','much','thank','etc.', 'e', 'We', 'In', 'ed','pa', 'This
|
|
| 74 |
|
| 75 |
def clean_text(text):
|
| 76 |
'''
|
| 77 |
-
|
| 78 |
'''
|
| 79 |
text = text.encode("ascii", errors="ignore").decode("ascii") # remove non-ASCII characters
|
|
|
|
|
|
|
| 80 |
text = re.sub(r"\n", " ", text)
|
| 81 |
text = re.sub(r"\n\n", " ", text)
|
| 82 |
text = re.sub(r"\t", " ", text)
|
|
@@ -84,7 +86,7 @@ def clean_text(text):
|
|
| 84 |
text = text.strip(" ")
|
| 85 |
text = re.sub(" +", " ", text).strip() # collapse runs of spaces into a single space
|
| 86 |
|
| 87 |
-
text = [word for word in text.split() if word not in
|
| 88 |
text = ' '.join(text)
|
| 89 |
return text
|
| 90 |
|
|
|
|
| 74 |
|
| 75 |
def clean_text(text):
|
| 76 |
'''
|
| 77 |
+
Return the cleaned text: non-ASCII characters dropped, contractions expanded, whitespace collapsed, and stop words removed.
|
| 78 |
'''
|
| 79 |
text = text.encode("ascii", errors="ignore").decode("ascii") # remove non-ASCII characters
|
| 80 |
+
text=unidecode.unidecode(text)# remove diacritics
|
| 81 |
+
text=contractions.fix(text) # expand contractions
|
| 82 |
text = re.sub(r"\n", " ", text)
|
| 83 |
text = re.sub(r"\n\n", " ", text)
|
| 84 |
text = re.sub(r"\t", " ", text)
|
|
|
|
| 86 |
text = text.strip(" ")
|
| 87 |
text = re.sub(" +", " ", text).strip() # collapse runs of spaces into a single space
|
| 88 |
|
| 89 |
+
text = [word for word in text.split() if word not in stop_words]
|
| 90 |
text = ' '.join(text)
|
| 91 |
return text
|
| 92 |
|