Spaces:
Sleeping
Sleeping
| import re | |
| import nltk | |
| from nltk.tokenize import word_tokenize | |
| from nltk.stem import WordNetLemmatizer | |
| from nltk.corpus import stopwords | |
# Fetch the NLTK resources this module needs at import time.
# 'punkt' serves word_tokenize on older NLTK releases; newer releases
# (3.8.2 and later) look up 'punkt_tab' instead, so both are requested.
# The former nltk.download('corpus') call was dropped: 'corpus' is not a
# resource id in the NLTK index, so it only produced a download error.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Built once at import so membership tests in preprocess() are O(1).
stop_words = set(stopwords.words('english'))  # set of English stop words
lemmatizer = WordNetLemmatizer()
def preprocess(text, target_language='en'):
    """Normalize raw text into a list of lemmatized, non-stop-word tokens.

    Processing order: coerce to ``str``, lowercase, strip URLs, drop every
    character that is not an ASCII letter or whitespace, collapse runs of
    three or more identical characters into one, tokenize, lemmatize, and
    finally discard English stop words.

    Args:
        text: The text to clean. Non-string values are converted with
            ``str()``. ``None`` and float NaN are treated as missing input
            and yield an empty list instead of the tokens 'none' / 'nan'.
        target_language: Currently unused by this function; kept so
            existing callers that pass it continue to work.

    Returns:
        A list of the remaining tokens, in their original order.

    Raises:
        TypeError: If ``text`` cannot be converted to a string.
    """
    import math

    # Missing values would otherwise be stringified into real-looking
    # tokens ('none', 'nan') and pollute downstream features.
    if text is None or (isinstance(text, float) and math.isnan(text)):
        return []

    if not isinstance(text, str):
        try:
            text = str(text)
        except Exception as exc:
            raise TypeError('Input must be a string or a float') from exc

    # convert to lowercase
    text = text.lower()
    # Remove URLs (anything starting with "http" up to the next whitespace)
    text = re.sub(r'http\S+', '', text)
    # Remove special characters and punctuation (digits are removed too)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse 3+ consecutive identical characters into a single one
    text = re.sub(r'(.)\1{2,}', r'\1', text)

    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(w) for w in words]
    # Stop words are filtered after lemmatization, matching the lemma forms.
    return [w for w in words if w not in stop_words]