Update app.py
app.py CHANGED
@@ -5,7 +5,7 @@ import io
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

 # Load fine-tuned model and tokenizer
-model_name = "TAgroup5/
+model_name = "TAgroup5/news-classification-model"
 model = AutoModelForSequenceClassification.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)

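Since `pipeline` is imported alongside the model and tokenizer, the app presumably wires these objects into a Transformers pipeline. A minimal sketch under that assumption; the `"text-classification"` task string and the sample headline are illustrative, not taken from app.py:

```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Same loading step as in the diff above.
model_name = "TAgroup5/news-classification-model"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Assumed wiring: the diff shows the `pipeline` import but not this call.
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
print(classifier("Central bank holds interest rates steady amid inflation concerns."))
```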
@@ -36,10 +36,12 @@ if uploaded_file is not None:

 # Preprocessing function
 def preprocess_text(text):
-    text = text.lower() #
-    text = re.sub(r'\s
-
-
+    text = text.lower() # Convert to lowercase
+    text = re.sub(r'[^a-z\s]', '', text) # Remove special characters & numbers
+    tokens = word_tokenize(text) # Tokenization
+    tokens = [word for word in tokens if word not in stop_words] # Remove stopwords
+    tokens = [lemmatizer.lemmatize(word) for word in tokens] # Lemmatization
+    return " ".join(tokens)

 # Apply preprocessing and classification
 df['processed_content'] = df['content'].apply(preprocess_text)
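The added lines call `word_tokenize`, `stop_words`, and `lemmatizer`, which must be defined earlier in app.py, outside the hunks shown here. A minimal NLTK setup sketch consistent with those names; the specific download calls and the English stopword list are assumptions:

```python
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Assumed one-time resource downloads; the Space may provision these differently.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

stop_words = set(stopwords.words("english"))  # consumed by the stopword filter
lemmatizer = WordNetLemmatizer()              # consumed by the lemmatization step

# Quick check that the pieces fit together the way preprocess_text expects.
print([lemmatizer.lemmatize(w) for w in word_tokenize("markets are rallying") if w not in stop_words])
```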