MihanTilk committed on
Commit
0cc6138
·
verified ·
1 Parent(s): b76d6c6

Fixed a minor error

Browse files
Files changed (1) hide show
  1. app.py +44 -6
app.py CHANGED
@@ -1,10 +1,23 @@
1
  import streamlit as st
2
  import pandas as pd
 
3
  import matplotlib.pyplot as plt
4
  from wordcloud import WordCloud
5
  from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- # βœ… MUST be first Streamlit command
8
  st.set_page_config(page_title="πŸ“° News Classifier & Q&A App", layout="wide")
9
 
10
  # ----------------- Model Loader -----------------
@@ -99,8 +112,24 @@ if uploaded_file:
99
  st.error("❌ The uploaded CSV must contain a 'content' column.")
100
  st.stop()
101
 
102
- # Preprocess text
103
- df['cleaned_text'] = df['content'].astype(str).str.lower().str.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  st.write("πŸ“Š Preview of Uploaded Data:", df.head())
105
 
106
  # ----------------- Classification -----------------
@@ -169,7 +198,7 @@ if uploaded_file:
169
  with main_col2:
170
  # ----------------- Compact Word Cloud -----------------
171
  st.markdown("*Word Cloud*")
172
- text = " ".join(df['content'].tolist())
173
  wordcloud = WordCloud(
174
  width=300,
175
  height=200,
@@ -207,6 +236,11 @@ if uploaded_file:
207
  # ----------------- News Category Explorer -----------------
208
  st.subheader("πŸ” Explore News by Category")
209
 
 
 
 
 
 
210
  # Get unique categories
211
  categories = df['class'].unique()
212
 
@@ -214,7 +248,11 @@ if uploaded_file:
214
  cols = st.columns(5)
215
 
216
  # Create a dictionary to store category articles
217
- category_articles = {category: df[df['class'] == category] for category in categories}
 
 
 
 
218
 
219
  # Place each category button in its own column
220
  for i, category in enumerate(categories):
@@ -229,7 +267,7 @@ if uploaded_file:
229
  for idx, row in articles.iterrows():
230
  with st.expander(f"Article {idx + 1}: {row['content'][:50]}...", expanded=False):
231
  st.write(row['content'])
232
- st.caption(f"Classification confidence: {classifier(row['content'])[0]['score']:.2f}")
233
 
234
  # ----------------- Footer -----------------
235
  st.markdown("---")
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import torch
4
  import matplotlib.pyplot as plt
5
  from wordcloud import WordCloud
6
  from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
7
+ import re
8
+ from nltk.corpus import stopwords
9
+ from nltk.tokenize import word_tokenize
10
+ from nltk.stem import WordNetLemmatizer
11
+ import nltk
12
+
13
+ nltk.download('stopwords')
14
+ nltk.download('punkt')
15
+ nltk.download('wordnet')
16
+ nltk.download('omw-1.4')
17
+
18
+ # Initialize lemmatizer
19
+ lemmatizer = WordNetLemmatizer()
20
 
 
21
  st.set_page_config(page_title="πŸ“° News Classifier & Q&A App", layout="wide")
22
 
23
  # ----------------- Model Loader -----------------
 
112
  st.error("❌ The uploaded CSV must contain a 'content' column.")
113
  st.stop()
114
 
115
+
116
# Define preprocessing function
def preprocess_text(text):
    """Normalize one article's raw text for classification and word clouds.

    Lowercases, strips punctuation/special characters, tokenizes, removes
    English stopwords, lemmatizes, and returns the surviving tokens joined
    back into a single space-separated string.
    """
    # Lowercasing
    text = text.lower()
    # Remove punctuation and special characters (keep word chars + whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Build the stopword set ONCE. stopwords.words('english') returns a fresh
    # list on every call, so using it directly inside the comprehension made
    # membership testing O(tokens * stopwords) per document.
    stop_words = set(stopwords.words('english'))
    # Drop stopwords and lemmatize the remaining tokens in one pass
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back to string
    return " ".join(tokens)

# Apply preprocessing
df['cleaned_text'] = df['content'].astype(str).apply(preprocess_text)
133
  st.write("πŸ“Š Preview of Uploaded Data:", df.head())
134
 
135
  # ----------------- Classification -----------------
 
198
  with main_col2:
199
  # ----------------- Compact Word Cloud -----------------
200
  st.markdown("*Word Cloud*")
201
+ text = " ".join(df['cleaned_text'].tolist())
202
  wordcloud = WordCloud(
203
  width=300,
204
  height=200,
 
236
  # ----------------- News Category Explorer -----------------
237
  st.subheader("πŸ” Explore News by Category")
238
 
239
with st.spinner("🔍 Classifying news articles..."):
    # Feed the whole column to the pipeline at once: transformers pipelines
    # accept a list of strings and batch internally, which is much faster
    # than one forward pass per row via .apply. For a list input the
    # pipeline returns one {'label', 'score'} dict per text.
    results = classifier(df['cleaned_text'].tolist())
    df['class'] = [r['label'] for r in results]
    df['confidence'] = [r['score'] for r in results]
243
+
244
  # Get unique categories
245
  categories = df['class'].unique()
246
 
 
248
  cols = st.columns(5)
249
 
250
  # Create a dictionary to store category articles
251
@st.cache_data
def get_category_articles(df):
    """Return a mapping of class label -> sub-DataFrame of its articles.

    Cached by Streamlit so the grouping is not recomputed on every rerun
    for the same data.
    """
    grouped = {}
    for label in df['class'].unique():
        grouped[label] = df[df['class'] == label]
    return grouped

category_articles = get_category_articles(df)
256
 
257
  # Place each category button in its own column
258
  for i, category in enumerate(categories):
 
267
  for idx, row in articles.iterrows():
268
  with st.expander(f"Article {idx + 1}: {row['content'][:50]}...", expanded=False):
269
  st.write(row['content'])
270
+ st.caption(f"Classification confidence: {row['confidence']:.2f}")
271
 
272
  # ----------------- Footer -----------------
273
  st.markdown("---")