Fixed a minor error
app.py
CHANGED
@@ -1,10 +1,23 @@
 import streamlit as st
 import pandas as pd
+import torch
 import matplotlib.pyplot as plt
 from wordcloud import WordCloud
 from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
+import re
+from nltk.corpus import stopwords
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer
+import nltk
+
+nltk.download('stopwords')
+nltk.download('punkt')
+nltk.download('wordnet')
+nltk.download('omw-1.4')
+
+# Initialize lemmatizer
+lemmatizer = WordNetLemmatizer()
 
-# ✅ MUST be first Streamlit command
 st.set_page_config(page_title="📰 News Classifier & Q&A App", layout="wide")
 
 # ----------------- Model Loader -----------------
@@ -99,8 +112,24 @@ if uploaded_file:
         st.error("❌ The uploaded CSV must contain a 'content' column.")
         st.stop()
 
-
-
+
+    # Define preprocessing function
+    def preprocess_text(text):
+        # Lowercasing
+        text = text.lower()
+        # Remove punctuation and special characters
+        text = re.sub(r'[^\w\s]', '', text)
+        # Tokenize
+        tokens = word_tokenize(text)
+        # Remove stopwords
+        tokens = [word for word in tokens if word not in stopwords.words('english')]
+        # Lemmatize tokens
+        tokens = [lemmatizer.lemmatize(word) for word in tokens]
+        # Join tokens back to string
+        return " ".join(tokens)
+
+    # Apply preprocessing
+    df['cleaned_text'] = df['content'].astype(str).apply(preprocess_text)
     st.write("📊 Preview of Uploaded Data:", df.head())
 
     # ----------------- Classification -----------------
@@ -169,7 +198,7 @@ if uploaded_file:
     with main_col2:
         # ----------------- Compact Word Cloud -----------------
         st.markdown("*Word Cloud*")
-        text = " ".join(df['
+        text = " ".join(df['cleaned_text'].tolist())
         wordcloud = WordCloud(
             width=300,
             height=200,
@@ -207,6 +236,11 @@ if uploaded_file:
     # ----------------- News Category Explorer -----------------
     st.subheader("🔍 Explore News by Category")
 
+    with st.spinner("🔄 Classifying news articles..."):
+        classified = df['cleaned_text'].apply(lambda text: classifier(text)[0])
+        df['class'] = classified.apply(lambda x: x['label'])
+        df['confidence'] = classified.apply(lambda x: x['score'])
+
     # Get unique categories
     categories = df['class'].unique()
 
@@ -214,7 +248,11 @@ if uploaded_file:
     cols = st.columns(5)
 
     # Create a dictionary to store category articles
-
+    @st.cache_data
+    def get_category_articles(df):
+        return {category: df[df['class'] == category] for category in df['class'].unique()}
+
+    category_articles = get_category_articles(df)
 
     # Place each category button in its own column
     for i, category in enumerate(categories):
@@ -229,7 +267,7 @@ if uploaded_file:
    for idx, row in articles.iterrows():
        with st.expander(f"Article {idx + 1}: {row['content'][:50]}...", expanded=False):
            st.write(row['content'])
-            st.caption(f"Classification confidence: {
+            st.caption(f"Classification confidence: {row['confidence']:.2f}")
 
    # ----------------- Footer -----------------
    st.markdown("---")
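A note on the new setup block: Streamlit reruns the entire script on every user interaction, so the four nltk.download calls execute on each rerun. They are no-ops once the corpora are on disk, but wrapping them in Streamlit's resource cache keeps them to one call per server process. A minimal sketch, assuming Streamlit 1.18+ for st.cache_resource (the helper name ensure_nltk_data is illustrative); recent NLTK releases may also need the 'punkt_tab' package for word_tokenize:

import nltk
import streamlit as st

@st.cache_resource  # runs once per server process, not on every rerun
def ensure_nltk_data():
    for pkg in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
        nltk.download(pkg, quiet=True)

ensure_nltk_data()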
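For reference, the committed preprocess_text pipeline as a standalone sketch with a usage example. One assumed tweak over the diff: stopwords.words('english') is materialized into a set once, since the committed list comprehension rebuilds the stopword list for every token:

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # build once; O(1) membership tests

def preprocess_text(text):
    text = text.lower()                      # lowercase
    text = re.sub(r'[^\w\s]', '', text)      # drop punctuation and special chars
    tokens = word_tokenize(text)             # split into word tokens
    tokens = [w for w in tokens if w not in stop_words]        # remove stopwords
    return " ".join(lemmatizer.lemmatize(w) for w in tokens)   # lemmatize, rejoin

print(preprocess_text("The markets rallied, and investors were cheering!"))
# -> roughly "market rallied investor cheering"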
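The new spinner block classifies one row at a time via df.apply. Hugging Face pipelines also accept a list of strings, which enables batched inference, and truncation=True guards against articles longer than the model's maximum sequence length. A hedged variant of the same step (classifier and the column names come from the app; batch_size=16 is an arbitrary illustration):

with st.spinner("🔄 Classifying news articles..."):
    # One pipeline call over all rows; inputs are batched, over-long texts truncated.
    results = classifier(df['cleaned_text'].tolist(), truncation=True, batch_size=16)
    df['class'] = [r['label'] for r in results]
    df['confidence'] = [r['score'] for r in results]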