MihanTilk committed on
Commit
0cc6138
·
verified ·
1 Parent(s): b76d6c6

Fixed a minor error

Browse files
Files changed (1) hide show
  1. app.py +44 -6
app.py CHANGED
@@ -1,10 +1,23 @@
1
  import streamlit as st
2
  import pandas as pd
 
3
  import matplotlib.pyplot as plt
4
  from wordcloud import WordCloud
5
  from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- # βœ… MUST be first Streamlit command
8
  st.set_page_config(page_title="πŸ“° News Classifier & Q&A App", layout="wide")
9
 
10
  # ----------------- Model Loader -----------------
@@ -99,8 +112,24 @@ if uploaded_file:
99
  st.error("❌ The uploaded CSV must contain a 'content' column.")
100
  st.stop()
101
 
102
- # Preprocess text
103
- df['cleaned_text'] = df['content'].astype(str).str.lower().str.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  st.write("πŸ“Š Preview of Uploaded Data:", df.head())
105
 
106
  # ----------------- Classification -----------------
@@ -169,7 +198,7 @@ if uploaded_file:
169
  with main_col2:
170
  # ----------------- Compact Word Cloud -----------------
171
  st.markdown("*Word Cloud*")
172
- text = " ".join(df['content'].tolist())
173
  wordcloud = WordCloud(
174
  width=300,
175
  height=200,
@@ -207,6 +236,11 @@ if uploaded_file:
207
  # ----------------- News Category Explorer -----------------
208
  st.subheader("πŸ” Explore News by Category")
209
 
 
 
 
 
 
210
  # Get unique categories
211
  categories = df['class'].unique()
212
 
@@ -214,7 +248,11 @@ if uploaded_file:
214
  cols = st.columns(5)
215
 
216
  # Create a dictionary to store category articles
217
- category_articles = {category: df[df['class'] == category] for category in categories}
 
 
 
 
218
 
219
  # Place each category button in its own column
220
  for i, category in enumerate(categories):
@@ -229,7 +267,7 @@ if uploaded_file:
229
  for idx, row in articles.iterrows():
230
  with st.expander(f"Article {idx + 1}: {row['content'][:50]}...", expanded=False):
231
  st.write(row['content'])
232
- st.caption(f"Classification confidence: {classifier(row['content'])[0]['score']:.2f}")
233
 
234
  # ----------------- Footer -----------------
235
  st.markdown("---")
 
1
  import streamlit as st
2
  import pandas as pd
3
+ import torch
4
  import matplotlib.pyplot as plt
5
  from wordcloud import WordCloud
6
  from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
7
+ import re
8
+ from nltk.corpus import stopwords
9
+ from nltk.tokenize import word_tokenize
10
+ from nltk.stem import WordNetLemmatizer
11
+ import nltk
12
+
13
+ nltk.download('stopwords')
14
+ nltk.download('punkt')
15
+ nltk.download('wordnet')
16
+ nltk.download('omw-1.4')
17
+
18
+ # Initialize lemmatizer
19
+ lemmatizer = WordNetLemmatizer()
20
 
 
21
  st.set_page_config(page_title="πŸ“° News Classifier & Q&A App", layout="wide")
22
 
23
  # ----------------- Model Loader -----------------
 
112
  st.error("❌ The uploaded CSV must contain a 'content' column.")
113
  st.stop()
114
 
115
+
116
# Define preprocessing function
def preprocess_text(text):
    """Normalize one article's raw text for classification and word clouds.

    Lowercases, strips punctuation/special characters, tokenizes, removes
    English stopwords, lemmatizes, and returns the surviving tokens joined
    back into a single space-separated string.
    """
    # Lowercasing
    text = text.lower()
    # Remove punctuation and special characters (keep word chars + whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Build the stopword set ONCE. stopwords.words('english') returns a fresh
    # list on every call, so using it directly inside the comprehension made
    # membership testing O(tokens * stopwords) per document.
    stop_words = set(stopwords.words('english'))
    # Drop stopwords and lemmatize the remaining tokens in one pass
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back to string
    return " ".join(tokens)

# Apply preprocessing
df['cleaned_text'] = df['content'].astype(str).apply(preprocess_text)
133
  st.write("πŸ“Š Preview of Uploaded Data:", df.head())
134
 
135
  # ----------------- Classification -----------------
 
198
  with main_col2:
199
  # ----------------- Compact Word Cloud -----------------
200
  st.markdown("*Word Cloud*")
201
+ text = " ".join(df['cleaned_text'].tolist())
202
  wordcloud = WordCloud(
203
  width=300,
204
  height=200,
 
236
  # ----------------- News Category Explorer -----------------
237
  st.subheader("πŸ” Explore News by Category")
238
 
239
with st.spinner("🔍 Classifying news articles..."):
    # Feed the whole column to the pipeline at once: transformers pipelines
    # accept a list of strings and batch internally, which is much faster
    # than one forward pass per row via .apply. For a list input the
    # pipeline returns one {'label', 'score'} dict per text.
    results = classifier(df['cleaned_text'].tolist())
    df['class'] = [r['label'] for r in results]
    df['confidence'] = [r['score'] for r in results]
243
+
244
  # Get unique categories
245
  categories = df['class'].unique()
246
 
 
248
  cols = st.columns(5)
249
 
250
  # Create a dictionary to store category articles
251
@st.cache_data
def get_category_articles(df):
    """Return a mapping of class label -> sub-DataFrame of its articles.

    Cached by Streamlit so the grouping is not recomputed on every rerun
    for the same data.
    """
    grouped = {}
    for label in df['class'].unique():
        grouped[label] = df[df['class'] == label]
    return grouped

category_articles = get_category_articles(df)
256
 
257
  # Place each category button in its own column
258
  for i, category in enumerate(categories):
 
267
  for idx, row in articles.iterrows():
268
  with st.expander(f"Article {idx + 1}: {row['content'][:50]}...", expanded=False):
269
  st.write(row['content'])
270
+ st.caption(f"Classification confidence: {row['confidence']:.2f}")
271
 
272
  # ----------------- Footer -----------------
273
  st.markdown("---")