# NOTE: the lines below were non-code residue scraped from the Hugging Face
# Spaces page (status badges, file size, commit hashes, and a line-number
# gutter). They were not valid Python; preserved here as a comment for provenance:
#   Spaces: Running | File size: 7,742 Bytes | commits b2c9005 6ce73bd 1651757 849a4de 1818159 ec85722
import streamlit as st
import pandas as pd
from transformers import pipeline
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
# Download required NLTK data once at startup.
# quiet=True suppresses per-run console noise: Streamlit re-executes this
# script on every widget interaction, and nltk.download() is a cheap no-op
# when the resource is already present locally.
for _resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(_resource, quiet=True)
# Set modern page configuration
# Streamlit page-level settings; layout="wide" makes the app span the full
# browser width (needed for the two-column layout further down).
st.set_page_config(page_title="News Analyzer", layout="wide")
# Inject custom CSS for sleek dark blue theme with black fonts
# The stylesheet targets Streamlit's generated widget classes (stButton,
# stTextInput) plus the custom classes this app renders via st.markdown
# (custom-header, answer-box, csv-box, dataframe-container).
# unsafe_allow_html=True is required to inject raw <style> markup.
st.markdown("""
<style>
/* Global Styling */
body {
background: #0b132b;
font-family: 'Arial', sans-serif;
margin: 0;
padding: 0;
}
/* Header Styling */
.custom-header {
background: linear-gradient(to right, #1f4068, #1b1b2f);
padding: 1.5rem;
margin-bottom: 1.5rem;
border-radius: 12px;
text-align: center;
font-size: 30px;
font-weight: bold;
box-shadow: 0px 4px 15px rgba(0, 217, 255, 0.3);
}
/* Buttons */
.stButton>button {
background: linear-gradient(45deg, #0072ff, #00c6ff);
border-radius: 8px;
padding: 14px 28px;
font-size: 18px;
transition: 0.3s ease;
border: none;
}
.stButton>button:hover {
transform: scale(1.05);
box-shadow: 0px 4px 10px rgba(0, 255, 255, 0.5);
}
/* Text Input */
.stTextInput>div>div>input {
background-color: rgba(255, 255, 255, 0.1);
border-radius: 8px;
padding: 12px;
font-size: 18px;
}
/* Dataframe Container */
.dataframe-container {
background: rgba(255, 255, 255, 0.1);
padding: 15px;
border-radius: 12px;
}
/* Answer Display Box - Larger */
.answer-box {
background: rgba(0, 217, 255, 0.15);
padding: 35px;
border-radius: 15px;
border: 2px solid rgba(0, 217, 255, 0.6);
font-size: 22px;
text-align: center;
margin-bottom: 20px;
min-height: 150px;
box-shadow: 0px 2px 12px rgba(0, 217, 255, 0.3);
display: flex;
align-items: center;
justify-content: center;
transition: all 0.3s ease;
}
/* CSV Display Box */
.csv-box {
background: rgba(255, 255, 255, 0.1);
padding: 15px;
border-radius: 12px;
margin-top: 20px;
box-shadow: 0px 2px 12px rgba(0, 217, 255, 0.3);
}
</style>
""", unsafe_allow_html=True)
# Modern Header
# NOTE(review): the emoji glyphs throughout this file appear mojibake-garbled
# (e.g. "๐งฉ"); they are runtime strings, so they are kept byte-for-byte as-is.
st.markdown("<div class='custom-header'> ๐งฉ AI-Powered News Analyzer</div>", unsafe_allow_html=True)
# Load the Hugging Face models once per server process.
# Streamlit re-runs this whole script on every widget interaction; without
# caching, all three models would be re-downloaded/re-instantiated each time.
@st.cache_resource
def _load_models():
    """Load and return (classifier, qa_pipeline, cross_encoder) exactly once."""
    news_classifier = pipeline("text-classification", model="Sandini/news-classifier")  # Classification pipeline
    qa = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad")  # QA pipeline
    # Pre-trained Cross-Encoder used for question/article relevance scoring.
    scorer = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return news_classifier, qa, scorer

# Same module-level names as before, so all downstream code is unchanged.
classifier, qa_pipeline, cross_encoder = _load_models()
# Define preprocessing functions for classification.
# Hoisted invariants: the original rebuilt the English stopword set and a
# WordNetLemmatizer on every call; construct them once at import time
# (the required NLTK corpora are downloaded at the top of this script).
_STOP_WORDS = set(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Normalize raw article text for the news classifier.

    Pipeline: coerce non-strings to "", lowercase, strip punctuation,
    tokenize, drop English stopwords, lemmatize, and re-join with spaces.

    Args:
        text: Raw article text; any non-str value is treated as empty.

    Returns:
        The cleaned, space-joined token string.
    """
    if not isinstance(text, str):
        text = ""
    # Lowercase + single C-level pass to remove all punctuation.
    text = text.lower().translate(_PUNCT_TABLE)
    tokens = [
        _LEMMATIZER.lemmatize(token)
        for token in word_tokenize(text)
        if token not in _STOP_WORDS
    ]
    return " ".join(tokens)
# Category names in classifier label order: the tuple index IS the numeric
# label id emitted by the model, so both mappings derive from one source.
_CATEGORIES = ("Business", "Opinion", "Sports", "Political_gossip", "World_news")

# Forward mapping (category name -> numeric label).
label_mapping = {name: idx for idx, name in enumerate(_CATEGORIES)}
# Reverse mapping (numeric label -> category name).
reverse_label_mapping = dict(enumerate(_CATEGORIES))
# Predict the human-readable category for a single article.
def predict_category(text):
    """Run the classifier on *text* and map its label id to a category name."""
    # The pipeline returns e.g. [{'label': 'LABEL_2', 'score': ...}]; the
    # digits after the last underscore are the numeric class id.
    raw_label = classifier(text)[0]['label']
    label_id = int(raw_label.rsplit('_', 1)[-1])
    return reverse_label_mapping[label_id]
# Responsive layout - uses full width; left column for upload/preview,
# right column for the Q&A assistant.
col1, col2 = st.columns([1.1, 1])

# Left section - file upload, classification, and CSV/Excel display.
with col1:
    st.subheader("๐ Upload News Data")
    uploaded_file = st.file_uploader("Upload a CSV or Excel file", type=["csv", "xlsx"])
    if uploaded_file is not None:
        # Lowercase the extension so uppercase names ("DATA.CSV") are accepted;
        # the original left `df` unassigned for them, causing a NameError below.
        file_extension = uploaded_file.name.split('.')[-1].lower()
        if file_extension == 'csv':
            df = pd.read_csv(uploaded_file)
        elif file_extension == 'xlsx':
            df = pd.read_excel(uploaded_file)
        else:
            # The uploader filters types, so this is defensive only.
            st.error("Unsupported file type. Please upload a .csv or .xlsx file.")
            st.stop()
        # Preprocess the content column and predict a category per row.
        if 'content' in df.columns:
            df['content'] = df['content'].fillna("").astype(str)
            df['preprocessed_content'] = df['content'].apply(preprocess_text)
            df['class'] = df['preprocessed_content'].apply(predict_category)
        # Drop the helper column once and reuse the frame for both the
        # download payload and the on-screen preview (the original built
        # two identical copies).
        df_for_display = df.drop(columns=['preprocessed_content'], errors='ignore')
        # Download button for the classified data.
        st.download_button(
            label="โฌ๏ธ Download Processed Data",
            data=df_for_display.to_csv(index=False).encode('utf-8'),
            file_name="output.csv",
            mime="text/csv"
        )
        # CSV preview box.
        st.markdown("<div class='csv-box'><h4>๐ CSV/Excel Preview</h4></div>", unsafe_allow_html=True)
        st.dataframe(df_for_display, use_container_width=True)
# Right section - Q&A interface over the uploaded articles.
with col2:
    st.subheader("๐ค AI Assistant")
    # Answer display box (initially empty; filled after a question is asked).
    answer_placeholder = st.empty()
    answer_placeholder.markdown("<div class='answer-box'></div>", unsafe_allow_html=True)
    # Question input.
    st.markdown("### ๐ Ask Your Question:")
    user_question = st.text_input("Enter your question here", label_visibility="hidden")  # Hides the label
    # Button & answer display.
    if st.button("๐ฎ Get Answer"):
        if user_question.strip() and uploaded_file is not None:
            # Ensure the DataFrame has the required content column.
            if 'content' in df.columns:
                context = df['content'].dropna().tolist()  # Articles as candidate contexts
                if context:
                    # Score every (question, article) pair with the cross-encoder;
                    # guarding against an empty list avoids predicting on [].
                    pairs = [(user_question, c) for c in context]
                    scores = cross_encoder.predict(pairs)
                    # Keep the best-matching articles (at most 5, fewer if the
                    # file has fewer rows).
                    top_k = min(5, len(context))
                    top_indices = scores.argsort()[-top_k:][::-1]
                    top_context = "\n".join(context[i] for i in top_indices)
                    # Extract the answer span from the concatenated top contexts.
                    result = qa_pipeline(question=user_question, context=top_context)
                    answer = result['answer']
                else:
                    answer = "โ ๏ธ The 'content' column has no text to search!"
            else:
                answer = "โ ๏ธ File does not contain a 'content' column!"
        else:
            answer = "โ ๏ธ Please upload a valid file first!"
        answer_placeholder.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)