import streamlit as st
import pandas as pd
from transformers import pipeline
from sentence_transformers import CrossEncoder
from sentence_transformers import SentenceTransformer
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
# Download NLTK resources (these calls are no-ops if the data is already present)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
# Set modern page configuration
st.set_page_config(page_title="News Analyzer", layout="wide")
# Inject custom CSS for sleek dark blue theme with black fonts
st.markdown("""
""", unsafe_allow_html=True)
# Modern Header
st.markdown("
", unsafe_allow_html=True)
# Load the Hugging Face models
classifier = pipeline("text-classification", model="Sandini/news-classifier") # Classification pipeline
qa_pipeline = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad") # QA pipeline
# Initialize Cross-Encoder for QA relevance scoring
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2') # Pre-trained Cross-Encoder model
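# Note: Streamlit reruns this whole script on every interaction, so each rerun
# reloads all three models. Wrapping the loads in functions decorated with
# @st.cache_resource would keep them in memory across reruns.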
# Define preprocessing functions for classification
def preprocess_text(text):
    if not isinstance(text, str):
        text = ""
    # Step 1: Lowercase the text
    text = text.lower()
    # Step 2: Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Step 3: Tokenize the text
    tokens = word_tokenize(text)
    # Step 4: Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Step 5: Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Step 6: Join tokens back into a single string
    preprocessed_text = " ".join(tokens)
    return preprocessed_text
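# Example (hypothetical input): preprocess_text("The Stocks Rallied Today!")
# returns "stock rallied today" -- lowercased, the punctuation and the stopword
# "the" removed, and the plural noun "stocks" lemmatized to "stock".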
# Label mapping (category name -> numeric id) and its reverse
label_mapping = {
    "Business": 0,
    "Opinion": 1,
    "Sports": 2,
    "Political_gossip": 3,
    "World_news": 4
}
reverse_label_mapping = {v: k for k, v in label_mapping.items()}  # numeric id -> category name
# Define a function to predict the category for a single text
def predict_category(text):
    prediction = classifier(text)
    predicted_label_id = int(prediction[0]['label'].split('_')[-1])  # Extract numeric id from 'LABEL_X'
    return reverse_label_mapping[predicted_label_id]
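# Example: if the classifier returns [{'label': 'LABEL_2', 'score': 0.98}]
# (the score here is illustrative), the numeric id 2 maps back to 'Sports'.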
# Responsive Layout - Uses full width
col1, col2 = st.columns([1.1, 1])
# Left Section - File Upload & CSV/Excel Display
with col1:
st.subheader("📂 Upload News Data")
uploaded_file = st.file_uploader("Upload a CSV or Excel file", type=["csv", "xlsx"])
if uploaded_file is not None:
# Determine the file extension
file_extension = uploaded_file.name.split('.')[-1]
if file_extension == 'csv':
df = pd.read_csv(uploaded_file)
elif file_extension == 'xlsx':
df = pd.read_excel(uploaded_file)
# Preprocess the content column and predict categories
if 'content' in df.columns:
df['content'] = df['content'].fillna("").astype(str)
df['preprocessed_content'] = df['content'].apply(preprocess_text)
df['class'] = df['preprocessed_content'].apply(predict_category)
# Drop the preprocessed_content column before displaying or saving
df_for_display = df.drop(columns=['preprocessed_content'], errors='ignore')
df_for_download = df.drop(columns=['preprocessed_content'], errors='ignore')
# Download button
st.download_button(
label="⬇️ Download Processed Data",
data=df_for_download.to_csv(index=False).encode('utf-8'),
file_name="output.csv",
mime="text/csv"
)
# CSV Preview Box
st.markdown("📜 CSV/Excel Preview
", unsafe_allow_html=True)
st.dataframe(df_for_display, use_container_width=True)
# Right Section - Q&A Interface
with col2:
st.subheader("🤖 AI Assistant")
# Answer Display Box (Initially Empty)
answer_placeholder = st.empty()
answer_placeholder.markdown("", unsafe_allow_html=True)
# Question Input
st.markdown("### 🔍 Ask Your Question:")
user_question = st.text_input("Enter your question here", label_visibility="hidden") # Hides the label
# Button & Answer Display
if st.button("🔮 Get Answer"):
if user_question.strip() and uploaded_file is not None:
# Ensure the DataFrame has the required content column
if 'content' in df.columns:
context = df['content'].dropna().tolist() # Use the content column as context
# Prepare pairs of (question, context)
pairs = [(user_question, c) for c in context]
# Score each pair using the Cross-Encoder
scores = cross_encoder.predict(pairs)
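                # The Cross-Encoder outputs one relevance score per pair;
                # higher scores mean the article is more relevant to the question.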
                # argsort sorts ascending, so take the last 5 indices and
                # reverse them to get the top 5 matches, highest score first
                top_indices = scores.argsort()[-5:][::-1]
                top_context = "\n".join([context[i] for i in top_indices])
                # Get an answer from the QA model using the top context
                result = qa_pipeline(question=user_question, context=top_context)
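                # Note: the QA pipeline handles contexts longer than the model's
                # token limit by splitting them into overlapping chunks and
                # returning the highest-scoring answer span.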
                answer = result['answer']
            else:
                answer = "⚠️ File does not contain a 'content' column!"
        else:
            answer = "⚠️ Please upload a valid file first!"
        answer_placeholder.markdown(f"{answer}", unsafe_allow_html=True)