Sandini committed
Commit ec85722 · verified · Parent: d141fe1

Update app.py

Files changed (1): app.py (+11 -23)

app.py CHANGED
@@ -1,17 +1,14 @@
 import streamlit as st
 import pandas as pd
 from transformers import pipeline
-from sklearn.metrics.pairwise import cosine_similarity
-from sentence_transformers import SentenceTransformer
-import string
+from sentence_transformers import CrossEncoder
+import nltk
 from nltk.tokenize import word_tokenize
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
-import nltk
 
 # Download NLTK resources (run this once if not already downloaded)
 nltk.download('punkt')
-nltk.download('punkt_tab')
 nltk.download('stopwords')
 nltk.download('wordnet')
 
@@ -28,7 +25,6 @@ st.markdown("""
         margin: 0;
         padding: 0;
     }
-
     /* Header Styling */
     .custom-header {
         background: linear-gradient(to right, #1f4068, #1b1b2f);
@@ -40,7 +36,6 @@ st.markdown("""
         font-weight: bold;
         box-shadow: 0px 4px 15px rgba(0, 217, 255, 0.3);
     }
-
     /* Buttons */
     .stButton>button {
         background: linear-gradient(45deg, #0072ff, #00c6ff);
@@ -54,7 +49,6 @@ st.markdown("""
         transform: scale(1.05);
         box-shadow: 0px 4px 10px rgba(0, 255, 255, 0.5);
     }
-
     /* Text Input */
     .stTextInput>div>div>input {
         background-color: rgba(255, 255, 255, 0.1);
@@ -62,14 +56,12 @@ st.markdown("""
         padding: 12px;
         font-size: 18px;
     }
-
     /* Dataframe Container */
     .dataframe-container {
         background: rgba(255, 255, 255, 0.1);
         padding: 15px;
         border-radius: 12px;
     }
-
     /* Answer Display Box - Larger */
     .answer-box {
         background: rgba(0, 217, 255, 0.15);
@@ -86,7 +78,6 @@ st.markdown("""
         justify-content: center;
         transition: all 0.3s ease;
     }
-
     /* CSV Display Box */
     .csv-box {
         background: rgba(255, 255, 255, 0.1);
@@ -105,8 +96,8 @@ st.markdown("<div class='custom-header'> 🧩 AI-Powered News Analyzer</div>", u
 classifier = pipeline("text-classification", model="Sandini/news-classifier") # Classification pipeline
 qa_pipeline = pipeline("question-answering", model="distilbert/distilbert-base-cased-distilled-squad") # QA pipeline
 
-# Initialize sentence transformer model for QA similarity
-sentence_model = SentenceTransformer('all-MiniLM-L6-v2') # Pre-trained sentence model
+# Initialize Cross-Encoder for QA relevance scoring
+cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2') # Pre-trained Cross-Encoder model
 
 # Define preprocessing functions for classification
 def preprocess_text(text):
@@ -186,7 +177,6 @@ with col1:
     st.markdown("<div class='csv-box'><h4>📜 CSV/Excel Preview</h4></div>", unsafe_allow_html=True)
     st.dataframe(df_for_display, use_container_width=True)
 
-
 # Right Section - Q&A Interface
 with col2:
     st.subheader("🤖 AI Assistant")
@@ -206,15 +196,14 @@ with col2:
         if 'content' in df.columns:
             context = df['content'].dropna().tolist() # Use the content column as context
 
-            # Generate embeddings for the context and the question
-            context_embeddings = sentence_model.encode(context)
-            question_embedding = sentence_model.encode([user_question])
+            # Prepare pairs of (question, context)
+            pairs = [(user_question, c) for c in context]
 
-            # Calculate cosine similarity
-            similarities = cosine_similarity(question_embedding, context_embeddings)
-            top_indices = similarities[0].argsort()[-5:][::-1] # Get top 5 similar rows
+            # Score each pair using the Cross-Encoder
+            scores = cross_encoder.predict(pairs)
 
-            # Prepare the top 5 similar context rows
+            # Get top matches based on scores
+            top_indices = scores.argsort()[-5:][::-1] # Get indices of top 5 matches
             top_context = "\n".join([context[i] for i in top_indices])
 
             # Get answer from Hugging Face model using top context
@@ -225,5 +214,4 @@ with col2:
         else:
             answer = "⚠️ Please upload a valid file first!"
 
-        answer_placeholder.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
-
+        answer_placeholder.markdown(f"<div class='answer-box'>{answer}</div>", unsafe_allow_html=True)
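For reference, a minimal sketch of the retrieval flow this commit switches to, runnable on its own: instead of embedding the question and rows separately and ranking by cosine similarity, a Cross-Encoder scores each (question, row) pair directly, and the top-scoring rows are concatenated into the QA context. The model names are taken from the diff; the question, the sample rows, and the k variable are invented for illustration (the app hard-codes the top 5).

# Sketch of the Cross-Encoder re-ranking introduced in this commit (sample data is hypothetical).
from sentence_transformers import CrossEncoder
from transformers import pipeline

cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
)

question = "Who won the championship?"  # hypothetical user question
context_rows = [                        # hypothetical values from the 'content' column
    "The city council approved a new budget on Monday.",
    "United won the championship after a 2-1 final.",
    "Heavy rain is expected across the region this weekend.",
]

# Score every (question, row) pair with the Cross-Encoder; higher score = more relevant.
pairs = [(question, row) for row in context_rows]
scores = cross_encoder.predict(pairs)

# Keep the top-k rows in descending score order and join them into one context string.
k = min(5, len(context_rows))
top_indices = scores.argsort()[-k:][::-1]
top_context = "\n".join(context_rows[i] for i in top_indices)

# Extract the answer from the concatenated top rows with the QA pipeline.
result = qa_pipeline(question=question, context=top_context)
print(result["answer"], result["score"])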