Files changed (1)
  1. utils.py +458 -457
utils.py CHANGED
@@ -1,458 +1,459 @@
1
- # ==========================
2
- # Data Handling & Storage
3
- # ==========================
4
- import json
5
- import ast
6
- import pandas as pd
7
- import numpy as np
8
-
9
- # ==========================
10
- # Web Scraping & Data Retrieval
11
- # ==========================
12
- import requests
13
- import httpx
14
- import feedparser
15
- import concurrent.futures
16
- from bs4 import BeautifulSoup
17
- from googlesearch import search
18
- from urllib.parse import urlparse
19
-
20
- # ==========================
21
- # Natural Language Processing (NLP)
22
- # ==========================
23
- import nltk
24
- import spacy
25
- import gensim
26
- from nltk.corpus import stopwords
27
- from nltk.tokenize import word_tokenize
28
- from nltk.stem import WordNetLemmatizer
29
- from gensim.models import LdaModel
30
- from gensim.corpora import Dictionary
31
- from transformers import pipeline
32
- from deep_translator import GoogleTranslator
33
- from gtts import gTTS # Text-to-speech
34
-
35
- # ==========================
36
- # Machine Learning & Text Analysis
37
- # ==========================
38
- from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
39
- from sklearn.metrics.pairwise import cosine_similarity
40
- from sklearn.decomposition import NMF, LatentDirichletAllocation
41
- from sklearn.model_selection import RandomizedSearchCV
42
-
43
- # ==========================
44
- # Data Visualization
45
- # ==========================
46
- import matplotlib.pyplot as plt
47
- import seaborn as sns
48
-
49
- # ==========================
50
- # Utility & Performance Optimization
51
- # ==========================
52
- import re
53
- import os
54
- import io
55
- from collections import Counter
56
- from tqdm import tqdm # progress bar
57
-
58
-
59
- def fetch_news_data(company_name: str, article_number: int):
60
- excluded_domains = ["youtube.com", "en.wikipedia.org", "m.economictimes.com", "www.prnewswire.com", "economictimes.indiatimes.com", "www.moneycontrol.com"]
61
-
62
- def is_valid_news_article(url, company_name):
63
- try:
64
- domain = urlparse(url).netloc # extracts the domain
65
- if company_name.lower() in domain.lower() or any(excluded_domain in domain for excluded_domain in excluded_domains):
66
- return False
67
- return True
68
- except Exception:
69
- return False # handle unexpected errors
70
-
71
- def get_top_articles(company_name, article_number):
72
- query = f"{company_name} latest news article"
73
- valid_urls = []
74
-
75
- for url in search(query, num_results = article_number*2):
76
- if is_valid_news_article(url, company_name):
77
- valid_urls.append(url)
78
- if len(valid_urls) > article_number+1:
79
- break
80
-
81
- return valid_urls
82
-
83
- def extract_article_data(url):
84
- headers = {
85
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
86
- }
87
-
88
- try:
89
- response = requests.get(url, headers=headers)
90
- response.raise_for_status() # handle HTTP errors
91
-
92
- soup = BeautifulSoup(response.content, 'html.parser')
93
-
94
- # extract title
95
- title = soup.title.string.strip() if soup.title else None
96
- source = url.split('/')[2] # Extract domain
97
-
98
- # validate data
99
- if not title:
100
- return None
101
-
102
- return {"title": title, "link": url, "source": source}
103
-
104
- except (requests.exceptions.RequestException, AttributeError):
105
- return None # skip articles with invalid data
106
-
107
- def main(company_name, article_number):
108
- urls = get_top_articles(company_name, article_number)
109
- # extract and validate article data
110
- articles_data = [extract_article_data(url) for url in urls]
111
- articles_data = [article for article in articles_data if article] # remove None values
112
-
113
- # create DataFrame only if valid articles exist
114
- if articles_data:
115
- df = pd.DataFrame(articles_data)
116
- else:
117
- df = pd.DataFrame(columns=["title", "link"]) # empty DataFrame if nothing was found
118
-
119
- return df
120
-
121
- df = main(company_name, article_number+1)
122
- news_df_output = df[["title", "source"]].rename(columns={"title": "Headline", "source": "Source"})
123
- news_df_output["Source"] = news_df_output["Source"].str.replace(r"^www\.", "", regex=True).str.split('.').str[0]
124
-
125
- yield {"news_df_output": news_df_output}
126
-
127
- def get_article_text(url):
128
- try:
129
- headers = {'User-Agent': 'Mozilla/5.0'}
130
- response = requests.get(url, headers=headers)
131
- soup = BeautifulSoup(response.text, "html.parser")
132
-
133
- # remove unwanted elements
134
- for unwanted in soup.select("nav, aside, footer, header, .ad, .advertisement, .promo, .sidebar, .related-articles"):
135
- unwanted.extract()
136
-
137
- # try extracting from known article containers
138
- article_body = soup.find(['article', 'div', 'section'], class_=['article-body', 'post-body', 'entry-content', 'main-content'])
139
-
140
- if article_body:
141
- paragraphs = article_body.find_all('p')
142
- article_text = " ".join([p.get_text() for p in paragraphs]).strip()
143
- return article_text if article_text else None # return None if empty
144
-
145
- # fallback to all <p> tags
146
- paragraphs = soup.find_all('p')
147
- article_text = " ".join([p.get_text() for p in paragraphs]).strip()
148
-
149
- return article_text if article_text else None # return None if empty
150
-
151
- except Exception:
152
- return None # return None in case of an error
153
- df['article_text'] = df['link'].apply(get_article_text)
154
-
155
- df = df.reset_index(drop=True)
156
-
157
- block_patterns = [
158
- # Error messages (with variations)
159
- r'Oops[!,\.]? something went wrong',
160
- r'An error has occurred',
161
- r'This content is not available',
162
- r'Please enable JavaScript to continue',
163
- r'Error loading content',
164
- r'Follow Us',
165
-
166
- # JavaScript patterns
167
- r'var .*?;',
168
- r'alert\(.*?\)',
169
- r'console\.log\(.*?\)',
170
- r'<script.*?</script>',
171
- r'<noscript>.*?</noscript>',
172
- r'<style.*?</style>',
173
-
174
- # Loading or restricted content messages
175
- r'Loading[\.]*',
176
- r'You must be logged in to view this content',
177
- r'This content is restricted',
178
- r'Access denied',
179
- r'Please disable your ad blocker',
180
-
181
- # GDPR and cookie consent banners
182
- r'This site uses cookies',
183
- r'We use cookies to improve your experience',
184
- r'By using this site, you agree to our use of cookies',
185
- r'Accept Cookies',
186
-
187
- # Stories or content teasers with any number
188
- r'\d+\s*Stories',
189
-
190
- # Miscellaneous
191
- r'<iframe.*?</iframe>',
192
- r'<meta.*?>',
193
- r'<link.*?>',
194
- r'Refresh the page and try again',
195
- r'Click here if the page does not load',
196
- r'© [0-9]{4}.*? All rights reserved',
197
- r'Unauthorized access',
198
- r'Terms of Service',
199
- r'Privacy Policy',
200
- r'<.*?>',
201
- ]
202
-
203
- pattern = '|'.join(block_patterns)
204
- df['article_text'] = df['article_text'].str.replace(pattern, '', regex=True).str.strip()
205
- df['article_text'] = df['article_text'].str.replace(r'\s+', ' ', regex=True).str.strip()
206
-
207
- custom_stop_words = set(ENGLISH_STOP_WORDS.union({company_name.lower(), 'company', 'ttm', 'rs'}))
208
-
209
- # add numeric values (integer, decimal, comma-separated, monetary)
210
- numeric_patterns = re.compile(r'\b\d+(?:[\.,]\d+)?(?:,\d+)*\b|\$\d+(?:[\.,]\d+)?')
211
- numeric_matches = set(re.findall(numeric_patterns, ' '.join(df['article_text'])))
212
- custom_stop_words.update(numeric_matches)
213
-
214
- # remove unwanted unicode characters (like \u2018, \u2019, etc.)
215
- unicode_patterns = re.compile(r'[\u2018\u2019\u2020\u2021\u2014]') # Add more if needed
216
- df['article_text'] = df['article_text'].apply(lambda x: unicode_patterns.sub('', x))
217
-
218
- custom_stop_words = list(custom_stop_words)
219
-
220
- summarizer = pipeline("summarization", model="google/long-t5-tglobal-base")
221
-
222
- def generate_summary(text):
223
- try:
224
- if len(text.split()) > 50: # skip very short texts
225
- summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
226
- return summary
227
- else:
228
- return text
229
- except Exception as e:
230
- print(f"Error processing text: {e}")
231
- return None
232
-
233
- # apply summarization to the 'article_text' column
234
- df['summary'] = df['article_text'].apply(generate_summary)
235
-
236
- # load a pre-trained BERT-based sentiment model from Hugging Face
237
- sentiment_pipeline = pipeline("sentiment-analysis")
238
-
239
- def analyze_sentiment(text):
240
- """Analyze sentiment with a confidence-based neutral zone."""
241
- if not text or not text.strip():
242
- return "Neutral"
243
-
244
- try:
245
- result = sentiment_pipeline(text)[0]
246
- sentiment_label = result["label"]
247
- confidence = round(result["score"], 2)
248
-
249
- if confidence < 0.7:
250
- return "Neutral"
251
- return f"{sentiment_label.capitalize()} ({confidence})"
252
- except Exception:
253
- return "Error in sentiment analysis."
254
-
255
- # apply sentiment analysis on the summary column
256
- df['sentiment'] = df['summary'].apply(analyze_sentiment)
257
-
258
- df['sentiment_label'] = df['sentiment'].str.extract(r'(Positive|Negative|Neutral)')
259
-
260
- sentiment_bars = plt.figure(figsize=(7, 7))
261
- sns.countplot(x=df['sentiment_label'], palette={'Positive': 'green', 'Negative': 'red', 'Neutral': 'gray'})
262
- plt.title("Sentiment Analysis of Articles")
263
- plt.xlabel("Sentiment")
264
- plt.ylabel("Count")
265
-
266
- # save the figure as an image file to use in the Gradio interface
267
- sentiment_bars_file = "sentiment_bars.png"
268
- sentiment_bars.savefig(sentiment_bars_file)
269
- plt.close(sentiment_bars)
270
-
271
- sentiment_counts = df['sentiment_label'].value_counts()
272
-
273
- colors = {'Positive': 'green', 'Negative': 'red', 'Neutral': 'gray'}
274
-
275
- sentiment_pie = plt.figure(figsize=(7, 7))
276
- plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', colors=[colors[label] for label in sentiment_counts.index])
277
- plt.title("Sentiment Distribution of Articles")
278
-
279
- sentiment_pie_file = "sentiment_pie.png"
280
- sentiment_pie.savefig(sentiment_pie_file)
281
- plt.close(sentiment_pie)
282
-
283
- df['combined_text'] = df['title'] + ' ' + df['summary'] # combine text for analysis
284
-
285
- vectorizer = TfidfVectorizer(max_features=1000, stop_words=custom_stop_words)
286
- tfidf = vectorizer.fit_transform(df['combined_text'])
287
-
288
- n_topics = 5 # number of topics
289
- nmf = NMF(n_components=n_topics, random_state=42)
290
- W = nmf.fit_transform(tfidf)
291
- H = nmf.components_
292
-
293
- feature_names = vectorizer.get_feature_names_out()
294
- topics = []
295
- for topic_idx, topic in enumerate(H):
296
- top_words = [feature_names[i] for i in topic.argsort()[-5:]][::-1] # 5 words per topic
297
- topics.append(", ".join(top_words))
298
-
299
-
300
- def get_top_topics(row):
301
- topic_indices = W[row].argsort()[-3:][::-1] # get top 3 topics
302
- return [topics[i] for i in topic_indices]
303
-
304
- df['top_topics'] = [get_top_topics(i) for i in range(len(df))]
305
- df['dominant_topic'] = W.argmax(axis=1)
306
- df['topic_distribution'] = W.tolist()
307
- similarity_matrix = cosine_similarity(W)
308
-
309
- df['similarity_scores'] = similarity_matrix.mean(axis=1)
310
- df['most_similar_article'] = similarity_matrix.argsort(axis=1)[:, -2] # second highest value
311
- df['least_similar_article'] = similarity_matrix.argsort(axis=1)[:, 0] # lowest value
312
-
313
- similarity_heatmap = plt.figure(figsize=(10, 8))
314
- sns.heatmap(similarity_matrix, annot=True, fmt=".2f", cmap="coolwarm", xticklabels=False, yticklabels=False)
315
- plt.title("Comparative Analysis of News Coverage Across Articles")
316
-
317
- comparisons = []
318
- for i in range(len(df)):
319
- # find most similar and least similar articles
320
- similar_idx = similarity_matrix[i].argsort()[-2] # most similar (excluding itself)
321
- least_similar_idx = similarity_matrix[i].argsort()[0] # least similar
322
-
323
- # build comparison text
324
- comparison = {
325
- "Most Similar": f"Article {i + 1} focuses on '{topics[df['dominant_topic'][i]]}', similar to Article {similar_idx + 1} which also discusses '{topics[df['dominant_topic'][similar_idx]]}'.",
326
- "Least Similar": f"Article {i + 1} focuses on '{topics[df['dominant_topic'][i]]}', contrasting with Article {least_similar_idx + 1} which discusses '{topics[df['dominant_topic'][least_similar_idx]]}'."
327
- }
328
- comparisons.append(comparison)
329
-
330
- df['coverage_comparison'] = comparisons
331
- # find common and unique topics
332
- all_topics = df['dominant_topic'].tolist()
333
- topic_counter = Counter(all_topics)
334
- common_topics = [topics[i] for i, count in topic_counter.items() if count > 1]
335
- unique_topics = [topics[i] for i, count in topic_counter.items() if count == 1]
336
-
337
- topic_overlap = {
338
- "Common Topics": common_topics,
339
- "Unique Topics": unique_topics
340
- }
341
- sentiment_counts = df['sentiment_label'].value_counts()
342
- if sentiment_counts.get('Positive', 0) > sentiment_counts.get('Negative', 0):
343
- sentiment = "Overall sentiment is positive."
344
- elif sentiment_counts.get('Negative', 0) > sentiment_counts.get('Positive', 0):
345
- sentiment = "Overall sentiment is negative."
346
- else:
347
- sentiment = "Overall sentiment is mixed."
348
-
349
- def extract_relevant_topics(topics):
350
- if isinstance(topics, str):
351
- topics = ast.literal_eval(topics) # convert string to list if needed
352
-
353
- if len(topics) <= 2:
354
- return topics
355
-
356
- vectorizer = TfidfVectorizer()
357
- tfidf_matrix = vectorizer.fit_transform(topics)
358
- similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
359
-
360
- # sum similarity scores for each topic
361
- topic_scores = similarity_matrix.sum(axis=1)
362
-
363
- # get top 2 highest scoring topics
364
- top_indices = topic_scores.argsort()[-2:][::-1]
365
- top_topics = [topics[i] for i in top_indices]
366
-
367
- return top_topics
368
-
369
-
370
- # ensure 'top_topics' is a list
371
- df['top_topics'] = df['top_topics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
372
-
373
- # convert lists to sets for easy comparison
374
- df['top_topics_set'] = df['top_topics'].apply(lambda x: set(x) if isinstance(x, list) else set())
375
-
376
- # find common topics across all articles
377
- if len(df) > 1:
378
- common_topics = set.intersection(*df['top_topics_set'])
379
- else:
380
- common_topics = set() # no common topics if only one article
381
-
382
- # extract unique topics by removing common ones
383
- df['unique_topics'] = df['top_topics_set'].apply(lambda x: list(x - common_topics) if x else [])
384
-
385
- # drop the temporary 'top_topics_set' column
386
- df.drop(columns=['top_topics_set'], inplace=True)
387
-
388
-
389
- coverage_differences = []
390
- for _, row in df.iterrows():
391
- if row['most_similar_article'] in df.index and row['least_similar_article'] in df.index:
392
- most_similar = df.loc[row['most_similar_article']]
393
- least_similar = df.loc[row['least_similar_article']]
394
-
395
- # extract most relevant topics
396
- most_relevant_topics = extract_relevant_topics(row['top_topics'])
397
- least_relevant_topics = extract_relevant_topics(least_similar['top_topics'])
398
-
399
- if most_relevant_topics and least_relevant_topics:
400
- comparison = {
401
- "Comparison": f"{row['title']} highlights {', '.join(row['top_topics'])}, while {most_similar['title']} discusses {', '.join(most_similar['top_topics'])}.",
402
- "Impact": f"The article emphasizes {most_relevant_topics[0]} and {most_relevant_topics[1]}, contrasting with {least_relevant_topics[0]} and {least_relevant_topics[1]} in the least similar article."
403
- }
404
- coverage_differences.append(comparison)
405
- structured_summary = {
406
- "Company": company_name,
407
- "Articles": [
408
- {
409
- "Title": row['title'],
410
- "Summary": row['summary'],
411
- "Sentiment": row['sentiment'],
412
- "Topics": row['top_topics'],
413
- "Unique Topics": row['unique_topics']
414
- }
415
- for _, row in df.iterrows()
416
- ],
417
- "Comparative Sentiment Score": {
418
- "Sentiment Distribution": df['sentiment'].value_counts().to_dict(),
419
- },
420
- "Topic Overlap": {
421
- "Common Topics": list(common_topics) if common_topics else ["No common topics found"],
422
- "Unique Topics": [
423
- {"Title": row['title'], "Unique Topics": row['unique_topics']}
424
- for _, row in df.iterrows()
425
- ]
426
- },
427
- "Final Sentiment Analysis": f"{company_name}’s latest news coverage is mostly {df['sentiment'].mode()[0].lower()}. Potential market impact expected."
428
- }
429
-
430
- yield {"json_summary": structured_summary}
431
- english_news = [f"Name of Company: {company_name}"]
432
-
433
- for i, row in df.iterrows():
434
- article_entry = f"Article {i + 1}: "
435
- article_entry += f"{row['title']}; "
436
- article_entry += f"Summary: {row['summary']} This article has a {row['sentiment_label'].lower()} sentiment."
437
- english_news.append(article_entry)
438
- yield {"english_news_list": english_news}
439
- translator = GoogleTranslator(source='en', target='hi') # 'hi' = Hindi
440
-
441
- translated_news = []
442
- for text in tqdm(english_news, desc="Translating"):
443
- translated_news.append(translator.translate(text))
444
- yield {"hindi_news_list": translated_news}
445
- hindi_news = '; '.join(translated_news)
446
- # yield {"hindi_news_text": hindi_news}
447
- def text_to_speech(text, language='hi'):
448
- tts = gTTS(text=text, lang=language, slow=False)
449
- filename = "hindi_news.mp3" # output file name
450
- tts.save(filename)
451
- return filename
452
- print(df)
453
- news_audio = text_to_speech(hindi_news)
454
- yield {"hindi_news_audio": news_audio}
455
-
456
- yield {"bar_chart": sentiment_bars_file}
457
-
 
458
  yield {"pie_chart": sentiment_pie_file}
 
1
+ # ==========================
2
+ # Data Handling & Storage
3
+ # ==========================
4
+ import json
5
+ import ast
6
+ import pandas as pd
7
+ import numpy as np
8
+
9
+ # ==========================
10
+ # Web Scraping & Data Retrieval
11
+ # ==========================
12
+ import requests
13
+ import httpx
14
+ import feedparser
15
+ import concurrent.futures
16
+ from bs4 import BeautifulSoup
17
+ from googlesearch import search
18
+ from urllib.parse import urlparse
19
+
20
+ # ==========================
21
+ # Natural Language Processing (NLP)
22
+ # ==========================
23
+ import nltk
24
+ import spacy
25
+ import gensim
26
+ from nltk.corpus import stopwords
27
+ from nltk.tokenize import word_tokenize
28
+ from nltk.stem import WordNetLemmatizer
29
+ from gensim.models import LdaModel
30
+ from gensim.corpora import Dictionary
31
+ from transformers import pipeline
32
+ from deep_translator import GoogleTranslator
33
+ from gtts import gTTS # Text-to-speech
34
+
35
+ # ==========================
36
+ # Machine Learning & Text Analysis
37
+ # ==========================
38
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
39
+ from sklearn.metrics.pairwise import cosine_similarity
40
+ from sklearn.decomposition import NMF, LatentDirichletAllocation
41
+ from sklearn.model_selection import RandomizedSearchCV
42
+
43
+ # ==========================
44
+ # Data Visualization
45
+ # ==========================
46
+ import matplotlib.pyplot as plt
47
+ import seaborn as sns
48
+
49
+ # ==========================
50
+ # Utility & Performance Optimization
51
+ # ==========================
52
+ import re
53
+ import os
54
+ import io
55
+ from collections import Counter
56
+ from tqdm import tqdm # progress bar
57
+
58
+
59
+ def fetch_news_data(company_name: str, article_number: int):
60
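+ """Search, scrape, and analyze recent news coverage for a company.
+ Generator that yields intermediate results as dicts: the headline table, a structured
+ JSON summary, English and Hindi summary lists, Hindi audio, and sentiment charts."""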
+ excluded_domains = ["youtube.com", "en.wikipedia.org", "m.economictimes.com", "www.prnewswire.com", "economictimes.indiatimes.com", "www.moneycontrol.com"]
61
+
62
+ def is_valid_news_article(url, company_name):
63
+ try:
64
+ domain = urlparse(url).netloc # extracts the domain
65
+ if company_name.lower() in domain.lower() or any(excluded_domain in domain for excluded_domain in excluded_domains):
66
+ return False
67
+ return True
68
+ except Exception:
69
+ return False # handle unexpected errors
70
+
71
+ def get_top_articles(company_name, article_number):
72
+ query = f"{company_name} latest news article"
73
+ valid_urls = []
74
+
75
+ for url in search(query, num_results = article_number*2):
76
+ if is_valid_news_article(url, company_name):
77
+ valid_urls.append(url)
78
+ if len(valid_urls) > article_number+1:
79
+ break
80
+
81
+ return valid_urls
82
+
83
+ def extract_article_data(url):
84
+ headers = {
85
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
86
+ }
87
+
88
+ try:
89
+ response = requests.get(url, headers=headers)
90
+ response.raise_for_status() # handle HTTP errors
91
+
92
+ soup = BeautifulSoup(response.content, 'html.parser')
93
+
94
+ # extract title
95
+ title = soup.title.string.strip() if soup.title else None
96
+ source = url.split('/')[2] # Extract domain
97
+
98
+ # validate data
99
+ if not title:
100
+ return None
101
+
102
+ return {"title": title, "link": url, "source": source}
103
+
104
+ except (requests.exceptions.RequestException, AttributeError):
105
+ return None # skip articles with invalid data
106
+
107
+ def main(company_name, article_number):
108
+ urls = get_top_articles(company_name, article_number)
109
+ # extract and validate article data
110
+ articles_data = [extract_article_data(url) for url in urls]
111
+ articles_data = [article for article in articles_data if article] # remove None values
112
+
113
+ # create DataFrame only if valid articles exist
114
+ if articles_data:
115
+ df = pd.DataFrame(articles_data)
116
+ else:
117
+ df = pd.DataFrame(columns=["title", "link"]) # empty DataFrame if nothing was found
118
+
119
+ return df
120
+
121
+ df = main(company_name, article_number+1)
122
+ news_df_output = df[["title", "source"]].rename(columns={"title": "Headline", "source": "Source"})
123
+ news_df_output["Source"] = news_df_output["Source"].str.replace(r"^www\.", "", regex=True).str.split('.').str[0]
124
+
125
+ yield {"news_df_output": news_df_output}
126
+
127
+ def get_article_text(url):
128
+ try:
129
+ headers = {'User-Agent': 'Mozilla/5.0'}
130
+ response = requests.get(url, headers=headers)
131
+ soup = BeautifulSoup(response.text, "html.parser")
132
+
133
+ # remove unwanted elements
134
+ for unwanted in soup.select("nav, aside, footer, header, .ad, .advertisement, .promo, .sidebar, .related-articles"):
135
+ unwanted.extract()
136
+
137
+ # try extracting from known article containers
138
+ article_body = soup.find(['article', 'div', 'section'], class_=['article-body', 'post-body', 'entry-content', 'main-content'])
139
+
140
+ if article_body:
141
+ paragraphs = article_body.find_all('p')
142
+ article_text = " ".join([p.get_text() for p in paragraphs]).strip()
143
+ return article_text if article_text else None # return None if empty
144
+
145
+ # fallback to all <p> tags
146
+ paragraphs = soup.find_all('p')
147
+ article_text = " ".join([p.get_text() for p in paragraphs]).strip()
148
+
149
+ return article_text if article_text else None # return None if empty
150
+
151
+ except Exception:
152
+ return None # return None in case of an error
153
+ df['article_text'] = df['link'].apply(get_article_text)
154
+
155
+ df = df.reset_index(drop=True)
156
+
157
+ block_patterns = [
158
+ # Error messages (with variations)
159
+ r'Oops[!,\.]? something went wrong',
160
+ r'An error has occurred',
161
+ r'This content is not available',
162
+ r'Please enable JavaScript to continue',
163
+ r'Error loading content',
164
+ r'Follow Us',
165
+
166
+ # JavaScript patterns
167
+ r'var .*?;',
168
+ r'alert\(.*?\)',
169
+ r'console\.log\(.*?\)',
170
+ r'<script.*?</script>',
171
+ r'<noscript>.*?</noscript>',
172
+ r'<style.*?</style>',
173
+
174
+ # Loading or restricted content messages
175
+ r'Loading[\.]*',
176
+ r'You must be logged in to view this content',
177
+ r'This content is restricted',
178
+ r'Access denied',
179
+ r'Please disable your ad blocker',
180
+
181
+ # GDPR and cookie consent banners
182
+ r'This site uses cookies',
183
+ r'We use cookies to improve your experience',
184
+ r'By using this site, you agree to our use of cookies',
185
+ r'Accept Cookies',
186
+
187
+ # Stories or content teasers with any number
188
+ r'\d+\s*Stories',
189
+
190
+ # Miscellaneous
191
+ r'<iframe.*?</iframe>',
192
+ r'<meta.*?>',
193
+ r'<link.*?>',
194
+ r'Refresh the page and try again',
195
+ r'Click here if the page does not load',
196
+ r'© [0-9]{4}.*? All rights reserved',
197
+ r'Unauthorized access',
198
+ r'Terms of Service',
199
+ r'Privacy Policy',
200
+ r'<.*?>',
201
+ ]
202
+
203
+ pattern = '|'.join(block_patterns)
204
+ df['article_text'] = df['article_text'].str.replace(pattern, '', regex=True).str.strip()
205
+ df['article_text'] = df['article_text'].str.replace(r'\s+', ' ', regex=True).str.strip()
206
+
207
+ custom_stop_words = set(ENGLISH_STOP_WORDS.union({company_name.lower(), 'company', 'ttm', 'rs'}))
208
+
209
+ # add numeric values (integer, decimal, comma-separated, monetary)
210
+ numeric_patterns = re.compile(r'\b\d+(?:[\.,]\d+)?(?:,\d+)*\b|\$\d+(?:[\.,]\d+)?')
211
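+ # replace missing article_text values with empty strings so the join below never sees None/NaN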
+ clean_text = ' '.join(df['article_text'].fillna('').astype(str))
212
+ numeric_matches = set(re.findall(numeric_patterns, clean_text))
213
+ custom_stop_words.update(numeric_matches)
214
+
215
+ # remove unwanted unicode characters (like \u2018, \u2019, etc.)
216
+ unicode_patterns = re.compile(r'[\u2018\u2019\u2020\u2021\u2014]') # Add more if needed
217
+ df['article_text'] = df['article_text'].apply(lambda x: unicode_patterns.sub('', x))
218
+
219
+ custom_stop_words = list(custom_stop_words)
220
+
221
+ summarizer = pipeline("summarization", model="google/long-t5-tglobal-base")
222
+
223
+ def generate_summary(text):
224
+ try:
225
+ if len(text.split()) > 50: # skip very short texts
226
+ summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
227
+ return summary
228
+ else:
229
+ return text
230
+ except Exception as e:
231
+ print(f"Error processing text: {e}")
232
+ return None
233
+
234
+ # apply summarization to the 'article_text' column
235
+ df['summary'] = df['article_text'].apply(generate_summary)
236
+
237
+ # load a pre-trained BERT-based sentiment model from Hugging Face
238
+ sentiment_pipeline = pipeline("sentiment-analysis")
239
+
240
+ def analyze_sentiment(text):
241
+ """Analyze sentiment with a confidence-based neutral zone."""
242
+ if not text or not text.strip():
243
+ return "Neutral"
244
+
245
+ try:
246
+ result = sentiment_pipeline(text)[0]
247
+ sentiment_label = result["label"]
248
+ confidence = round(result["score"], 2)
249
+
250
+ if confidence < 0.7:
251
+ return "Neutral"
252
+ return f"{sentiment_label.capitalize()} ({confidence})"
253
+ except Exception:
254
+ return "Error in sentiment analysis."
255
+
256
+ # apply sentiment analysis on the summary column
257
+ df['sentiment'] = df['summary'].apply(analyze_sentiment)
258
+
259
+ df['sentiment_label'] = df['sentiment'].str.extract(r'(Positive|Negative|Neutral)')
260
+
261
+ sentiment_bars = plt.figure(figsize=(7, 7))
262
+ sns.countplot(x=df['sentiment_label'], palette={'Positive': 'green', 'Negative': 'red', 'Neutral': 'gray'})
263
+ plt.title("Sentiment Analysis of Articles")
264
+ plt.xlabel("Sentiment")
265
+ plt.ylabel("Count")
266
+
267
+ # save the figure as an image file to use in the Gradio interface
268
+ sentiment_bars_file = "sentiment_bars.png"
269
+ sentiment_bars.savefig(sentiment_bars_file)
270
+ plt.close(sentiment_bars)
271
+
272
+ sentiment_counts = df['sentiment_label'].value_counts()
273
+
274
+ colors = {'Positive': 'green', 'Negative': 'red', 'Neutral': 'gray'}
275
+
276
+ sentiment_pie = plt.figure(figsize=(7, 7))
277
+ plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', colors=[colors[label] for label in sentiment_counts.index])
278
+ plt.title("Sentiment Distribution of Articles")
279
+
280
+ sentiment_pie_file = "sentiment_pie.png"
281
+ sentiment_pie.savefig(sentiment_pie_file)
282
+ plt.close(sentiment_pie)
283
+
284
+ df['combined_text'] = df['title'] + ' ' + df['summary'] # combine text for analysis
285
+
286
+ vectorizer = TfidfVectorizer(max_features=1000, stop_words=custom_stop_words)
287
+ tfidf = vectorizer.fit_transform(df['combined_text'])
288
+
289
+ n_topics = 5 # number of topics
290
+ nmf = NMF(n_components=n_topics, random_state=42)
291
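+ # W: article-by-topic weight matrix, H: topic-by-term weight matrix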
+ W = nmf.fit_transform(tfidf)
292
+ H = nmf.components_
293
+
294
+ feature_names = vectorizer.get_feature_names_out()
295
+ topics = []
296
+ for topic_idx, topic in enumerate(H):
297
+ top_words = [feature_names[i] for i in topic.argsort()[-5:]][::-1] # 5 words per topic
298
+ topics.append(", ".join(top_words))
299
+
300
+
301
+ def get_top_topics(row):
302
+ topic_indices = W[row].argsort()[-3:][::-1] # get top 3 topics
303
+ return [topics[i] for i in topic_indices]
304
+
305
+ df['top_topics'] = [get_top_topics(i) for i in range(len(df))]
306
+ df['dominant_topic'] = W.argmax(axis=1)
307
+ df['topic_distribution'] = W.tolist()
308
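+ # pairwise cosine similarity between articles in topic space (n_articles x n_articles)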
+ similarity_matrix = cosine_similarity(W)
309
+
310
+ df['similarity_scores'] = similarity_matrix.mean(axis=1)
311
+ df['most_similar_article'] = similarity_matrix.argsort(axis=1)[:, -2] # second highest value
312
+ df['least_similar_article'] = similarity_matrix.argsort(axis=1)[:, 0] # lowest value
313
+
314
+ similarity_heatmap = plt.figure(figsize=(10, 8))
315
+ sns.heatmap(similarity_matrix, annot=True, fmt=".2f", cmap="coolwarm", xticklabels=False, yticklabels=False)
316
+ plt.title("Comparative Analysis of News Coverage Across Articles")
317
+
318
+ comparisons = []
319
+ for i in range(len(df)):
320
+ # find most similar and least similar articles
321
+ similar_idx = similarity_matrix[i].argsort()[-2] # most similar (excluding itself)
322
+ least_similar_idx = similarity_matrix[i].argsort()[0] # least similar
323
+
324
+ # build comparison text
325
+ comparison = {
326
+ "Most Similar": f"Article {i + 1} focuses on '{topics[df['dominant_topic'][i]]}', similar to Article {similar_idx + 1} which also discusses '{topics[df['dominant_topic'][similar_idx]]}'.",
327
+ "Least Similar": f"Article {i + 1} focuses on '{topics[df['dominant_topic'][i]]}', contrasting with Article {least_similar_idx + 1} which discusses '{topics[df['dominant_topic'][least_similar_idx]]}'."
328
+ }
329
+ comparisons.append(comparison)
330
+
331
+ df['coverage_comparison'] = comparisons
332
+ # find common and unique topics
333
+ all_topics = df['dominant_topic'].tolist()
334
+ topic_counter = Counter(all_topics)
335
+ common_topics = [topics[i] for i, count in topic_counter.items() if count > 1]
336
+ unique_topics = [topics[i] for i, count in topic_counter.items() if count == 1]
337
+
338
+ topic_overlap = {
339
+ "Common Topics": common_topics,
340
+ "Unique Topics": unique_topics
341
+ }
342
+ sentiment_counts = df['sentiment_label'].value_counts()
343
+ if sentiment_counts.get('Positive', 0) > sentiment_counts.get('Negative', 0):
344
+ sentiment = "Overall sentiment is positive."
345
+ elif sentiment_counts.get('Negative', 0) > sentiment_counts.get('Positive', 0):
346
+ sentiment = "Overall sentiment is negative."
347
+ else:
348
+ sentiment = "Overall sentiment is mixed."
349
+
350
+ def extract_relevant_topics(topics):
351
+ if isinstance(topics, str):
352
+ topics = ast.literal_eval(topics) # convert string to list if needed
353
+
354
+ if len(topics) <= 2:
355
+ return topics
356
+
357
+ vectorizer = TfidfVectorizer()
358
+ tfidf_matrix = vectorizer.fit_transform(topics)
359
+ similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
360
+
361
+ # sum similarity scores for each topic
362
+ topic_scores = similarity_matrix.sum(axis=1)
363
+
364
+ # get top 2 highest scoring topics
365
+ top_indices = topic_scores.argsort()[-2:][::-1]
366
+ top_topics = [topics[i] for i in top_indices]
367
+
368
+ return top_topics
369
+
370
+
371
+ # ensure 'top_topics' is a list
372
+ df['top_topics'] = df['top_topics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
373
+
374
+ # convert lists to sets for easy comparison
375
+ df['top_topics_set'] = df['top_topics'].apply(lambda x: set(x) if isinstance(x, list) else set())
376
+
377
+ # find common topics across all articles
378
+ if len(df) > 1:
379
+ common_topics = set.intersection(*df['top_topics_set'])
380
+ else:
381
+ common_topics = set() # no common topics if only one article
382
+
383
+ # extract unique topics by removing common ones
384
+ df['unique_topics'] = df['top_topics_set'].apply(lambda x: list(x - common_topics) if x else [])
385
+
386
+ # drop the temporary 'top_topics_set' column
387
+ df.drop(columns=['top_topics_set'], inplace=True)
388
+
389
+
390
+ coverage_differences = []
391
+ for _, row in df.iterrows():
392
+ if row['most_similar_article'] in df.index and row['least_similar_article'] in df.index:
393
+ most_similar = df.loc[row['most_similar_article']]
394
+ least_similar = df.loc[row['least_similar_article']]
395
+
396
+ # extract most relevant topics
397
+ most_relevant_topics = extract_relevant_topics(row['top_topics'])
398
+ least_relevant_topics = extract_relevant_topics(least_similar['top_topics'])
399
+
400
+ if most_relevant_topics and least_relevant_topics:
401
+ comparison = {
402
+ "Comparison": f"{row['title']} highlights {', '.join(row['top_topics'])}, while {most_similar['title']} discusses {', '.join(most_similar['top_topics'])}.",
403
+ "Impact": f"The article emphasizes {most_relevant_topics[0]} and {most_relevant_topics[1]}, contrasting with {least_relevant_topics[0]} and {least_relevant_topics[1]} in the least similar article."
404
+ }
405
+ coverage_differences.append(comparison)
406
+ structured_summary = {
407
+ "Company": company_name,
408
+ "Articles": [
409
+ {
410
+ "Title": row['title'],
411
+ "Summary": row['summary'],
412
+ "Sentiment": row['sentiment'],
413
+ "Topics": row['top_topics'],
414
+ "Unique Topics": row['unique_topics']
415
+ }
416
+ for _, row in df.iterrows()
417
+ ],
418
+ "Comparative Sentiment Score": {
419
+ "Sentiment Distribution": df['sentiment'].value_counts().to_dict(),
420
+ },
421
+ "Topic Overlap": {
422
+ "Common Topics": list(common_topics) if common_topics else ["No common topics found"],
423
+ "Unique Topics": [
424
+ {"Title": row['title'], "Unique Topics": row['unique_topics']}
425
+ for _, row in df.iterrows()
426
+ ]
427
+ },
428
+ "Final Sentiment Analysis": f"{company_name}’s latest news coverage is mostly {df['sentiment'].mode()[0].lower()}. Potential market impact expected."
429
+ }
430
+
431
+ yield {"json_summary": structured_summary}
432
+ english_news = [f"Name of Company: {company_name}"]
433
+
434
+ for i, row in df.iterrows():
435
+ article_entry = f"Article {i + 1}: "
436
+ article_entry += f"{row['title']}; "
437
+ article_entry += f"Summary: {row['summary']} This article has a {row['sentiment_label'].lower()} sentiment."
438
+ english_news.append(article_entry)
439
+ yield {"english_news_list": english_news}
440
+ translator = GoogleTranslator(source='en', target='hi') # 'hi' = Hindi
441
+
442
+ translated_news = []
443
+ for text in tqdm(english_news, desc="Translating"):
444
+ translated_news.append(translator.translate(text))
445
+ yield {"hindi_news_list": translated_news}
446
+ hindi_news = '; '.join(translated_news)
447
+ # yield {"hindi_news_text": hindi_news}
448
+ def text_to_speech(text, language='hi'):
449
+ tts = gTTS(text=text, lang=language, slow=False)
450
+ filename = "hindi_news.mp3" # output file name
451
+ tts.save(filename)
452
+ return filename
453
+ print(df)
454
+ news_audio = text_to_speech(hindi_news)
455
+ yield {"hindi_news_audio": news_audio}
456
+
457
+ yield {"bar_chart": sentiment_bars_file}
458
+
459
  yield {"pie_chart": sentiment_pie_file}