Files changed (1)
  1. utils.py +458 -457
utils.py CHANGED
@@ -1,458 +1,459 @@
1
- # ==========================
2
- # Data Handling & Storage
3
- # ==========================
4
- import json
5
- import ast
6
- import pandas as pd
7
- import numpy as np
8
-
9
- # ==========================
10
- # Web Scraping & Data Retrieval
11
- # ==========================
12
- import requests
13
- import httpx
14
- import feedparser
15
- import concurrent.futures
16
- from bs4 import BeautifulSoup
17
- from googlesearch import search
18
- from urllib.parse import urlparse
19
-
20
- # ==========================
21
- # Natural Language Processing (NLP)
22
- # ==========================
23
- import nltk
24
- import spacy
25
- import gensim
26
- from nltk.corpus import stopwords
27
- from nltk.tokenize import word_tokenize
28
- from nltk.stem import WordNetLemmatizer
29
- from gensim.models import LdaModel
30
- from gensim.corpora import Dictionary
31
- from transformers import pipeline
32
- from deep_translator import GoogleTranslator
33
- from gtts import gTTS # Text-to-speech
34
-
35
- # ==========================
36
- # Machine Learning & Text Analysis
37
- # ==========================
38
- from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
39
- from sklearn.metrics.pairwise import cosine_similarity
40
- from sklearn.decomposition import NMF, LatentDirichletAllocation
41
- from sklearn.model_selection import RandomizedSearchCV
42
-
43
- # ==========================
44
- # Data Visualization
45
- # ==========================
46
- import matplotlib.pyplot as plt
47
- import seaborn as sns
48
-
49
- # ==========================
50
- # Utility & Performance Optimization
51
- # ==========================
52
- import re
53
- import os
54
- import io
55
- from collections import Counter
56
- from tqdm import tqdm # progress bar
57
-
58
-
59
- def fetch_news_data(company_name: str, article_number: int):
60
- excluded_domains = ["youtube.com", "en.wikipedia.org", "m.economictimes.com", "www.prnewswire.com", "economictimes.indiatimes.com", "www.moneycontrol.com"]
61
-
62
- def is_valid_news_article(url, company_name):
63
- try:
64
- domain = urlparse(url).netloc # extracts the domain
65
- if company_name.lower() in domain.lower() or any(excluded_domain in domain for excluded_domain in excluded_domains):
66
- return False
67
- return True
68
- except Exception:
69
- return False # handle unexpected errors
70
-
71
- def get_top_articles(company_name, article_number):
72
- query = f"{company_name} latest news article"
73
- valid_urls = []
74
-
75
- for url in search(query, num_results = article_number*2):
76
- if is_valid_news_article(url, company_name):
77
- valid_urls.append(url)
78
- if len(valid_urls) > article_number+1:
79
- break
80
-
81
- return valid_urls
82
-
83
- def extract_article_data(url):
84
- headers = {
85
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
86
- }
87
-
88
- try:
89
- response = requests.get(url, headers=headers)
90
- response.raise_for_status() # handle HTTP errors
91
-
92
- soup = BeautifulSoup(response.content, 'html.parser')
93
-
94
- # extract title
95
- title = soup.title.string.strip() if soup.title else None
96
- source = url.split('/')[2] # Extract domain
97
-
98
- # validate data
99
- if not title:
100
- return None
101
-
102
- return {"title": title, "link": url, "source": source}
103
-
104
- except (requests.exceptions.RequestException, AttributeError):
105
- return None # skip articles with invalid data
106
-
107
- def main(company_name, article_number):
108
- urls = get_top_articles(company_name, article_number)
109
- # extract and validate article data
110
- articles_data = [extract_article_data(url) for url in urls]
111
- articles_data = [article for article in articles_data if article] # remove None values
112
-
113
- # create DataFrame only if valid articles exist
114
- if articles_data:
115
- df = pd.DataFrame(articles_data)
116
- else:
117
- df = pd.DataFrame(columns=["title", "link"]) # empty DataFrame if nothing was found
118
-
119
- return df
120
-
121
- df = main(company_name, article_number+1)
122
- news_df_output = df[["title", "source"]].rename(columns={"title": "Headline", "source": "Source"})
123
- news_df_output["Source"] = news_df_output["Source"].str.replace(r"^www\.", "", regex=True).str.split('.').str[0]
124
-
125
- yield {"news_df_output": news_df_output}
126
-
127
- def get_article_text(url):
128
- try:
129
- headers = {'User-Agent': 'Mozilla/5.0'}
130
- response = requests.get(url, headers=headers)
131
- soup = BeautifulSoup(response.text, "html.parser")
132
-
133
- # remove unwanted elements
134
- for unwanted in soup.select("nav, aside, footer, header, .ad, .advertisement, .promo, .sidebar, .related-articles"):
135
- unwanted.extract()
136
-
137
- # try extracting from known article containers
138
- article_body = soup.find(['article', 'div', 'section'], class_=['article-body', 'post-body', 'entry-content', 'main-content'])
139
-
140
- if article_body:
141
- paragraphs = article_body.find_all('p')
142
- article_text = " ".join([p.get_text() for p in paragraphs]).strip()
143
- return article_text if article_text else None # return None if empty
144
-
145
- # fallback to all <p> tags
146
- paragraphs = soup.find_all('p')
147
- article_text = " ".join([p.get_text() for p in paragraphs]).strip()
148
-
149
- return article_text if article_text else None # return None if empty
150
-
151
- except Exception:
152
- return None # return None in case of an error
153
- df['article_text'] = df['link'].apply(get_article_text)
154
-
155
- df = df.reset_index(drop=True)
156
-
157
- block_patterns = [
158
- # Error messages (with variations)
159
- r'Oops[!,\.]? something went wrong',
160
- r'An error has occurred',
161
- r'This content is not available',
162
- r'Please enable JavaScript to continue',
163
- r'Error loading content',
164
- r'Follow Us',
165
-
166
- # JavaScript patterns
167
- r'var .*?;',
168
- r'alert\(.*?\)',
169
- r'console\.log\(.*?\)',
170
- r'<script.*?</script>',
171
- r'<noscript>.*?</noscript>',
172
- r'<style.*?</style>',
173
-
174
- # Loading or restricted content messages
175
- r'Loading[\.]*',
176
- r'You must be logged in to view this content',
177
- r'This content is restricted',
178
- r'Access denied',
179
- r'Please disable your ad blocker',
180
-
181
- # GDPR and cookie consent banners
182
- r'This site uses cookies',
183
- r'We use cookies to improve your experience',
184
- r'By using this site, you agree to our use of cookies',
185
- r'Accept Cookies',
186
-
187
- # Stories or content teasers with any number
188
- r'\d+\s*Stories',
189
-
190
- # Miscellaneous
191
- r'<iframe.*?</iframe>',
192
- r'<meta.*?>',
193
- r'<link.*?>',
194
- r'Refresh the page and try again',
195
- r'Click here if the page does not load',
196
- r'© [0-9]{4}.*? All rights reserved',
197
- r'Unauthorized access',
198
- r'Terms of Service',
199
- r'Privacy Policy',
200
- r'<.*?>',
201
- ]
202
-
203
- pattern = '|'.join(block_patterns)
204
- df['article_text'] = df['article_text'].str.replace(pattern, '', regex=True).str.strip()
205
- df['article_text'] = df['article_text'].str.replace(r'\s+', ' ', regex=True).str.strip()
206
-
207
- custom_stop_words = set(ENGLISH_STOP_WORDS.union({company_name.lower(), 'company', 'ttm', 'rs'}))
208
-
209
- # add numeric values (integer, decimal, comma-separated, monetary)
210
- numeric_patterns = re.compile(r'\b\d+(?:[\.,]\d+)?(?:,\d+)*\b|\$\d+(?:[\.,]\d+)?')
211
- numeric_matches = set(re.findall(numeric_patterns, ' '.join(df['article_text'])))
212
- custom_stop_words.update(numeric_matches)
213
-
214
- # remove unwanted unicode characters (like \u2018, \u2019, etc.)
215
- unicode_patterns = re.compile(r'[\u2018\u2019\u2020\u2021\u2014]') # Add more if needed
216
- df['article_text'] = df['article_text'].apply(lambda x: unicode_patterns.sub('', x))
217
-
218
- custom_stop_words = list(custom_stop_words)
219
-
220
- summarizer = pipeline("summarization", model="google/long-t5-tglobal-base")
221
-
222
- def generate_summary(text):
223
- try:
224
- if len(text.split()) > 50: # skip very short texts
225
- summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
226
- return summary
227
- else:
228
- return text
229
- except Exception as e:
230
- print(f"Error processing text: {e}")
231
- return None
232
-
233
- # apply summarization to the 'article_text' column
234
- df['summary'] = df['article_text'].apply(generate_summary)
235
-
236
- # load a pre-trained BERT-based sentiment model from Hugging Face
237
- sentiment_pipeline = pipeline("sentiment-analysis")
238
-
239
- def analyze_sentiment(text):
240
- """Analyze sentiment with a confidence-based neutral zone."""
241
- if not text or not text.strip():
242
- return "Neutral"
243
-
244
- try:
245
- result = sentiment_pipeline(text)[0]
246
- sentiment_label = result["label"]
247
- confidence = round(result["score"], 2)
248
-
249
- if confidence < 0.7:
250
- return "Neutral"
251
- return f"{sentiment_label.capitalize()} ({confidence})"
252
- except Exception:
253
- return "Error in sentiment analysis."
254
-
255
- # apply sentiment analysis on the summary column
256
- df['sentiment'] = df['summary'].apply(analyze_sentiment)
257
-
258
- df['sentiment_label'] = df['sentiment'].str.extract(r'(Positive|Negative|Neutral)')
259
-
260
- sentiment_bars = plt.figure(figsize=(7, 7))
261
- sns.countplot(x=df['sentiment_label'], palette={'Positive': 'green', 'Negative': 'red', 'Neutral': 'gray'})
262
- plt.title("Sentiment Analysis of Articles")
263
- plt.xlabel("Sentiment")
264
- plt.ylabel("Count")
265
-
266
- # save the figure as an image file to use in the Gradio interface
267
- sentiment_bars_file = "sentiment_bars.png"
268
- sentiment_bars.savefig(sentiment_bars_file)
269
- plt.close(sentiment_bars)
270
-
271
- sentiment_counts = df['sentiment_label'].value_counts()
272
-
273
- colors = {'Positive': 'green', 'Negative': 'red', 'Neutral': 'gray'}
274
-
275
- sentiment_pie = plt.figure(figsize=(7, 7))
276
- plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', colors=[colors[label] for label in sentiment_counts.index])
277
- plt.title("Sentiment Distribution of Articles")
278
-
279
- sentiment_pie_file = "sentiment_pie.png"
280
- sentiment_pie.savefig(sentiment_pie_file)
281
- plt.close(sentiment_pie)
282
-
283
- df['combined_text'] = df['title'] + ' ' + df['summary'] # combine text for analysis
284
-
285
- vectorizer = TfidfVectorizer(max_features=1000, stop_words=custom_stop_words)
286
- tfidf = vectorizer.fit_transform(df['combined_text'])
287
-
288
- n_topics = 5 # number of topics
289
- nmf = NMF(n_components=n_topics, random_state=42)
290
- W = nmf.fit_transform(tfidf)
291
- H = nmf.components_
292
-
293
- feature_names = vectorizer.get_feature_names_out()
294
- topics = []
295
- for topic_idx, topic in enumerate(H):
296
- top_words = [feature_names[i] for i in topic.argsort()[-5:]][::-1] # 5 words per topic
297
- topics.append(", ".join(top_words))
298
-
299
-
300
- def get_top_topics(row):
301
- topic_indices = W[row].argsort()[-3:][::-1] # get top 3 topics
302
- return [topics[i] for i in topic_indices]
303
-
304
- df['top_topics'] = [get_top_topics(i) for i in range(len(df))]
305
- df['dominant_topic'] = W.argmax(axis=1)
306
- df['topic_distribution'] = W.tolist()
307
- similarity_matrix = cosine_similarity(W)
308
-
309
- df['similarity_scores'] = similarity_matrix.mean(axis=1)
310
- df['most_similar_article'] = similarity_matrix.argsort(axis=1)[:, -2] # second highest value
311
- df['least_similar_article'] = similarity_matrix.argsort(axis=1)[:, 0] # lowest value
312
-
313
- similarity_heatmap = plt.figure(figsize=(10, 8))
314
- sns.heatmap(similarity_matrix, annot=True, fmt=".2f", cmap="coolwarm", xticklabels=False, yticklabels=False)
315
- plt.title("Comparative Analysis of News Coverage Across Articles")
316
-
317
- comparisons = []
318
- for i in range(len(df)):
319
- # find most similar and least similar articles
320
- similar_idx = similarity_matrix[i].argsort()[-2] # most similar (excluding itself)
321
- least_similar_idx = similarity_matrix[i].argsort()[0] # least similar
322
-
323
- # build comparison text
324
- comparison = {
325
- "Most Similar": f"Article {i + 1} focuses on '{topics[df['dominant_topic'][i]]}', similar to Article {similar_idx + 1} which also discusses '{topics[df['dominant_topic'][similar_idx]]}'.",
326
- "Least Similar": f"Article {i + 1} focuses on '{topics[df['dominant_topic'][i]]}', contrasting with Article {least_similar_idx + 1} which discusses '{topics[df['dominant_topic'][least_similar_idx]]}'."
327
- }
328
- comparisons.append(comparison)
329
-
330
- df['coverage_comparison'] = comparisons
331
- # find common and unique topics
332
- all_topics = df['dominant_topic'].tolist()
333
- topic_counter = Counter(all_topics)
334
- common_topics = [topics[i] for i, count in topic_counter.items() if count > 1]
335
- unique_topics = [topics[i] for i, count in topic_counter.items() if count == 1]
336
-
337
- topic_overlap = {
338
- "Common Topics": common_topics,
339
- "Unique Topics": unique_topics
340
- }
341
- sentiment_counts = df['sentiment_label'].value_counts()
342
- if sentiment_counts.get('Positive', 0) > sentiment_counts.get('Negative', 0):
343
- sentiment = "Overall sentiment is positive."
344
- elif sentiment_counts.get('Negative', 0) > sentiment_counts.get('Positive', 0):
345
- sentiment = "Overall sentiment is negative."
346
- else:
347
- sentiment = "Overall sentiment is mixed."
348
-
349
- def extract_relevant_topics(topics):
350
- if isinstance(topics, str):
351
- topics = ast.literal_eval(topics) # convert string to list if needed
352
-
353
- if len(topics) <= 2:
354
- return topics
355
-
356
- vectorizer = TfidfVectorizer()
357
- tfidf_matrix = vectorizer.fit_transform(topics)
358
- similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
359
-
360
- # sum similarity scores for each topic
361
- topic_scores = similarity_matrix.sum(axis=1)
362
-
363
- # get top 2 highest scoring topics
364
- top_indices = topic_scores.argsort()[-2:][::-1]
365
- top_topics = [topics[i] for i in top_indices]
366
-
367
- return top_topics
368
-
369
-
370
- # ensure 'top_topics' is a list
371
- df['top_topics'] = df['top_topics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
372
-
373
- # convert lists to sets for easy comparison
374
- df['top_topics_set'] = df['top_topics'].apply(lambda x: set(x) if isinstance(x, list) else set())
375
-
376
- # find common topics across all articles
377
- if len(df) > 1:
378
- common_topics = set.intersection(*df['top_topics_set'])
379
- else:
380
- common_topics = set() # no common topics if only one article
381
-
382
- # extract unique topics by removing common ones
383
- df['unique_topics'] = df['top_topics_set'].apply(lambda x: list(x - common_topics) if x else [])
384
-
385
- # drop the temporary 'top_topics_set' column
386
- df.drop(columns=['top_topics_set'], inplace=True)
387
-
388
-
389
- coverage_differences = []
390
- for _, row in df.iterrows():
391
- if row['most_similar_article'] in df.index and row['least_similar_article'] in df.index:
392
- most_similar = df.loc[row['most_similar_article']]
393
- least_similar = df.loc[row['least_similar_article']]
394
-
395
- # extract most relevant topics
396
- most_relevant_topics = extract_relevant_topics(row['top_topics'])
397
- least_relevant_topics = extract_relevant_topics(least_similar['top_topics'])
398
-
399
- if most_relevant_topics and least_relevant_topics:
400
- comparison = {
401
- "Comparison": f"{row['title']} highlights {', '.join(row['top_topics'])}, while {most_similar['title']} discusses {', '.join(most_similar['top_topics'])}.",
402
- "Impact": f"The article emphasizes {most_relevant_topics[0]} and {most_relevant_topics[1]}, contrasting with {least_relevant_topics[0]} and {least_relevant_topics[1]} in the least similar article."
403
- }
404
- coverage_differences.append(comparison)
405
- structured_summary = {
406
- "Company": company_name,
407
- "Articles": [
408
- {
409
- "Title": row['title'],
410
- "Summary": row['summary'],
411
- "Sentiment": row['sentiment'],
412
- "Topics": row['top_topics'],
413
- "Unique Topics": row['unique_topics']
414
- }
415
- for _, row in df.iterrows()
416
- ],
417
- "Comparative Sentiment Score": {
418
- "Sentiment Distribution": df['sentiment'].value_counts().to_dict(),
419
- },
420
- "Topic Overlap": {
421
- "Common Topics": list(common_topics) if common_topics else ["No common topics found"],
422
- "Unique Topics": [
423
- {"Title": row['title'], "Unique Topics": row['unique_topics']}
424
- for _, row in df.iterrows()
425
- ]
426
- },
427
- "Final Sentiment Analysis": f"{company_name}’s latest news coverage is mostly {df['sentiment'].mode()[0].lower()}. Potential market impact expected."
428
- }
429
-
430
- yield {"json_summary": structured_summary}
431
- english_news = [f"Name of Company: {company_name}"]
432
-
433
- for i, row in df.iterrows():
434
- article_entry = f"Article {i + 1}: "
435
- article_entry += f"{row['title']}; "
436
- article_entry += f"Summary: {row['summary']} This article has a {row['sentiment_label'].lower()} sentiment."
437
- english_news.append(article_entry)
438
- yield {"english_news_list": english_news}
439
- translator = GoogleTranslator(source='en', target='hi') # 'hi' = Hindi
440
-
441
- translated_news = []
442
- for text in tqdm(english_news, desc="Translating"):
443
- translated_news.append(translator.translate(text))
444
- yield {"hindi_news_list": translated_news}
445
- hindi_news = '; '.join(translated_news)
446
- # yield {"hindi_news_text": hindi_news}
447
- def text_to_speech(text, language='hi'):
448
- tts = gTTS(text=text, lang=language, slow=False)
449
- filename = "hindi_news.mp3" # output file name
450
- tts.save(filename)
451
- return filename
452
- print(df)
453
- news_audio = text_to_speech(hindi_news)
454
- yield {"hindi_news_audio": news_audio}
455
-
456
- yield {"bar_chart": sentiment_bars_file}
457
-
 
458
  yield {"pie_chart": sentiment_pie_file}
 
1
+ # ==========================
2
+ # Data Handling & Storage
3
+ # ==========================
4
+ import json
5
+ import ast
6
+ import pandas as pd
7
+ import numpy as np
8
+
9
+ # ==========================
10
+ # Web Scraping & Data Retrieval
11
+ # ==========================
12
+ import requests
13
+ import httpx
14
+ import feedparser
15
+ import concurrent.futures
16
+ from bs4 import BeautifulSoup
17
+ from googlesearch import search
18
+ from urllib.parse import urlparse
19
+
20
+ # ==========================
21
+ # Natural Language Processing (NLP)
22
+ # ==========================
23
+ import nltk
24
+ import spacy
25
+ import gensim
26
+ from nltk.corpus import stopwords
27
+ from nltk.tokenize import word_tokenize
28
+ from nltk.stem import WordNetLemmatizer
29
+ from gensim.models import LdaModel
30
+ from gensim.corpora import Dictionary
31
+ from transformers import pipeline
32
+ from deep_translator import GoogleTranslator
33
+ from gtts import gTTS # Text-to-speech
34
+
35
+ # ==========================
36
+ # Machine Learning & Text Analysis
37
+ # ==========================
38
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
39
+ from sklearn.metrics.pairwise import cosine_similarity
40
+ from sklearn.decomposition import NMF, LatentDirichletAllocation
41
+ from sklearn.model_selection import RandomizedSearchCV
42
+
43
+ # ==========================
44
+ # Data Visualization
45
+ # ==========================
46
+ import matplotlib.pyplot as plt
47
+ import seaborn as sns
48
+
49
+ # ==========================
50
+ # Utility & Performance Optimization
51
+ # ==========================
52
+ import re
53
+ import os
54
+ import io
55
+ from collections import Counter
56
+ from tqdm import tqdm # progress bar
57
+
58
+
59
+ def fetch_news_data(company_name: str, article_number: int):
60
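+ """Search, scrape, and analyze recent news coverage for a company.
+ Generator that yields intermediate results as dicts: the headline table, a structured
+ JSON summary, English and Hindi summary lists, Hindi audio, and sentiment charts."""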
+ excluded_domains = ["youtube.com", "en.wikipedia.org", "m.economictimes.com", "www.prnewswire.com", "economictimes.indiatimes.com", "www.moneycontrol.com"]
61
+
62
+ def is_valid_news_article(url, company_name):
63
+ try:
64
+ domain = urlparse(url).netloc # extracts the domain
65
+ if company_name.lower() in domain.lower() or any(excluded_domain in domain for excluded_domain in excluded_domains):
66
+ return False
67
+ return True
68
+ except Exception:
69
+ return False # handle unexpected errors
70
+
71
+ def get_top_articles(company_name, article_number):
72
+ query = f"{company_name} latest news article"
73
+ valid_urls = []
74
+
75
+ for url in search(query, num_results = article_number*2):
76
+ if is_valid_news_article(url, company_name):
77
+ valid_urls.append(url)
78
+ if len(valid_urls) > article_number+1:
79
+ break
80
+
81
+ return valid_urls
82
+
83
+ def extract_article_data(url):
84
+ headers = {
85
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
86
+ }
87
+
88
+ try:
89
+ response = requests.get(url, headers=headers)
90
+ response.raise_for_status() # handle HTTP errors
91
+
92
+ soup = BeautifulSoup(response.content, 'html.parser')
93
+
94
+ # extract title
95
+ title = soup.title.string.strip() if soup.title else None
96
+ source = url.split('/')[2] # Extract domain
97
+
98
+ # validate data
99
+ if not title:
100
+ return None
101
+
102
+ return {"title": title, "link": url, "source": source}
103
+
104
+ except (requests.exceptions.RequestException, AttributeError):
105
+ return None # skip articles with invalid data
106
+
107
+ def main(company_name, article_number):
108
+ urls = get_top_articles(company_name, article_number)
109
+ # extract and validate article data
110
+ articles_data = [extract_article_data(url) for url in urls]
111
+ articles_data = [article for article in articles_data if article] # remove None values
112
+
113
+ # create DataFrame only if valid articles exist
114
+ if articles_data:
115
+ df = pd.DataFrame(articles_data)
116
+ else:
117
+ df = pd.DataFrame(columns=["title", "link"]) # empty DataFrame if nothing was found
118
+
119
+ return df
120
+
121
+ df = main(company_name, article_number+1)
122
+ news_df_output = df[["title", "source"]].rename(columns={"title": "Headline", "source": "Source"})
123
+ news_df_output["Source"] = news_df_output["Source"].str.replace(r"^www\.", "", regex=True).str.split('.').str[0]
124
+
125
+ yield {"news_df_output": news_df_output}
126
+
127
+ def get_article_text(url):
128
+ try:
129
+ headers = {'User-Agent': 'Mozilla/5.0'}
130
+ response = requests.get(url, headers=headers)
131
+ soup = BeautifulSoup(response.text, "html.parser")
132
+
133
+ # remove unwanted elements
134
+ for unwanted in soup.select("nav, aside, footer, header, .ad, .advertisement, .promo, .sidebar, .related-articles"):
135
+ unwanted.extract()
136
+
137
+ # try extracting from known article containers
138
+ article_body = soup.find(['article', 'div', 'section'], class_=['article-body', 'post-body', 'entry-content', 'main-content'])
139
+
140
+ if article_body:
141
+ paragraphs = article_body.find_all('p')
142
+ article_text = " ".join([p.get_text() for p in paragraphs]).strip()
143
+ return article_text if article_text else None # return None if empty
144
+
145
+ # fallback to all <p> tags
146
+ paragraphs = soup.find_all('p')
147
+ article_text = " ".join([p.get_text() for p in paragraphs]).strip()
148
+
149
+ return article_text if article_text else None # return None if empty
150
+
151
+ except Exception:
152
+ return None # return None in case of an error
153
+ df['article_text'] = df['link'].apply(get_article_text)
154
+
155
+ df = df.reset_index(drop=True)
156
+
157
+ block_patterns = [
158
+ # Error messages (with variations)
159
+ r'Oops[!,\.]? something went wrong',
160
+ r'An error has occurred',
161
+ r'This content is not available',
162
+ r'Please enable JavaScript to continue',
163
+ r'Error loading content',
164
+ r'Follow Us',
165
+
166
+ # JavaScript patterns
167
+ r'var .*?;',
168
+ r'alert\(.*?\)',
169
+ r'console\.log\(.*?\)',
170
+ r'<script.*?</script>',
171
+ r'<noscript>.*?</noscript>',
172
+ r'<style.*?</style>',
173
+
174
+ # Loading or restricted content messages
175
+ r'Loading[\.]*',
176
+ r'You must be logged in to view this content',
177
+ r'This content is restricted',
178
+ r'Access denied',
179
+ r'Please disable your ad blocker',
180
+
181
+ # GDPR and cookie consent banners
182
+ r'This site uses cookies',
183
+ r'We use cookies to improve your experience',
184
+ r'By using this site, you agree to our use of cookies',
185
+ r'Accept Cookies',
186
+
187
+ # Stories or content teasers with any number
188
+ r'\d+\s*Stories',
189
+
190
+ # Miscellaneous
191
+ r'<iframe.*?</iframe>',
192
+ r'<meta.*?>',
193
+ r'<link.*?>',
194
+ r'Refresh the page and try again',
195
+ r'Click here if the page does not load',
196
+ r'© [0-9]{4}.*? All rights reserved',
197
+ r'Unauthorized access',
198
+ r'Terms of Service',
199
+ r'Privacy Policy',
200
+ r'<.*?>',
201
+ ]
202
+
203
+ pattern = '|'.join(block_patterns)
204
+ df['article_text'] = df['article_text'].str.replace(pattern, '', regex=True).str.strip()
205
+ df['article_text'] = df['article_text'].str.replace(r'\s+', ' ', regex=True).str.strip()
206
+
207
+ custom_stop_words = set(ENGLISH_STOP_WORDS.union({company_name.lower(), 'company', 'ttm', 'rs'}))
208
+
209
+ # add numeric values (integer, decimal, comma-separated, monetary)
210
+ numeric_patterns = re.compile(r'\b\d+(?:[\.,]\d+)?(?:,\d+)*\b|\$\d+(?:[\.,]\d+)?')
211
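+ # replace missing article_text values with empty strings so the join below never sees None/NaN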
+ clean_text = ' '.join(df['article_text'].fillna('').astype(str))
212
+ numeric_matches = set(re.findall(numeric_patterns, clean_text))
213
+ custom_stop_words.update(numeric_matches)
214
+
215
+ # remove unwanted unicode characters (like \u2018, \u2019, etc.)
216
+ unicode_patterns = re.compile(r'[\u2018\u2019\u2020\u2021\u2014]') # Add more if needed
217
+ df['article_text'] = df['article_text'].apply(lambda x: unicode_patterns.sub('', x))
218
+
219
+ custom_stop_words = list(custom_stop_words)
220
+
221
+ summarizer = pipeline("summarization", model="google/long-t5-tglobal-base")
222
+
223
+ def generate_summary(text):
224
+ try:
225
+ if len(text.split()) > 50: # skip very short texts
226
+ summary = summarizer(text, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
227
+ return summary
228
+ else:
229
+ return text
230
+ except Exception as e:
231
+ print(f"Error processing text: {e}")
232
+ return None
233
+
234
+ # apply summarization to the 'article_text' column
235
+ df['summary'] = df['article_text'].apply(generate_summary)
236
+
237
+ # load a pre-trained BERT-based sentiment model from Hugging Face
238
+ sentiment_pipeline = pipeline("sentiment-analysis")
239
+
240
+ def analyze_sentiment(text):
241
+ """Analyze sentiment with a confidence-based neutral zone."""
242
+ if not text or not text.strip():
243
+ return "Neutral"
244
+
245
+ try:
246
+ result = sentiment_pipeline(text)[0]
247
+ sentiment_label = result["label"]
248
+ confidence = round(result["score"], 2)
249
+
250
+ if confidence < 0.7:
251
+ return "Neutral"
252
+ return f"{sentiment_label.capitalize()} ({confidence})"
253
+ except Exception:
254
+ return "Error in sentiment analysis."
255
+
256
+ # apply sentiment analysis on the summary column
257
+ df['sentiment'] = df['summary'].apply(analyze_sentiment)
258
+
259
+ df['sentiment_label'] = df['sentiment'].str.extract(r'(Positive|Negative|Neutral)')
260
+
261
+ sentiment_bars = plt.figure(figsize=(7, 7))
262
+ sns.countplot(x=df['sentiment_label'], palette={'Positive': 'green', 'Negative': 'red', 'Neutral': 'gray'})
263
+ plt.title("Sentiment Analysis of Articles")
264
+ plt.xlabel("Sentiment")
265
+ plt.ylabel("Count")
266
+
267
+ # save the figure as an image file to use in the Gradio interface
268
+ sentiment_bars_file = "sentiment_bars.png"
269
+ sentiment_bars.savefig(sentiment_bars_file)
270
+ plt.close(sentiment_bars)
271
+
272
+ sentiment_counts = df['sentiment_label'].value_counts()
273
+
274
+ colors = {'Positive': 'green', 'Negative': 'red', 'Neutral': 'gray'}
275
+
276
+ sentiment_pie = plt.figure(figsize=(7, 7))
277
+ plt.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', colors=[colors[label] for label in sentiment_counts.index])
278
+ plt.title("Sentiment Distribution of Articles")
279
+
280
+ sentiment_pie_file = "sentiment_pie.png"
281
+ sentiment_pie.savefig(sentiment_pie_file)
282
+ plt.close(sentiment_pie)
283
+
284
+ df['combined_text'] = df['title'] + ' ' + df['summary'] # combine text for analysis
285
+
286
+ vectorizer = TfidfVectorizer(max_features=1000, stop_words=custom_stop_words)
287
+ tfidf = vectorizer.fit_transform(df['combined_text'])
288
+
289
+ n_topics = 5 # number of topics
290
+ nmf = NMF(n_components=n_topics, random_state=42)
291
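+ # W: article-by-topic weight matrix, H: topic-by-term weight matrix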
+ W = nmf.fit_transform(tfidf)
292
+ H = nmf.components_
293
+
294
+ feature_names = vectorizer.get_feature_names_out()
295
+ topics = []
296
+ for topic_idx, topic in enumerate(H):
297
+ top_words = [feature_names[i] for i in topic.argsort()[-5:]][::-1] # 5 words per topic
298
+ topics.append(", ".join(top_words))
299
+
300
+
301
+ def get_top_topics(row):
302
+ topic_indices = W[row].argsort()[-3:][::-1] # get top 3 topics
303
+ return [topics[i] for i in topic_indices]
304
+
305
+ df['top_topics'] = [get_top_topics(i) for i in range(len(df))]
306
+ df['dominant_topic'] = W.argmax(axis=1)
307
+ df['topic_distribution'] = W.tolist()
308
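+ # pairwise cosine similarity between articles in topic space (n_articles x n_articles)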
+ similarity_matrix = cosine_similarity(W)
309
+
310
+ df['similarity_scores'] = similarity_matrix.mean(axis=1)
311
+ df['most_similar_article'] = similarity_matrix.argsort(axis=1)[:, -2] # second highest value
312
+ df['least_similar_article'] = similarity_matrix.argsort(axis=1)[:, 0] # lowest value
313
+
314
+ similarity_heatmap = plt.figure(figsize=(10, 8))
315
+ sns.heatmap(similarity_matrix, annot=True, fmt=".2f", cmap="coolwarm", xticklabels=False, yticklabels=False)
316
+ plt.title("Comparative Analysis of News Coverage Across Articles")
317
+
318
+ comparisons = []
319
+ for i in range(len(df)):
320
+ # find most similar and least similar articles
321
+ similar_idx = similarity_matrix[i].argsort()[-2] # most similar (excluding itself)
322
+ least_similar_idx = similarity_matrix[i].argsort()[0] # least similar
323
+
324
+ # build comparison text
325
+ comparison = {
326
+ "Most Similar": f"Article {i + 1} focuses on '{topics[df['dominant_topic'][i]]}', similar to Article {similar_idx + 1} which also discusses '{topics[df['dominant_topic'][similar_idx]]}'.",
327
+ "Least Similar": f"Article {i + 1} focuses on '{topics[df['dominant_topic'][i]]}', contrasting with Article {least_similar_idx + 1} which discusses '{topics[df['dominant_topic'][least_similar_idx]]}'."
328
+ }
329
+ comparisons.append(comparison)
330
+
331
+ df['coverage_comparison'] = comparisons
332
+ # find common and unique topics
333
+ all_topics = df['dominant_topic'].tolist()
334
+ topic_counter = Counter(all_topics)
335
+ common_topics = [topics[i] for i, count in topic_counter.items() if count > 1]
336
+ unique_topics = [topics[i] for i, count in topic_counter.items() if count == 1]
337
+
338
+ topic_overlap = {
339
+ "Common Topics": common_topics,
340
+ "Unique Topics": unique_topics
341
+ }
342
+ sentiment_counts = df['sentiment_label'].value_counts()
343
+ if sentiment_counts.get('Positive', 0) > sentiment_counts.get('Negative', 0):
344
+ sentiment = "Overall sentiment is positive."
345
+ elif sentiment_counts.get('Negative', 0) > sentiment_counts.get('Positive', 0):
346
+ sentiment = "Overall sentiment is negative."
347
+ else:
348
+ sentiment = "Overall sentiment is mixed."
349
+
350
+ def extract_relevant_topics(topics):
351
+ if isinstance(topics, str):
352
+ topics = ast.literal_eval(topics) # convert string to list if needed
353
+
354
+ if len(topics) <= 2:
355
+ return topics
356
+
357
+ vectorizer = TfidfVectorizer()
358
+ tfidf_matrix = vectorizer.fit_transform(topics)
359
+ similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
360
+
361
+ # sum similarity scores for each topic
362
+ topic_scores = similarity_matrix.sum(axis=1)
363
+
364
+ # get top 2 highest scoring topics
365
+ top_indices = topic_scores.argsort()[-2:][::-1]
366
+ top_topics = [topics[i] for i in top_indices]
367
+
368
+ return top_topics
369
+
370
+
371
+ # ensure 'top_topics' is a list
372
+ df['top_topics'] = df['top_topics'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
373
+
374
+ # convert lists to sets for easy comparison
375
+ df['top_topics_set'] = df['top_topics'].apply(lambda x: set(x) if isinstance(x, list) else set())
376
+
377
+ # find common topics across all articles
378
+ if len(df) > 1:
379
+ common_topics = set.intersection(*df['top_topics_set'])
380
+ else:
381
+ common_topics = set() # no common topics if only one article
382
+
383
+ # extract unique topics by removing common ones
384
+ df['unique_topics'] = df['top_topics_set'].apply(lambda x: list(x - common_topics) if x else [])
385
+
386
+ # drop the temporary 'top_topics_set' column
387
+ df.drop(columns=['top_topics_set'], inplace=True)
388
+
389
+
390
+ coverage_differences = []
391
+ for _, row in df.iterrows():
392
+ if row['most_similar_article'] in df.index and row['least_similar_article'] in df.index:
393
+ most_similar = df.loc[row['most_similar_article']]
394
+ least_similar = df.loc[row['least_similar_article']]
395
+
396
+ # extract most relevant topics
397
+ most_relevant_topics = extract_relevant_topics(row['top_topics'])
398
+ least_relevant_topics = extract_relevant_topics(least_similar['top_topics'])
399
+
400
+ if most_relevant_topics and least_relevant_topics:
401
+ comparison = {
402
+ "Comparison": f"{row['title']} highlights {', '.join(row['top_topics'])}, while {most_similar['title']} discusses {', '.join(most_similar['top_topics'])}.",
403
+ "Impact": f"The article emphasizes {most_relevant_topics[0]} and {most_relevant_topics[1]}, contrasting with {least_relevant_topics[0]} and {least_relevant_topics[1]} in the least similar article."
404
+ }
405
+ coverage_differences.append(comparison)
406
+ structured_summary = {
407
+ "Company": company_name,
408
+ "Articles": [
409
+ {
410
+ "Title": row['title'],
411
+ "Summary": row['summary'],
412
+ "Sentiment": row['sentiment'],
413
+ "Topics": row['top_topics'],
414
+ "Unique Topics": row['unique_topics']
415
+ }
416
+ for _, row in df.iterrows()
417
+ ],
418
+ "Comparative Sentiment Score": {
419
+ "Sentiment Distribution": df['sentiment'].value_counts().to_dict(),
420
+ },
421
+ "Topic Overlap": {
422
+ "Common Topics": list(common_topics) if common_topics else ["No common topics found"],
423
+ "Unique Topics": [
424
+ {"Title": row['title'], "Unique Topics": row['unique_topics']}
425
+ for _, row in df.iterrows()
426
+ ]
427
+ },
428
+ "Final Sentiment Analysis": f"{company_name}’s latest news coverage is mostly {df['sentiment'].mode()[0].lower()}. Potential market impact expected."
429
+ }
430
+
431
+ yield {"json_summary": structured_summary}
432
+ english_news = [f"Name of Company: {company_name}"]
433
+
434
+ for i, row in df.iterrows():
435
+ article_entry = f"Article {i + 1}: "
436
+ article_entry += f"{row['title']}; "
437
+ article_entry += f"Summary: {row['summary']} This article has a {row['sentiment_label'].lower()} sentiment."
438
+ english_news.append(article_entry)
439
+ yield {"english_news_list": english_news}
440
+ translator = GoogleTranslator(source='en', target='hi') # 'hi' = Hindi
441
+
442
+ translated_news = []
443
+ for text in tqdm(english_news, desc="Translating"):
444
+ translated_news.append(translator.translate(text))
445
+ yield {"hindi_news_list": translated_news}
446
+ hindi_news = '; '.join(translated_news)
447
+ # yield {"hindi_news_text": hindi_news}
448
+ def text_to_speech(text, language='hi'):
449
+ tts = gTTS(text=text, lang=language, slow=False)
450
+ filename = "hindi_news.mp3" # output file name
451
+ tts.save(filename)
452
+ return filename
453
+ print(df)
454
+ news_audio = text_to_speech(hindi_news)
455
+ yield {"hindi_news_audio": news_audio}
456
+
457
+ yield {"bar_chart": sentiment_bars_file}
458
+
459
  yield {"pie_chart": sentiment_pie_file}