"""Scrape Finviz news headlines for tickers and score them with NLTK VADER."""

import datetime
import os
import sys
from pathlib import Path
from urllib.error import HTTPError
from urllib.request import Request, urlopen

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from dateutil import parser

# Ensure the VADER lexicon is available in a writable location.
try:
    from src import config as app_config
    _nltk_dir = os.path.join(app_config.DATA_DIR, 'nltk_data')
except Exception:
    # App config not importable (e.g. standalone run): fall back to the
    # DATA_DIR environment variable, defaulting to /data.
    _nltk_dir = os.path.join(os.environ.get('DATA_DIR', '/data'), 'nltk_data')
os.makedirs(_nltk_dir, exist_ok=True)
if _nltk_dir not in nltk.data.path:
    nltk.data.path.insert(0, _nltk_dir)
try:
    nltk.data.find('vader_lexicon')
except LookupError:
    nltk.download('vader_lexicon', download_dir=_nltk_dir)

from nltk.sentiment.vader import SentimentIntensityAnalyzer


class StockSentimentAnalyzer:
    """Fetch Finviz headlines for a ticker and attach VADER sentiment scores."""

    def __init__(self):
        self.stock_url = 'https://finviz.com/quote.ashx?t='
        self.crypto_url = 'https://finviz.com/crypto_charts.ashx?t='
        # Browser-like User-Agent: Finviz rejects the default urllib agent.
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/115.0.0.0 Safari/537.36'
            )
        }
        self.vader = SentimentIntensityAnalyzer()

    def get_news(self, ticker):
        """Fetch the Finviz news table for a ticker, falling back to crypto endpoint.

        Returns the ``<table id="news-table">`` element, or None if the page
        has no such table. Re-raises HTTPError for non-404 failures.
        """
        ticker = ticker.upper()
        # Try the stock quotes endpoint first.
        try:
            req = Request(self.stock_url + ticker, headers=self.headers)
            # 'with' closes the connection even if read/parse fails (the
            # original leaked the response object).
            with urlopen(req) as resp:
                html = resp.read()
        except HTTPError as e:
            # On 404 (no stock page), retry the crypto endpoint with a USD
            # suffix, which is how Finviz names crypto pairs.
            if e.code == 404:
                if not ticker.endswith('USD'):
                    ticker += 'USD'
                req = Request(self.crypto_url + ticker, headers=self.headers)
                with urlopen(req) as resp:
                    html = resp.read()
            else:
                raise
        soup = BeautifulSoup(html, 'lxml')
        return soup.find(id='news-table')

    def parse_news(self, news_table):
        """Parse news-table rows into a DataFrame of date, time, headline.

        Rows missing a link or a date cell are skipped. Adds a 'datetime'
        column and drops rows whose timestamp cannot be parsed.
        """
        today = datetime.datetime.today().strftime('%b-%d-%y')
        rows = []
        for tr in news_table.find_all('tr'):
            try:
                text = tr.a.get_text()
                parts = tr.td.text.split()
            except AttributeError:
                # Row without a headline anchor or date cell — skip it.
                continue
            if len(parts) == 1:
                # Time only: Finviz omits the date for same-day articles.
                date_str, time_str = today, parts[0]
            elif len(parts) == 2:
                date_str, time_str = parts
            else:
                # Unexpected cell format — skip (original bare-except behavior).
                continue
            if date_str.lower() == 'today':
                date_str = today
            rows.append([date_str, time_str, text])
        df = pd.DataFrame(rows, columns=['date', 'time', 'headline'])
        if not df.empty:
            df['datetime'] = df.apply(
                lambda r: self._parse_datetime(r['date'], r['time']), axis=1
            )
            df = df.dropna(subset=['datetime'])
        return df

    def _parse_datetime(self, date_str, time_str):
        """Combine date and time strings into a datetime, or None if unparseable."""
        try:
            return parser.parse(f"{date_str} {time_str}")
        except (ValueError, OverflowError):
            # dateutil raises ParserError (a ValueError subclass) on bad input.
            return None

    def score_news(self, df):
        """Attach VADER sentiment to each headline and index by datetime.

        Returns a DataFrame indexed by 'datetime' with the VADER component
        columns (neg/neu/pos) and 'compound' renamed to 'sentiment_score'.
        """
        if df.empty:
            return df
        scores = df['headline'].apply(self.vader.polarity_scores).tolist()
        # Align on df's index explicitly: after dropna() in parse_news the
        # labels are no longer 0..n-1, so a default RangeIndex here would
        # misalign the join and yield NaN scores for every shifted row.
        scores_df = pd.DataFrame(scores, index=df.index)
        out = df.join(scores_df).set_index('datetime')
        return out.drop(['date', 'time'], axis=1).rename(
            columns={'compound': 'sentiment_score'}
        )

    def get_sentiment_data(self, ticker):
        """Fetch, parse, and score news for a ticker.

        Returns (DataFrame, "Success") on success, or (None, reason) on any
        failure — this is the top-level boundary, so all exceptions are
        converted into a status message rather than propagated.
        """
        try:
            table = self.get_news(ticker)
            if table is None:
                return None, f"No news table for '{ticker}'"
            parsed = self.parse_news(table)
            if parsed.empty:
                return None, f"No articles for '{ticker}'"
            scored = self.score_news(parsed)
            if scored.empty:
                return None, f"Sentiment scoring failed for '{ticker}'"
            return scored, "Success"
        except Exception as e:
            return None, f"Error occurred: {e}"


def main():
    """Score each CLI ticker (or a default list) and save one parquet per ticker."""
    tickers = sys.argv[1:] or [
        "AAPL", "TSLA", "GOOGL", "NVDA", "MSFT",
        "BTC", "SOL", "XRP", "ETH", "ADA", "COIN",
    ]
    analyzer = StockSentimentAnalyzer()
    # Project root is four levels up from this file (original comment said
    # three, but the code takes four .parent steps).
    project_root = Path(__file__).parent.parent.parent.parent
    out_dir = project_root / "data" / "finviz" / "sentiment"
    out_dir.mkdir(parents=True, exist_ok=True)
    for t in tickers:
        df, status = analyzer.get_sentiment_data(t)
        if df is not None:
            path = out_dir / f"{t.upper()}_sentiment.parquet"
            # Ensure 'datetime' is a column (not the index) before saving.
            df_reset = df.reset_index() if df.index.name == 'datetime' else df
            df_reset.to_parquet(path)
            print(f"Saved sentiment data for {t} to {path}")
        else:
            print(f"Error for {t}: {status}")


if __name__ == "__main__":
    main()