from urllib.request import urlopen, Request
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import pandas as pd
import datetime
from dateutil import parser
from pathlib import Path
import sys
import os
import nltk
# Make sure NLTK can locate (or download) the VADER lexicon in a writable spot.
try:
    from src import config as app_config
    _base_dir = app_config.DATA_DIR
except Exception:
    # App config unavailable (e.g. standalone run): fall back to env var/default.
    _base_dir = os.environ.get('DATA_DIR', '/data')
_nltk_dir = os.path.join(_base_dir, 'nltk_data')
os.makedirs(_nltk_dir, exist_ok=True)
if _nltk_dir not in nltk.data.path:
    # Prepend so our writable directory wins over any system-wide nltk_data.
    nltk.data.path.insert(0, _nltk_dir)
try:
    nltk.data.find('vader_lexicon')
except LookupError:
    # Lexicon missing: fetch it once into the writable directory.
    nltk.download('vader_lexicon', download_dir=_nltk_dir)
from nltk.sentiment.vader import SentimentIntensityAnalyzer
class StockSentimentAnalyzer:
    """Scrape Finviz headline tables for a ticker and score them with VADER.

    Tries the stock quote page first and falls back to the crypto chart
    page (appending a 'USD' suffix) when the stock page returns 404.
    """

    def __init__(self):
        self.stock_url = 'https://finviz.com/quote.ashx?t='
        self.crypto_url = 'https://finviz.com/crypto_charts.ashx?t='
        # Browser-like UA: Finviz rejects urllib's default user agent.
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/115.0.0.0 Safari/537.36'
            )
        }
        self.vader = SentimentIntensityAnalyzer()

    def _fetch(self, url):
        """GET *url* with browser headers and return the response body bytes."""
        req = Request(url, headers=self.headers)
        # Context manager closes the connection even if read() raises
        # (the original leaked the socket by never closing the response).
        with urlopen(req) as resp:
            return resp.read()

    def get_news(self, ticker):
        """Fetch the Finviz news table for a ticker, falling back to crypto endpoint.

        Returns the bs4 element with id='news-table', or None when the page
        has no such table. Non-404 HTTPErrors propagate to the caller.
        """
        ticker = ticker.upper()
        # Try stock quotes endpoint first
        try:
            html = self._fetch(self.stock_url + ticker)
        except HTTPError as e:
            # On 404 (no stock page), retry crypto endpoint with USD suffix
            if e.code != 404:
                raise
            if not ticker.endswith('USD'):
                ticker += 'USD'
            html = self._fetch(self.crypto_url + ticker)
        soup = BeautifulSoup(html, 'lxml')
        return soup.find(id='news-table')

    def parse_news(self, news_table):
        """Parse news-table rows into a DataFrame of date, time, headline.

        Finviz omits the date on all but a day's first article, so a lone
        time token means "same day as the previous row's date" (approximated
        here as today); a literal 'Today' is normalised to today's date.
        Adds a parsed 'datetime' column and drops rows it cannot parse.
        """
        today = datetime.datetime.today().strftime('%b-%d-%y')
        rows = []
        for tr in news_table.find_all('tr'):
            try:
                text = tr.a.get_text()
                parts = tr.td.text.split()
            except AttributeError:
                # Row without a headline link or a date cell (e.g. separator);
                # narrowed from the original bare `except:` which also hid
                # KeyboardInterrupt/SystemExit.
                continue
            if len(parts) == 1:
                date_str, time_str = today, parts[0]
            elif len(parts) == 2:
                date_str, time_str = parts
            else:
                continue  # unexpected cell layout — skip, as the original did
            if date_str.lower() == 'today':
                date_str = today
            rows.append([date_str, time_str, text])
        df = pd.DataFrame(rows, columns=['date', 'time', 'headline'])
        if not df.empty:
            df['datetime'] = df.apply(
                lambda r: self._parse_datetime(r['date'], r['time']), axis=1
            )
            df = df.dropna(subset=['datetime'])
        return df

    def _parse_datetime(self, date_str, time_str):
        """Combine date and time strings into a datetime; None if unparseable."""
        try:
            return parser.parse(f"{date_str} {time_str}")
        except (ValueError, OverflowError):
            # dateutil signals bad input with ValueError (ParserError) or
            # OverflowError; anything else should surface, not be swallowed.
            return None

    def score_news(self, df):
        """Attach VADER sentiment_score to each headline.

        Returns a DataFrame indexed by 'datetime' with the headline, the raw
        neg/neu/pos components, and 'compound' renamed to 'sentiment_score'.
        """
        if df.empty:
            return df
        scores = df['headline'].apply(self.vader.polarity_scores).tolist()
        # Reuse df's index: after dropna it can be non-contiguous, and a
        # default RangeIndex here would misalign the join (NaN scores).
        scores_df = pd.DataFrame(scores, index=df.index)
        out = df.join(scores_df).set_index('datetime')
        return out.drop(['date', 'time'], axis=1).rename(
            columns={'compound': 'sentiment_score'}
        )

    def get_sentiment_data(self, ticker):
        """Fetch, parse and score news for *ticker*.

        Returns (DataFrame, "Success") on success, otherwise (None, reason).
        Never raises: unexpected errors are folded into the status string.
        """
        try:
            table = self.get_news(ticker)
            if table is None:
                return None, f"No news table for '{ticker}'"
            parsed = self.parse_news(table)
            if parsed.empty:
                return None, f"No articles for '{ticker}'"
            scored = self.score_news(parsed)
            if scored.empty:
                return None, f"Sentiment scoring failed for '{ticker}'"
            return scored, "Success"
        except Exception as e:
            return None, f"Error occurred: {e}"
def main():
    """Score Finviz headlines for each requested ticker and save parquet files.

    Tickers come from the command line; with no arguments a default basket
    of stocks and crypto symbols is processed.
    """
    tickers = sys.argv[1:]
    if not tickers:
        tickers = ["AAPL", "TSLA", "GOOGL", "NVDA", "MSFT",
                   "BTC", "SOL", "XRP", "ETH", "ADA", "COIN"]
    analyzer = StockSentimentAnalyzer()
    # Project root sits three directories above this file's directory.
    project_root = Path(__file__).parent.parent.parent.parent
    out_dir = project_root / "data" / "finviz" / "sentiment"
    out_dir.mkdir(parents=True, exist_ok=True)
    for ticker in tickers:
        frame, status = analyzer.get_sentiment_data(ticker)
        if frame is None:
            print(f"Error for {ticker}: {status}")
            continue
        path = out_dir / f"{ticker.upper()}_sentiment.parquet"
        # Parquet round-trips better with 'datetime' as a column, not the index.
        if frame.index.name == 'datetime':
            frame = frame.reset_index()
        frame.to_parquet(path)
        print(f"Saved sentiment data for {ticker} to {path}")


if __name__ == "__main__":
    main()