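"""Merge MarketAux news sentiment features into an existing market-features dataset.

Pipeline: load news articles, extract per-symbol sentiment features from
tagged equity entities, aggregate them into summary metrics, and left-join
the result onto data/merged/features/merged_features.parquet.
"""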
import json
import os
import shutil

import numpy as np
import pandas as pd


def parse_news_data(file_path):
    """Parse a news data file containing one JSON object per line (JSONL)."""
    news_data = []

    with open(file_path, 'r') as f:
        content = f.read()

    lines = content.strip().split('\n')

    for line in lines:
        if line.strip():
            try:
                news_item = json.loads(line)
                news_data.append(news_item)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {line[:100]}...")
                print(f"Error: {e}")
                continue

    return news_data
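

# Example usage (the path is illustrative; main() below reads parquet instead):
#     articles = parse_news_data('data/marketaux/news/news_latest.jsonl')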


def extract_sentiment_features(news_data):
    """Extract sentiment features from news data for each symbol."""
    sentiment_features = {}

    for article in news_data:
        published_at = article.get('published_at')
        # Title/description are parsed here but not yet used downstream.
        title = article.get('title', '')
        description = article.get('description', '')

        entities = article.get('entities', [])

        for entity in entities:
            if entity.get('type') == 'equity':
                symbol = entity.get('symbol', '').lower()

                if symbol:
                    if symbol not in sentiment_features:
                        sentiment_features[symbol] = {
                            'news_sentiment_scores': [],
                            'news_match_scores': [],
                            'news_mentions_count': 0,
                            'news_articles_count': 0,
                            'latest_news_timestamp': None,
                            'news_highlights_count': 0
                        }

                    sentiment_score = entity.get('sentiment_score')
                    match_score = entity.get('match_score')

                    if sentiment_score is not None:
                        sentiment_features[symbol]['news_sentiment_scores'].append(sentiment_score)

                    if match_score is not None:
                        sentiment_features[symbol]['news_match_scores'].append(match_score)

                    highlights = entity.get('highlights', [])
                    sentiment_features[symbol]['news_highlights_count'] += len(highlights)

                    if published_at:
                        if (sentiment_features[symbol]['latest_news_timestamp'] is None or
                                published_at > sentiment_features[symbol]['latest_news_timestamp']):
                            sentiment_features[symbol]['latest_news_timestamp'] = published_at

                    sentiment_features[symbol]['news_mentions_count'] += 1

        # Count each article once per symbol it mentions, regardless of how
        # many entity records reference that symbol.
        mentioned_symbols = set(entity.get('symbol', '').lower() for entity in entities
                                if entity.get('type') == 'equity' and entity.get('symbol'))

        for symbol in mentioned_symbols:
            if symbol in sentiment_features:
                sentiment_features[symbol]['news_articles_count'] += 1

    return sentiment_features


def aggregate_sentiment_features(sentiment_data):
    """Aggregate sentiment features into final metrics."""
    aggregated = {}

    for symbol, data in sentiment_data.items():
        sentiment_scores = data['news_sentiment_scores']
        match_scores = data['news_match_scores']

        features = {
            'news_sentiment_mean': np.mean(sentiment_scores) if sentiment_scores else None,
            'news_sentiment_std': np.std(sentiment_scores) if len(sentiment_scores) > 1 else None,
            'news_sentiment_min': np.min(sentiment_scores) if sentiment_scores else None,
            'news_sentiment_max': np.max(sentiment_scores) if sentiment_scores else None,
            'news_match_score_mean': np.mean(match_scores) if match_scores else None,
            'news_match_score_max': np.max(match_scores) if match_scores else None,
            'news_mentions_count': data['news_mentions_count'],
            'news_articles_count': data['news_articles_count'],
            'news_highlights_count': data['news_highlights_count'],
            'latest_news_timestamp': data['latest_news_timestamp'],
            'news_sentiment_range': (np.max(sentiment_scores) - np.min(sentiment_scores)) if sentiment_scores else None,
            'news_activity_score': (data['news_mentions_count'] * np.mean(match_scores)) if match_scores else 0
        }

        aggregated[symbol] = features

    return aggregated


def merge_with_existing_features(news_features, existing_features_file):
    """Merge news features with existing market data features."""
    if existing_features_file.endswith('.parquet'):
        df_existing = pd.read_parquet(existing_features_file)
    else:
        df_existing = pd.read_csv(existing_features_file)

    print(f"Loaded existing features: {df_existing.shape}")
    print(f"News features available for {len(news_features)} symbols")

    news_columns = [
        'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min',
        'news_sentiment_max', 'news_match_score_mean', 'news_match_score_max',
        'news_mentions_count', 'news_articles_count', 'news_highlights_count',
        'latest_news_timestamp', 'news_sentiment_range', 'news_activity_score'
    ]

    # Initialise the timestamp column as object dtype so string timestamps
    # can be assigned without upcasting warnings; the rest stay numeric.
    for col in news_columns:
        df_existing[col] = None if col == 'latest_news_timestamp' else np.nan
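
    # Vectorized alternative (a sketch, not used here): build a DataFrame
    # from news_features and left-join on the 'symbol' column instead of
    # iterating row by row:
    #     news_df = pd.DataFrame.from_dict(news_features, orient='index')
    #     df_existing = df_existing.merge(news_df, left_on='symbol',
    #                                     right_index=True, how='left')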

    # Fill news feature columns row by row, matched on the 'symbol' column.
    symbols_matched = 0
    for idx, row in df_existing.iterrows():
        symbol = row['symbol']
        if symbol in news_features:
            for col in news_columns:
                df_existing.loc[idx, col] = news_features[symbol].get(col)
            symbols_matched += 1

    print(f"Matched news features for {symbols_matched} rows out of {len(df_existing)} total records")

    return df_existing


def main():
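    # Expected layout, per the paths below:
    #   data/marketaux/news/news_latest.parquet      <- raw news articles
    #   data/merged/features/merged_features.parquet <- features, updated in place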
    news_file = os.path.join('data', 'marketaux', 'news', 'news_latest.parquet')
    existing_features_file = os.path.join('data', 'merged', 'features', 'merged_features.parquet')
    output_file = os.path.join('data', 'merged', 'features', 'merged_features.parquet')

    if not os.path.exists(news_file):
        print(f"WARNING: News file not found: {news_file}")
        print("This usually happens when MarketAux API keys are exhausted.")
        print("Skipping news sentiment merge and keeping existing features unchanged.")

        if os.path.exists(existing_features_file):
            # Only copy when the paths differ: copying a file onto itself
            # raises shutil.SameFileError, and input and output paths are
            # currently identical.
            if os.path.abspath(existing_features_file) != os.path.abspath(output_file):
                shutil.copy2(existing_features_file, output_file)
                print(f"Copied existing features to output: {output_file}")
            else:
                print(f"Existing features already at output path: {output_file}")
        else:
            print(f"WARNING: No existing features file found at {existing_features_file}")
        return

    print("Step 1: Loading news data from parquet...")
    try:
        news_df = pd.read_parquet(news_file)
        news_data = news_df.to_dict(orient='records')
        print(f"Loaded {len(news_data)} news articles from {news_file}")
    except Exception as e:
        print(f"ERROR: Failed to load news data: {e}")
        print("Skipping news sentiment merge.")

        if os.path.exists(existing_features_file):
            # Same guard as above against shutil.SameFileError.
            if os.path.abspath(existing_features_file) != os.path.abspath(output_file):
                shutil.copy2(existing_features_file, output_file)
                print(f"Copied existing features to output: {output_file}")
        return

    print("Step 2: Extracting sentiment features...")
    sentiment_data = extract_sentiment_features(news_data)
    print(f"Extracted sentiment data for {len(sentiment_data)} symbols")

    print("Step 3: Aggregating sentiment metrics...")
    news_features = aggregate_sentiment_features(sentiment_data)

    # Preview the features for the first few symbols.
    print("\nSample of extracted news features:")
    for symbol, features in list(news_features.items())[:3]:
        print(f"\n{symbol.upper()}:")
        for key, value in features.items():
            if value is not None:
                if isinstance(value, float):
                    print(f"  {key}: {value:.4f}")
                else:
                    print(f"  {key}: {value}")

    print("\nStep 4: Merging with existing features...")
    try:
        merged_df = merge_with_existing_features(news_features, existing_features_file)

        # Drop a known-unneeded column if present.
        if 'links.pulsex' in merged_df.columns:
            merged_df = merged_df.drop(columns=['links.pulsex'])

        print("Step 5: Saving merged features...")
        merged_df.to_parquet(output_file, index=False)
        print(f"Saved merged features to {output_file}")
        print(f"Final dataset shape: {merged_df.shape}")

        # Report how many rows actually received each news feature.
        news_cols = [col for col in merged_df.columns if col.startswith('news_')]
        print("\nNews feature coverage:")
        for col in news_cols:
            non_null_count = merged_df[col].notna().sum()
            coverage = non_null_count / len(merged_df) * 100
            print(f"  {col}: {non_null_count}/{len(merged_df)} ({coverage:.1f}%)")

    except Exception as e:
        print(f"Error during merging: {e}")
        print("Make sure your merged_features.parquet file exists and is accessible")


if __name__ == "__main__":
    main()