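"""Merge MarketAux news sentiment features into the merged market-data features.

Pipeline: load news articles from a parquet export, extract per-symbol
sentiment features, aggregate them into summary metrics, and join them onto
data/merged/features/merged_features.parquet (updated in place).
"""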
import json
import os
import shutil

import numpy as np
import pandas as pd
def parse_news_data(file_path):
    """Parse a news data file containing multiple JSON objects, one per line."""
    news_data = []
    with open(file_path, 'r') as f:
        content = f.read()
        # Split by newlines and parse each JSON object
        lines = content.strip().split('\n')
        for line in lines:
            if line.strip():
                try:
                    news_item = json.loads(line)
                    news_data.append(news_item)
                except json.JSONDecodeError as e:
                    print(f"Error parsing line: {line[:100]}...")
                    print(f"Error: {e}")
                    continue
    return news_data
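# Illustrative input line for parse_news_data() (MarketAux-style NDJSON; the
# field values are made up). Note that main() below reads the parquet export
# instead, so this helper is only needed for raw JSON dumps:
#   {"published_at": "2024-01-05T14:30:00Z", "title": "...",
#    "entities": [{"type": "equity", "symbol": "AAPL",
#                  "sentiment_score": 0.62, "match_score": 18.9,
#                  "highlights": []}]}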
def extract_sentiment_features(news_data):
    """Extract sentiment features from news data for each symbol"""
    sentiment_features = {}
    for article in news_data:
        # Get article-level info (title/description are collected but not
        # used downstream yet)
        published_at = article.get('published_at')
        title = article.get('title', '')
        description = article.get('description', '')
        # Process entities (stocks mentioned in the article); records loaded
        # from parquet may carry None/NaN instead of an empty list
        entities = article.get('entities', [])
        if entities is None or (isinstance(entities, float) and np.isnan(entities)):
            entities = []
        for entity in entities:
            if entity.get('type') == 'equity':
                symbol = (entity.get('symbol') or '').lower()  # Normalize to lowercase
                if symbol:
                    if symbol not in sentiment_features:
                        sentiment_features[symbol] = {
                            'news_sentiment_scores': [],
                            'news_match_scores': [],
                            'news_mentions_count': 0,
                            'news_articles_count': 0,
                            'latest_news_timestamp': None,
                            'news_highlights_count': 0,
                        }
                    # Add sentiment and match scores
                    sentiment_score = entity.get('sentiment_score')
                    match_score = entity.get('match_score')
                    if sentiment_score is not None:
                        sentiment_features[symbol]['news_sentiment_scores'].append(sentiment_score)
                    if match_score is not None:
                        sentiment_features[symbol]['news_match_scores'].append(match_score)
                    # Count highlights
                    highlights = entity.get('highlights', [])
                    if highlights is None:
                        highlights = []
                    sentiment_features[symbol]['news_highlights_count'] += len(highlights)
                    # Update latest timestamp (string comparison assumes
                    # consistent ISO-8601 timestamps)
                    if published_at:
                        if (sentiment_features[symbol]['latest_news_timestamp'] is None or
                                published_at > sentiment_features[symbol]['latest_news_timestamp']):
                            sentiment_features[symbol]['latest_news_timestamp'] = published_at
                    sentiment_features[symbol]['news_mentions_count'] += 1
        # Count unique articles per symbol (an article counts once even if it
        # mentions the same equity in several entities)
        mentioned_symbols = set(
            (entity.get('symbol') or '').lower() for entity in entities
            if entity.get('type') == 'equity' and entity.get('symbol')
        )
        for symbol in mentioned_symbols:
            if symbol in sentiment_features:
                sentiment_features[symbol]['news_articles_count'] += 1
    return sentiment_features
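# Minimal sketch of the accumulator this produces for a single article that
# mentions one equity (all values illustrative):
#   >>> extract_sentiment_features([{
#   ...     'published_at': '2024-01-05T14:30:00Z',
#   ...     'entities': [{'type': 'equity', 'symbol': 'AAPL',
#   ...                   'sentiment_score': 0.62, 'match_score': 18.9,
#   ...                   'highlights': []}]}])
#   {'aapl': {'news_sentiment_scores': [0.62], 'news_match_scores': [18.9],
#             'news_mentions_count': 1, 'news_articles_count': 1,
#             'latest_news_timestamp': '2024-01-05T14:30:00Z',
#             'news_highlights_count': 0}}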
def aggregate_sentiment_features(sentiment_data):
    """Aggregate sentiment features into final metrics"""
    aggregated = {}
    for symbol, data in sentiment_data.items():
        # Calculate aggregated metrics
        sentiment_scores = data['news_sentiment_scores']
        match_scores = data['news_match_scores']
        features = {
            'news_sentiment_mean': np.mean(sentiment_scores) if sentiment_scores else None,
            'news_sentiment_std': np.std(sentiment_scores) if len(sentiment_scores) > 1 else None,
            'news_sentiment_min': np.min(sentiment_scores) if sentiment_scores else None,
            'news_sentiment_max': np.max(sentiment_scores) if sentiment_scores else None,
            'news_match_score_mean': np.mean(match_scores) if match_scores else None,
            'news_match_score_max': np.max(match_scores) if match_scores else None,
            'news_mentions_count': data['news_mentions_count'],
            'news_articles_count': data['news_articles_count'],
            'news_highlights_count': data['news_highlights_count'],
            'latest_news_timestamp': data['latest_news_timestamp'],
            'news_sentiment_range': (np.max(sentiment_scores) - np.min(sentiment_scores)) if sentiment_scores else None,
            'news_activity_score': (data['news_mentions_count'] * np.mean(match_scores)) if match_scores else 0,
        }
        aggregated[symbol] = features
    return aggregated
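# Worked example of the aggregation (illustrative numbers): with
# sentiment_scores = [0.2, 0.6] and match_scores = [10.0, 20.0] over two
# mentions, news_sentiment_mean = 0.4, news_sentiment_range = 0.4,
# news_sentiment_std = 0.2 (np.std defaults to population std, ddof=0), and
# news_activity_score = 2 * 15.0 = 30.0.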
def merge_with_existing_features(news_features, existing_features_file):
    """Merge news features with existing market data features"""
    # Load existing features
    if existing_features_file.endswith('.parquet'):
        df_existing = pd.read_parquet(existing_features_file)
    else:
        df_existing = pd.read_csv(existing_features_file)
    print(f"Loaded existing features: {df_existing.shape}")
    print(f"News features available for {len(news_features)} symbols")
    # Add news features as new columns
    news_columns = [
        'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min',
        'news_sentiment_max', 'news_match_score_mean', 'news_match_score_max',
        'news_mentions_count', 'news_articles_count', 'news_highlights_count',
        'latest_news_timestamp', 'news_sentiment_range', 'news_activity_score',
    ]
    # Initialize all news columns with NaN
    for col in news_columns:
        df_existing[col] = np.nan
    # Fill in news features where available; news_features keys were
    # lowercased during extraction, so normalize the lookup symbol too
    symbols_matched = 0
    for idx, row in df_existing.iterrows():
        symbol = str(row['symbol']).lower()
        if symbol in news_features:
            for col in news_columns:
                # The keys in news_features already have the correct names
                df_existing.loc[idx, col] = news_features[symbol].get(col, None)
            symbols_matched += 1
    print(f"Matched news features for {symbols_matched} symbols out of {len(df_existing)} total records")
    return df_existing
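# A vectorized alternative sketch for the fill-in step above (untested here;
# it would replace both the NaN initialization and the iterrows() loop, since
# pre-created columns would otherwise collide on merge):
#   news_df = pd.DataFrame.from_dict(news_features, orient='index')
#   df_existing['_sym'] = df_existing['symbol'].str.lower()
#   df_existing = (df_existing.merge(news_df, left_on='_sym',
#                                    right_index=True, how='left')
#                             .drop(columns='_sym'))
# Row-by-row .loc writes inside iterrows() get slow on large feature tables.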
def main():
    # Configuration
    # Use the MarketAux parquet file for news data; output overwrites the
    # existing merged features file in place
    news_file = os.path.join('data', 'marketaux', 'news', 'news_latest.parquet')
    existing_features_file = os.path.join('data', 'merged', 'features', 'merged_features.parquet')
    output_file = os.path.join('data', 'merged', 'features', 'merged_features.parquet')

    # Check if news file exists
    if not os.path.exists(news_file):
        print(f"WARNING: News file not found: {news_file}")
        print("This usually happens when MarketAux API keys are exhausted.")
        print("Skipping news sentiment merge and keeping existing features unchanged.")
        # Just copy existing features if they exist
        if os.path.exists(existing_features_file):
            shutil.copy2(existing_features_file, output_file)
            print(f"Copied existing features to output: {output_file}")
        else:
            print(f"WARNING: No existing features file found at {existing_features_file}")
        return

    print("Step 1: Loading news data from parquet...")
    try:
        news_df = pd.read_parquet(news_file)
        news_data = news_df.to_dict(orient='records')
        print(f"Loaded {len(news_data)} news articles from {news_file}")
    except Exception as e:
        print(f"ERROR: Failed to load news data: {e}")
        print("Skipping news sentiment merge.")
        # Copy existing features as fallback
        if os.path.exists(existing_features_file):
            shutil.copy2(existing_features_file, output_file)
            print(f"Copied existing features to output: {output_file}")
        else:
            print(f"WARNING: No existing features file found at {existing_features_file}")
        return

    print("Step 2: Extracting sentiment features...")
    sentiment_data = extract_sentiment_features(news_data)
    print(f"Extracted sentiment data for {len(sentiment_data)} symbols")

    print("Step 3: Aggregating sentiment metrics...")
    news_features = aggregate_sentiment_features(sentiment_data)

    # Display a sample of the extracted features
    print("\nSample of extracted news features:")
    for symbol, features in list(news_features.items())[:3]:
        print(f"\n{symbol.upper()}:")
        for key, value in features.items():
            if value is not None:
                if isinstance(value, float):
                    print(f"  {key}: {value:.4f}")
                else:
                    print(f"  {key}: {value}")

    print("\nStep 4: Merging with existing features...")
    try:
        merged_df = merge_with_existing_features(news_features, existing_features_file)
        # Remove 'links.pulsex' column if present
        if 'links.pulsex' in merged_df.columns:
            merged_df = merged_df.drop(columns=['links.pulsex'])
        print("Step 5: Saving merged features...")
        merged_df.to_parquet(output_file, index=False)
        print(f"Saved merged features to {output_file}")
        print(f"Final dataset shape: {merged_df.shape}")
        # Show summary of news feature coverage
        news_cols = [col for col in merged_df.columns if col.startswith('news_')]
        print("\nNews feature coverage:")
        for col in news_cols:
            non_null_count = merged_df[col].notna().sum()
            coverage = non_null_count / len(merged_df) * 100
            print(f"  {col}: {non_null_count}/{len(merged_df)} ({coverage:.1f}%)")
    except Exception as e:
        print(f"Error during merging: {e}")
        print("Make sure your merged_features.parquet file exists and is accessible")


if __name__ == "__main__":
    main()
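# Typical invocation from the repository root (the script filename here is
# illustrative, not taken from the repo):
#   python merge_news_sentiment.py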