import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import json
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def normalize_stock_data(df_stocks):
    """
    Normalize stock data to ensure consistent format for merging.
    """
    logger.info("=== NORMALIZING STOCK DATA ===")
    df_stocks = df_stocks.copy()

    # Standardize ticker symbols: upper-case, whitespace stripped.
    df_stocks['symbol'] = df_stocks['symbol'].astype(str).str.upper().str.strip()

    if 'interval_timestamp' in df_stocks.columns:
        # Coerce timestamps to numeric and drop rows that fail to parse,
        # since NaN values cannot be cast to int64.
        df_stocks['interval_timestamp'] = pd.to_numeric(df_stocks['interval_timestamp'], errors='coerce')
        df_stocks = df_stocks.dropna(subset=['interval_timestamp'])
        df_stocks['interval_timestamp'] = df_stocks['interval_timestamp'].astype('int64')
        logger.info(f"Stock timestamp range: {df_stocks['interval_timestamp'].min()} to {df_stocks['interval_timestamp'].max()}")
        logger.info(f"Stock timestamp sample: {df_stocks['interval_timestamp'].head().tolist()}")

    logger.info(f"Stock symbols sample: {df_stocks['symbol'].unique()[:10].tolist()}")
    logger.info(f"Stock data shape: {df_stocks.shape}")

    return df_stocks


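# Illustrative usage sketch for normalize_stock_data (not executed here; the input
# frame and values below are hypothetical, assuming 'interval_timestamp' holds epoch
# milliseconds and 'symbol' holds raw ticker strings):
#
#     df = pd.DataFrame({'symbol': [' aapl', 'msft '],
#                        'interval_timestamp': ['1700000000000', '1700001800000']})
#     normalize_stock_data(df)
#     # -> symbols 'AAPL' / 'MSFT', interval_timestamp cast to int64

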
def normalize_news_data(df_news):
    """
    Normalize news data to ensure consistent format for merging.
    """
    logger.info("=== NORMALIZING NEWS DATA ===")
    df_news = df_news.copy()

    # Explode each article into one record per mentioned equity entity.
    news_records = []

    for idx, row in df_news.iterrows():
        entities = row.get('entities', [])

        # Skip articles without a usable entity list.
        if not isinstance(entities, (list, np.ndarray)) or len(entities) == 0:
            continue

        try:
            # pd.to_datetime handles both string and datetime-like values.
            published_dt = pd.to_datetime(row['published_at'])
        except Exception:
            logger.warning(f"Could not parse published_at for row {idx}")
            continue

        for entity in entities:
            if not isinstance(entity, dict):
                continue

            if entity.get('type') == 'equity' and 'symbol' in entity:
                symbol = str(entity['symbol']).upper().strip()

                # Snap the publication time to its 30-minute interval and
                # express it in epoch milliseconds to match the stock data.
                interval_dt = published_dt.floor('30min')
                interval_timestamp = int(interval_dt.timestamp() * 1000)

                news_records.append({
                    'symbol': symbol,
                    'interval_timestamp': interval_timestamp,
                    'published_at': published_dt,
                    'sentiment_score': entity.get('sentiment_score', 0),
                    'match_score': entity.get('match_score', 0),
                    'highlights_count': len(entity.get('highlights', [])),
                    'news_uuid': row.get('uuid', ''),
                    'news_title': row.get('title', ''),
                    'news_source': row.get('source', ''),
                    'relevance_score': row.get('relevance_score', 0)
                })

    if not news_records:
        logger.warning("No valid news records found")
        return pd.DataFrame()

    df_news_normalized = pd.DataFrame(news_records)
    logger.info(f"Normalized news data shape: {df_news_normalized.shape}")

    null_columns = [col for col in df_news_normalized.columns if df_news_normalized[col].isnull().all()]
    not_null_columns = [col for col in df_news_normalized.columns if not df_news_normalized[col].isnull().all()]
    logger.info(f"Completely null columns: {null_columns}")
    logger.info(f"Non-null columns: {not_null_columns}")
    logger.info(f"News symbols sample: {df_news_normalized['symbol'].unique()[:10].tolist()}")
    logger.info(f"News timestamp range: {df_news_normalized['interval_timestamp'].min()} to {df_news_normalized['interval_timestamp'].max()}")
    logger.info(f"News timestamp sample: {df_news_normalized['interval_timestamp'].head().tolist()}")

    return df_news_normalized


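# Illustrative sketch of the per-article 'entities' payload this function expects
# (field names taken from the code above; the concrete values are hypothetical):
#
#     row['entities'] = [{'type': 'equity', 'symbol': 'AAPL',
#                         'sentiment_score': 0.42, 'match_score': 12.3,
#                         'highlights': ['...']}]
#     # each such entity becomes one normalized record keyed by
#     # (symbol, 30-minute interval_timestamp in epoch ms)

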
def find_nearest_timestamp_matches(df_stocks, df_news, time_tolerance_minutes=30):
    """
    Find the nearest timestamp matches within a tolerance window.
    This handles cases where timestamps don't align exactly.
    """
    logger.info(f"=== FINDING NEAREST TIMESTAMP MATCHES (tolerance: {time_tolerance_minutes} min) ===")

    # With no news at all, return the stock data with zero-filled news columns.
    if df_news.empty:
        return df_stocks.assign(**{col: 0 for col in [
            'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', 'news_sentiment_max',
            'news_match_score_mean', 'news_match_score_max', 'news_highlights_count',
            'news_articles_count', 'latest_news_timestamp', 'news_sentiment_range',
            'news_activity_score', 'news_mentions_count'
        ]})

    tolerance_ms = time_tolerance_minutes * 60 * 1000

    # One lookup per unique (symbol, interval) pair in the stock data.
    stock_keys = df_stocks[['symbol', 'interval_timestamp']].drop_duplicates()

    matched_records = []

    for _, stock_row in stock_keys.iterrows():
        symbol = stock_row['symbol']
        stock_timestamp = stock_row['interval_timestamp']

        symbol_news = df_news[df_news['symbol'] == symbol].copy()

        if symbol_news.empty:
            continue

        # Absolute distance (in ms) between each news interval and the stock interval.
        symbol_news['time_diff'] = abs(symbol_news['interval_timestamp'] - stock_timestamp)

        nearby_news = symbol_news[symbol_news['time_diff'] <= tolerance_ms]

        if nearby_news.empty:
            continue

        # Aggregate every article that falls within the tolerance window.
        agg_data = {
            'symbol': symbol,
            'interval_timestamp': stock_timestamp,
            'news_sentiment_mean': nearby_news['sentiment_score'].mean(),
            'news_sentiment_std': nearby_news['sentiment_score'].std(),
            'news_sentiment_min': nearby_news['sentiment_score'].min(),
            'news_sentiment_max': nearby_news['sentiment_score'].max(),
            'news_match_score_mean': nearby_news['match_score'].mean(),
            'news_match_score_max': nearby_news['match_score'].max(),
            'news_highlights_count': nearby_news['highlights_count'].sum(),
            'news_articles_count': len(nearby_news),
            'latest_news_timestamp': nearby_news['published_at'].max(),
            'news_mentions_count': len(nearby_news)
        }

        agg_data['news_sentiment_range'] = agg_data['news_sentiment_max'] - agg_data['news_sentiment_min']
        agg_data['news_activity_score'] = agg_data['news_match_score_mean'] + agg_data['news_match_score_max']

        # Replace NaN aggregates (e.g. std of a single article) with 0.
        for key, value in agg_data.items():
            if pd.isna(value) and key not in ['symbol', 'interval_timestamp', 'latest_news_timestamp']:
                agg_data[key] = 0

        matched_records.append(agg_data)

    if matched_records:
        df_matched_news = pd.DataFrame(matched_records)
        logger.info(f"Found {len(df_matched_news)} symbol-timestamp matches")

        df_result = df_stocks.merge(
            df_matched_news,
            on=['symbol', 'interval_timestamp'],
            how='left'
        )
    else:
        logger.warning("No timestamp matches found within tolerance")
        df_result = df_stocks.copy()

    # Zero-fill the numeric news columns for stock rows without any matched news.
    news_columns = [
        'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', 'news_sentiment_max',
        'news_match_score_mean', 'news_match_score_max', 'news_highlights_count',
        'news_articles_count', 'news_sentiment_range', 'news_activity_score', 'news_mentions_count'
    ]

    for col in news_columns:
        if col in df_result.columns:
            df_result[col] = df_result[col].fillna(0)

    if 'news_articles_count' in df_result.columns:
        stocks_with_news = len(df_result[df_result['news_articles_count'] > 0])
        total_news_articles = df_result['news_articles_count'].sum()
        logger.info(f"Successfully matched news for {stocks_with_news} stock records out of {len(df_result)}")
        logger.info(f"Total news articles matched: {total_news_articles}")

    return df_result


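# A vectorized alternative (sketch only, not used by this script): pandas.merge_asof
# can attach the single nearest article per stock row, assuming both frames are
# sorted by 'interval_timestamp'. Note it does NOT aggregate every article inside
# the window the way find_nearest_timestamp_matches does above.
#
#     df_nearest = pd.merge_asof(
#         df_stocks.sort_values('interval_timestamp'),
#         df_news.sort_values('interval_timestamp'),
#         on='interval_timestamp',
#         by='symbol',
#         tolerance=time_tolerance_minutes * 60 * 1000,  # tolerance in ms
#         direction='nearest'
#     )

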
def diagnose_data_alignment(df_stocks, df_news):
    """
    Diagnose alignment issues between stock and news data.
    """
    logger.info("=== DATA ALIGNMENT DIAGNOSIS ===")

    # Symbol overlap between the two datasets.
    stock_symbols = set(df_stocks['symbol'].unique()) if 'symbol' in df_stocks.columns else set()
    news_symbols = set(df_news['symbol'].unique()) if len(df_news) > 0 and 'symbol' in df_news.columns else set()

    common_symbols = stock_symbols.intersection(news_symbols)

    logger.info(f"Stock symbols: {len(stock_symbols)} unique")
    logger.info(f"News symbols: {len(news_symbols)} unique")
    logger.info(f"Common symbols: {len(common_symbols)}")
    logger.info(f"Common symbols sample: {list(common_symbols)[:10]}")

    # Timestamp coverage on each side (epoch milliseconds).
    has_stock_ts = 'interval_timestamp' in df_stocks.columns
    has_news_ts = len(df_news) > 0 and 'interval_timestamp' in df_news.columns

    if has_stock_ts:
        stock_ts_min = df_stocks['interval_timestamp'].min()
        stock_ts_max = df_stocks['interval_timestamp'].max()
        stock_ts_range = pd.to_datetime([stock_ts_min, stock_ts_max], unit='ms')
        logger.info(f"Stock timestamp range: {stock_ts_range[0]} to {stock_ts_range[1]}")

    if has_news_ts:
        news_ts_min = df_news['interval_timestamp'].min()
        news_ts_max = df_news['interval_timestamp'].max()
        news_ts_range = pd.to_datetime([news_ts_min, news_ts_max], unit='ms')
        logger.info(f"News timestamp range: {news_ts_range[0]} to {news_ts_range[1]}")

    # Only compute the overlap window when both sides have timestamps;
    # otherwise the min/max values above are undefined.
    if has_stock_ts and has_news_ts:
        overlap_start = max(stock_ts_min, news_ts_min)
        overlap_end = min(stock_ts_max, news_ts_max)
        if overlap_start <= overlap_end:
            overlap_range = pd.to_datetime([overlap_start, overlap_end], unit='ms')
            logger.info(f"Timestamp overlap: {overlap_range[0]} to {overlap_range[1]}")
        else:
            logger.warning("No timestamp overlap between stock and news data")


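# Quick sanity-check sketch for the epoch-millisecond convention used throughout
# (illustrative value, not part of the pipeline):
#
#     pd.to_datetime(1700000000000, unit='ms')  # -> Timestamp('2023-11-14 22:13:20')

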
def parse_json_news_file(news_file_path):
    """
    Parse news file that contains JSON records (one per line or structured).
    """
    logger.info(f"Parsing news file: {news_file_path}")

    try:
        df_news = pd.read_parquet(news_file_path)
        logger.info(f"Successfully read parquet file with shape: {df_news.shape}")

        # Some exports store one JSON document per row in a single string column;
        # detect that layout and expand it into proper columns.
        if (len(df_news.columns) == 1 and len(df_news) > 0
                and isinstance(df_news.iloc[0, 0], str) and df_news.iloc[0, 0]):
            logger.info("Detected JSON strings in single column, parsing...")
            json_records = []
            for idx, row in df_news.iterrows():
                try:
                    json_data = json.loads(row.iloc[0])
                    json_records.append(json_data)
                except json.JSONDecodeError as e:
                    logger.warning(f"Failed to parse JSON at row {idx}: {e}")
                    continue

            if json_records:
                df_news = pd.DataFrame(json_records)
                logger.info(f"Parsed {len(json_records)} JSON records")

        return df_news

    except Exception as e:
        logger.error(f"Error reading news file: {e}")
        return pd.DataFrame()


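# Illustrative example of the single-column layout handled above (hypothetical content):
#
#     # raw parquet column 0, one JSON document per row
#     '{"uuid": "...", "title": "...", "published_at": "2024-01-01T14:32:00Z", "entities": [...]}'
#
# After parsing, each JSON key becomes a DataFrame column.

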
def main(stocks_file_path, news_file_path, output_file_path, time_tolerance_minutes=30):
    """
    Main function to normalize and merge stock and news data.
    """
    try:
        logger.info("=== STARTING DATA NORMALIZATION AND MERGE ===")

        logger.info("Step 1: Loading stock data...")
        df_stocks = pd.read_parquet(stocks_file_path)
        logger.info(f"Loaded stock data with shape: {df_stocks.shape}")

        logger.info("Step 2: Loading news data...")
        df_news_raw = parse_json_news_file(news_file_path)

        if df_news_raw.empty:
            logger.warning("No news data found, creating stock data with empty news columns")
            df_stocks = normalize_stock_data(df_stocks)

            for col in ['news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min',
                        'news_sentiment_max', 'news_match_score_mean', 'news_match_score_max',
                        'news_highlights_count', 'news_articles_count', 'latest_news_timestamp',
                        'news_sentiment_range', 'news_activity_score', 'news_mentions_count']:
                df_stocks[col] = 0 if col != 'latest_news_timestamp' else None
            df_stocks.to_parquet(output_file_path, index=False)
            logger.info("Saved stock data with empty news columns")
            return df_stocks

        logger.info("Step 3: Normalizing stock data...")
        df_stocks_norm = normalize_stock_data(df_stocks)

        logger.info("Step 4: Normalizing news data...")
        df_news_norm = normalize_news_data(df_news_raw)

        logger.info("Step 5: Diagnosing data alignment...")
        diagnose_data_alignment(df_stocks_norm, df_news_norm)

        logger.info("Step 6: Finding nearest timestamp matches and merging...")
        df_merged = find_nearest_timestamp_matches(
            df_stocks_norm,
            df_news_norm,
            time_tolerance_minutes=time_tolerance_minutes
        )

        logger.info("Step 7: Saving merged data...")
        df_merged.to_parquet(output_file_path, index=False)
        logger.info(f"Saved merged data to {output_file_path}")

        logger.info("=== MERGE COMPLETED ===")
        logger.info(f"Final dataset shape: {df_merged.shape}")

        news_cols = [col for col in df_merged.columns if col.startswith('news_')]
        logger.info(f"News columns added: {len(news_cols)}")

        if 'news_articles_count' in df_merged.columns:
            total_articles = df_merged['news_articles_count'].sum()
            records_with_news = len(df_merged[df_merged['news_articles_count'] > 0])
            logger.info(f"Total news articles merged: {total_articles}")
            logger.info(f"Stock records with news: {records_with_news} / {len(df_merged)}")

        return df_merged

    except Exception as e:
        logger.error(f"Error in main process: {e}")
        import traceback
        logger.error(traceback.format_exc())
        raise


if __name__ == "__main__":
    import os
    import sys

    # Paths: the merged output overwrites the stock features file in place.
    base_dir = "data/"
    stocks_file = os.path.join(base_dir, "merged/features/stocks_features.parquet")
    news_file = os.path.join(base_dir, "marketaux/news/news_latest.parquet")
    output_file = os.path.join(base_dir, "merged/features/stocks_features.parquet")

    if not os.path.exists(stocks_file):
        logger.error(f"Input file missing: {stocks_file}")
        print(f"ERROR: Input file missing: {stocks_file}")
        sys.exit(1)

    # 24-hour tolerance (in minutes) for matching news to stock intervals.
    df_result = main(
        stocks_file_path=stocks_file,
        news_file_path=news_file,
        output_file_path=output_file,
        time_tolerance_minutes=60 * 24
    )