import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import json
import logging
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def normalize_stock_data(df_stocks):
"""
Normalize stock data to ensure consistent format for merging.
"""
logger.info("=== NORMALIZING STOCK DATA ===")
df_stocks = df_stocks.copy()
# Normalize symbol to uppercase and strip whitespace
df_stocks['symbol'] = df_stocks['symbol'].astype(str).str.upper().str.strip()
# Ensure interval_timestamp is int64 (Unix timestamp in milliseconds)
if 'interval_timestamp' in df_stocks.columns:
# Coerce to numeric and drop rows whose timestamp cannot be parsed,
# since astype('int64') raises on NaN
df_stocks['interval_timestamp'] = pd.to_numeric(df_stocks['interval_timestamp'], errors='coerce')
df_stocks = df_stocks.dropna(subset=['interval_timestamp'])
df_stocks['interval_timestamp'] = df_stocks['interval_timestamp'].astype('int64')
logger.info(f"Stock timestamp range: {df_stocks['interval_timestamp'].min()} to {df_stocks['interval_timestamp'].max()}")
logger.info(f"Stock timestamp sample: {df_stocks['interval_timestamp'].head().tolist()}")
logger.info(f"Stock symbols sample: {df_stocks['symbol'].unique()[:10].tolist()}")
logger.info(f"Stock data shape: {df_stocks.shape}")
return df_stocks
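# Example (illustrative): a 30-minute bar starting 2024-01-02 09:30:00 UTC corresponds to
# interval_timestamp 1704187800000 (Unix epoch milliseconds).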
def normalize_news_data(df_news):
"""
Normalize news data to ensure consistent format for merging.
"""
logger.info("=== NORMALIZING NEWS DATA ===")
df_news = df_news.copy()
# Extract entities and create individual records
news_records = []
for idx, row in df_news.iterrows():
entities = row.get('entities', [])
# Only proceed if entities is a non-empty list or ndarray
if not isinstance(entities, (list, np.ndarray)) or len(entities) == 0:
continue
# Convert published_at to a pandas Timestamp (handles both strings and datetime objects)
try:
published_dt = pd.to_datetime(row['published_at'])
except Exception as e:
logger.warning(f"Could not parse published_at for row {idx}: {e}")
continue
if pd.isna(published_dt):
logger.warning(f"Missing published_at for row {idx}")
continue
# Process each entity
for entity in entities:
if not isinstance(entity, dict):
continue
# Only process equity type entities with symbols
if entity.get('type') == 'equity' and 'symbol' in entity:
symbol = str(entity['symbol']).upper().strip()
# Create 30-minute intervals (matching your stock data)
interval_dt = published_dt.floor('30min')
# Convert to Unix timestamp in milliseconds
interval_timestamp = int(interval_dt.timestamp() * 1000)
news_records.append({
'symbol': symbol,
'interval_timestamp': interval_timestamp,
'published_at': published_dt,
'sentiment_score': entity.get('sentiment_score', 0),
'match_score': entity.get('match_score', 0),
'highlights_count': len(entity.get('highlights', [])),
'news_uuid': row.get('uuid', ''),
'news_title': row.get('title', ''),
'news_source': row.get('source', ''),
'relevance_score': row.get('relevance_score', 0)
})
if not news_records:
logger.warning("No valid news records found")
return pd.DataFrame()
df_news_normalized = pd.DataFrame(news_records)
logger.info(f"Normalized news data shape: {df_news_normalized.shape}")
# Log columns that are completely null and those that are not
null_columns = [col for col in df_news_normalized.columns if df_news_normalized[col].isnull().all()]
not_null_columns = [col for col in df_news_normalized.columns if not df_news_normalized[col].isnull().all()]
logger.info(f"Completely null columns: {null_columns}")
logger.info(f"Non-null columns: {not_null_columns}")
logger.info(f"News symbols sample: {df_news_normalized['symbol'].unique()[:10].tolist()}")
logger.info(f"News timestamp range: {df_news_normalized['interval_timestamp'].min()} to {df_news_normalized['interval_timestamp'].max()}")
logger.info(f"News timestamp sample: {df_news_normalized['interval_timestamp'].head().tolist()}")
return df_news_normalized
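# For reference, normalize_news_data expects raw news records shaped roughly like the
# hypothetical example below; the field names are inferred from the accesses above.
# {
#     "uuid": "abc-123",
#     "title": "Example headline",
#     "source": "example.com",
#     "published_at": "2024-01-02T09:41:00Z",
#     "relevance_score": 0.8,
#     "entities": [
#         {"type": "equity", "symbol": "AAPL", "sentiment_score": 0.4,
#          "match_score": 12.3, "highlights": [{"highlight": "..."}]}
#     ]
# }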
def find_nearest_timestamp_matches(df_stocks, df_news, time_tolerance_minutes=30):
"""
Find the nearest timestamp matches within a tolerance window.
This handles cases where timestamps don't align exactly.
"""
logger.info(f"=== FINDING NEAREST TIMESTAMP MATCHES (tolerance: {time_tolerance_minutes} min) ===")
if df_news.empty:
logger.warning("No normalized news records to match; returning stock data with empty news columns")
news_defaults = {col: 0 for col in [
'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', 'news_sentiment_max',
'news_match_score_mean', 'news_match_score_max', 'news_highlights_count',
'news_articles_count', 'news_sentiment_range',
'news_activity_score', 'news_mentions_count'
]}
news_defaults['latest_news_timestamp'] = None
return df_stocks.assign(**news_defaults)
# Convert tolerance to milliseconds
tolerance_ms = time_tolerance_minutes * 60 * 1000
# Get unique combinations for efficient processing
stock_keys = df_stocks[['symbol', 'interval_timestamp']].drop_duplicates()
matched_records = []
for _, stock_row in stock_keys.iterrows():
symbol = stock_row['symbol']
stock_timestamp = stock_row['interval_timestamp']
# Find news for this symbol
symbol_news = df_news[df_news['symbol'] == symbol].copy()
if symbol_news.empty:
continue
# Calculate time differences
symbol_news['time_diff'] = abs(symbol_news['interval_timestamp'] - stock_timestamp)
# Filter within tolerance
nearby_news = symbol_news[symbol_news['time_diff'] <= tolerance_ms]
if nearby_news.empty:
continue
# Aggregate the nearby news
agg_data = {
'symbol': symbol,
'interval_timestamp': stock_timestamp,
'news_sentiment_mean': nearby_news['sentiment_score'].mean(),
'news_sentiment_std': nearby_news['sentiment_score'].std(),
'news_sentiment_min': nearby_news['sentiment_score'].min(),
'news_sentiment_max': nearby_news['sentiment_score'].max(),
'news_match_score_mean': nearby_news['match_score'].mean(),
'news_match_score_max': nearby_news['match_score'].max(),
'news_highlights_count': nearby_news['highlights_count'].sum(),
'news_articles_count': len(nearby_news),
'latest_news_timestamp': nearby_news['published_at'].max(),
'news_mentions_count': len(nearby_news)
}
# Calculate additional features
agg_data['news_sentiment_range'] = agg_data['news_sentiment_max'] - agg_data['news_sentiment_min']
agg_data['news_activity_score'] = agg_data['news_match_score_mean'] + agg_data['news_match_score_max']
# Fill NaN values
for key, value in agg_data.items():
if pd.isna(value) and key not in ['symbol', 'interval_timestamp', 'latest_news_timestamp']:
agg_data[key] = 0
matched_records.append(agg_data)
if matched_records:
df_matched_news = pd.DataFrame(matched_records)
logger.info(f"Found {len(df_matched_news)} symbol-timestamp matches")
# Merge with stock data
df_result = df_stocks.merge(
df_matched_news,
on=['symbol', 'interval_timestamp'],
how='left'
)
else:
logger.warning("No timestamp matches found within tolerance")
df_result = df_stocks.copy()
# Fill remaining NaN values for stocks without news
news_columns = [
'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', 'news_sentiment_max',
'news_match_score_mean', 'news_match_score_max', 'news_highlights_count',
'news_articles_count', 'news_sentiment_range', 'news_activity_score', 'news_mentions_count'
]
for col in news_columns:
if col in df_result.columns:
df_result[col] = df_result[col].fillna(0)
# Report results
if 'news_articles_count' in df_result.columns:
stocks_with_news = len(df_result[df_result['news_articles_count'] > 0])
total_news_articles = df_result['news_articles_count'].sum()
logger.info(f"Successfully matched news for {stocks_with_news} stock records out of {len(df_result)}")
logger.info(f"Total news articles matched: {total_news_articles}")
return df_result
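# A vectorized alternative (sketch only, not used above): pd.merge_asof can attach the
# single nearest news record to each stock row within the tolerance window. Unlike
# find_nearest_timestamp_matches it does not aggregate every article in the window,
# so the resulting columns differ; the helper name below is hypothetical.
def nearest_news_asof(df_stocks, df_news, time_tolerance_minutes=30):
    tolerance_ms = time_tolerance_minutes * 60 * 1000
    # merge_asof requires both frames sorted on the key; 'by' keeps matches within a symbol
    left = df_stocks.sort_values('interval_timestamp')
    right = df_news.sort_values('interval_timestamp')
    return pd.merge_asof(
        left,
        right,
        on='interval_timestamp',
        by='symbol',
        direction='nearest',
        tolerance=tolerance_ms,
    )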
def diagnose_data_alignment(df_stocks, df_news):
"""
Diagnose alignment issues between stock and news data.
"""
logger.info("=== DATA ALIGNMENT DIAGNOSIS ===")
# Check symbol overlap
stock_symbols = set(df_stocks['symbol'].unique()) if 'symbol' in df_stocks.columns else set()
news_symbols = set(df_news['symbol'].unique()) if len(df_news) > 0 and 'symbol' in df_news.columns else set()
common_symbols = stock_symbols.intersection(news_symbols)
logger.info(f"Stock symbols: {len(stock_symbols)} unique")
logger.info(f"News symbols: {len(news_symbols)} unique")
logger.info(f"Common symbols: {len(common_symbols)}")
logger.info(f"Common symbols sample: {list(common_symbols)[:10]}")
# Check timestamp ranges
if 'interval_timestamp' in df_stocks.columns:
stock_ts_min = df_stocks['interval_timestamp'].min()
stock_ts_max = df_stocks['interval_timestamp'].max()
stock_ts_range = pd.to_datetime([stock_ts_min, stock_ts_max], unit='ms')
logger.info(f"Stock timestamp range: {stock_ts_range[0]} to {stock_ts_range[1]}")
if len(df_news) > 0 and 'interval_timestamp' in df_news.columns:
news_ts_min = df_news['interval_timestamp'].min()
news_ts_max = df_news['interval_timestamp'].max()
news_ts_range = pd.to_datetime([news_ts_min, news_ts_max], unit='ms')
logger.info(f"News timestamp range: {news_ts_range[0]} to {news_ts_range[1]}")
# Check for timestamp overlap (only when both datasets have timestamps)
if 'interval_timestamp' in df_stocks.columns and len(df_news) > 0 and 'interval_timestamp' in df_news.columns:
overlap_start = max(stock_ts_min, news_ts_min)
overlap_end = min(stock_ts_max, news_ts_max)
if overlap_start <= overlap_end:
overlap_range = pd.to_datetime([overlap_start, overlap_end], unit='ms')
logger.info(f"Timestamp overlap: {overlap_range[0]} to {overlap_range[1]}")
else:
logger.warning("No timestamp overlap between stock and news data")
def parse_json_news_file(news_file_path):
"""
Parse a news parquet file whose records may be structured columns or a single column of JSON strings.
"""
logger.info(f"Parsing news file: {news_file_path}")
try:
# Try reading as parquet first
df_news = pd.read_parquet(news_file_path)
logger.info(f"Successfully read parquet file with shape: {df_news.shape}")
# Check if the data contains JSON strings that need parsing
if len(df_news.columns) == 1 and len(df_news) > 0 and isinstance(df_news.iloc[0, 0], str):
logger.info("Detected JSON strings in single column, parsing...")
json_records = []
for idx, row in df_news.iterrows():
try:
json_data = json.loads(row.iloc[0])
json_records.append(json_data)
except json.JSONDecodeError as e:
logger.warning(f"Failed to parse JSON at row {idx}: {e}")
continue
if json_records:
df_news = pd.DataFrame(json_records)
logger.info(f"Parsed {len(json_records)} JSON records")
return df_news
except Exception as e:
logger.error(f"Error reading news file: {e}")
return pd.DataFrame()
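# Illustration (hypothetical layout): parse_json_news_file also handles a one-column
# parquet where each cell is a JSON string such as '{"uuid": "...", "entities": [...]}',
# expanding it into one row per record; an already-structured parquet passes through unchanged.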
def main(stocks_file_path, news_file_path, output_file_path, time_tolerance_minutes=30):
"""
Main function to normalize and merge stock and news data.
"""
try:
logger.info("=== STARTING DATA NORMALIZATION AND MERGE ===")
# Step 1: Load stock data
logger.info("Step 1: Loading stock data...")
df_stocks = pd.read_parquet(stocks_file_path)
logger.info(f"Loaded stock data with shape: {df_stocks.shape}")
# Step 2: Load and parse news data
logger.info("Step 2: Loading news data...")
df_news_raw = parse_json_news_file(news_file_path)
if df_news_raw.empty:
logger.warning("No news data found, creating stock data with empty news columns")
df_stocks = normalize_stock_data(df_stocks)
# Add empty news columns
for col in ['news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min',
'news_sentiment_max', 'news_match_score_mean', 'news_match_score_max',
'news_highlights_count', 'news_articles_count', 'latest_news_timestamp',
'news_sentiment_range', 'news_activity_score', 'news_mentions_count']:
df_stocks[col] = 0 if col != 'latest_news_timestamp' else None
df_stocks.to_parquet(output_file_path, index=False)
logger.info("Saved stock data with empty news columns")
return df_stocks
# Step 3: Normalize both datasets
logger.info("Step 3: Normalizing stock data...")
df_stocks_norm = normalize_stock_data(df_stocks)
logger.info("Step 4: Normalizing news data...")
df_news_norm = normalize_news_data(df_news_raw)
# Step 5: Diagnose alignment
logger.info("Step 5: Diagnosing data alignment...")
diagnose_data_alignment(df_stocks_norm, df_news_norm)
# Step 6: Find nearest timestamp matches and merge
logger.info("Step 6: Finding nearest timestamp matches and merging...")
df_merged = find_nearest_timestamp_matches(
df_stocks_norm,
df_news_norm,
time_tolerance_minutes=time_tolerance_minutes
)
# Step 7: Save results
logger.info("Step 7: Saving merged data...")
df_merged.to_parquet(output_file_path, index=False)
logger.info(f"Saved merged data to {output_file_path}")
# Final report
logger.info("=== MERGE COMPLETED ===")
logger.info(f"Final dataset shape: {df_merged.shape}")
news_cols = [col for col in df_merged.columns if col.startswith('news_')]
logger.info(f"News columns added: {len(news_cols)}")
if 'news_articles_count' in df_merged.columns:
total_articles = df_merged['news_articles_count'].sum()
records_with_news = len(df_merged[df_merged['news_articles_count'] > 0])
logger.info(f"Total news articles merged: {total_articles}")
logger.info(f"Stock records with news: {records_with_news} / {len(df_merged)}")
return df_merged
except Exception as e:
logger.error(f"Error in main process: {e}")
import traceback
logger.error(traceback.format_exc())
raise
# Example usage
if __name__ == "__main__":
import os
# Update these paths to match your actual file locations
base_dir = "data/" # Update this
stocks_file = os.path.join(base_dir, "merged/features/stocks_features.parquet")
news_file = os.path.join(base_dir, "marketaux/news/news_latest.parquet")
output_file = os.path.join(base_dir, "merged/features/stocks_features.parquet")
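# Note: output_file is the same path as stocks_file, so the merge overwrites the
# stock features parquet in place.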
# Check if stocks_features.parquet exists before running
if not os.path.exists(stocks_file):
logger.error(f"Input file missing: {stocks_file}")
print(f"ERROR: Input file missing: {stocks_file}")
raise SystemExit(1)
# Run the merge with a 24-hour tolerance (adjust as needed)
df_result = main(
stocks_file_path=stocks_file,
news_file_path=news_file,
output_file_path=output_file,
time_tolerance_minutes=60*24 # Adjust this based on your needs
)