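"""Merge MarketAux news sentiment features into an existing market-features dataset.

Pipeline: load news articles, extract per-symbol sentiment features from
tagged equity entities, aggregate them into summary metrics, and left-join
the result onto data/merged/features/merged_features.parquet.
"""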
import json
import os
import shutil

import numpy as np
import pandas as pd


def parse_news_data(file_path):
    """Parse a news data file containing one JSON object per line (JSONL)."""
    news_data = []

    with open(file_path, 'r') as f:
        content = f.read()

    lines = content.strip().split('\n')

    for line in lines:
        if line.strip():
            try:
                news_item = json.loads(line)
                news_data.append(news_item)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {line[:100]}...")
                print(f"Error: {e}")
                continue

    return news_data
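

# Example usage (the path is illustrative; main() below reads parquet instead):
#     articles = parse_news_data('data/marketaux/news/news_latest.jsonl')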


def extract_sentiment_features(news_data):
    """Extract sentiment features from news data for each symbol."""
    sentiment_features = {}

    for article in news_data:
        published_at = article.get('published_at')
        # Title/description are parsed here but not yet used downstream.
        title = article.get('title', '')
        description = article.get('description', '')

        entities = article.get('entities', [])

        for entity in entities:
            if entity.get('type') == 'equity':
                symbol = entity.get('symbol', '').lower()

                if symbol:
                    if symbol not in sentiment_features:
                        sentiment_features[symbol] = {
                            'news_sentiment_scores': [],
                            'news_match_scores': [],
                            'news_mentions_count': 0,
                            'news_articles_count': 0,
                            'latest_news_timestamp': None,
                            'news_highlights_count': 0
                        }

                    sentiment_score = entity.get('sentiment_score')
                    match_score = entity.get('match_score')

                    if sentiment_score is not None:
                        sentiment_features[symbol]['news_sentiment_scores'].append(sentiment_score)

                    if match_score is not None:
                        sentiment_features[symbol]['news_match_scores'].append(match_score)

                    highlights = entity.get('highlights', [])
                    sentiment_features[symbol]['news_highlights_count'] += len(highlights)

                    if published_at:
                        if (sentiment_features[symbol]['latest_news_timestamp'] is None or
                                published_at > sentiment_features[symbol]['latest_news_timestamp']):
                            sentiment_features[symbol]['latest_news_timestamp'] = published_at

                    sentiment_features[symbol]['news_mentions_count'] += 1

        # Count each article once per symbol it mentions, regardless of how
        # many entity records reference that symbol.
        mentioned_symbols = set(entity.get('symbol', '').lower() for entity in entities
                                if entity.get('type') == 'equity' and entity.get('symbol'))

        for symbol in mentioned_symbols:
            if symbol in sentiment_features:
                sentiment_features[symbol]['news_articles_count'] += 1

    return sentiment_features


def aggregate_sentiment_features(sentiment_data):
    """Aggregate sentiment features into final metrics."""
    aggregated = {}

    for symbol, data in sentiment_data.items():
        sentiment_scores = data['news_sentiment_scores']
        match_scores = data['news_match_scores']

        features = {
            'news_sentiment_mean': np.mean(sentiment_scores) if sentiment_scores else None,
            'news_sentiment_std': np.std(sentiment_scores) if len(sentiment_scores) > 1 else None,
            'news_sentiment_min': np.min(sentiment_scores) if sentiment_scores else None,
            'news_sentiment_max': np.max(sentiment_scores) if sentiment_scores else None,
            'news_match_score_mean': np.mean(match_scores) if match_scores else None,
            'news_match_score_max': np.max(match_scores) if match_scores else None,
            'news_mentions_count': data['news_mentions_count'],
            'news_articles_count': data['news_articles_count'],
            'news_highlights_count': data['news_highlights_count'],
            'latest_news_timestamp': data['latest_news_timestamp'],
            'news_sentiment_range': (np.max(sentiment_scores) - np.min(sentiment_scores)) if sentiment_scores else None,
            'news_activity_score': (data['news_mentions_count'] * np.mean(match_scores)) if match_scores else 0
        }

        aggregated[symbol] = features

    return aggregated


def merge_with_existing_features(news_features, existing_features_file):
    """Merge news features with existing market data features."""
    if existing_features_file.endswith('.parquet'):
        df_existing = pd.read_parquet(existing_features_file)
    else:
        df_existing = pd.read_csv(existing_features_file)

    print(f"Loaded existing features: {df_existing.shape}")
    print(f"News features available for {len(news_features)} symbols")

    news_columns = [
        'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min',
        'news_sentiment_max', 'news_match_score_mean', 'news_match_score_max',
        'news_mentions_count', 'news_articles_count', 'news_highlights_count',
        'latest_news_timestamp', 'news_sentiment_range', 'news_activity_score'
    ]

    # Initialise the timestamp column as object dtype so string timestamps
    # can be assigned without upcasting warnings; the rest stay numeric.
    for col in news_columns:
        df_existing[col] = None if col == 'latest_news_timestamp' else np.nan
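
    # Vectorized alternative (a sketch, not used here): build a DataFrame
    # from news_features and left-join on the 'symbol' column instead of
    # iterating row by row:
    #     news_df = pd.DataFrame.from_dict(news_features, orient='index')
    #     df_existing = df_existing.merge(news_df, left_on='symbol',
    #                                     right_index=True, how='left')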

    # Fill news feature columns row by row, matched on the 'symbol' column.
    symbols_matched = 0
    for idx, row in df_existing.iterrows():
        symbol = row['symbol']
        if symbol in news_features:
            for col in news_columns:
                df_existing.loc[idx, col] = news_features[symbol].get(col)
            symbols_matched += 1

    print(f"Matched news features for {symbols_matched} rows out of {len(df_existing)} total records")

    return df_existing


def main():
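    # Expected layout, per the paths below:
    #   data/marketaux/news/news_latest.parquet      <- raw news articles
    #   data/merged/features/merged_features.parquet <- features, updated in place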
    news_file = os.path.join('data', 'marketaux', 'news', 'news_latest.parquet')
    existing_features_file = os.path.join('data', 'merged', 'features', 'merged_features.parquet')
    output_file = os.path.join('data', 'merged', 'features', 'merged_features.parquet')

    if not os.path.exists(news_file):
        print(f"WARNING: News file not found: {news_file}")
        print("This usually happens when MarketAux API keys are exhausted.")
        print("Skipping news sentiment merge and keeping existing features unchanged.")

        if os.path.exists(existing_features_file):
            # Only copy when the paths differ: copying a file onto itself
            # raises shutil.SameFileError, and input and output paths are
            # currently identical.
            if os.path.abspath(existing_features_file) != os.path.abspath(output_file):
                shutil.copy2(existing_features_file, output_file)
                print(f"Copied existing features to output: {output_file}")
            else:
                print(f"Existing features already at output path: {output_file}")
        else:
            print(f"WARNING: No existing features file found at {existing_features_file}")
        return

    print("Step 1: Loading news data from parquet...")
    try:
        news_df = pd.read_parquet(news_file)
        news_data = news_df.to_dict(orient='records')
        print(f"Loaded {len(news_data)} news articles from {news_file}")
    except Exception as e:
        print(f"ERROR: Failed to load news data: {e}")
        print("Skipping news sentiment merge.")

        if os.path.exists(existing_features_file):
            # Same guard as above against shutil.SameFileError.
            if os.path.abspath(existing_features_file) != os.path.abspath(output_file):
                shutil.copy2(existing_features_file, output_file)
                print(f"Copied existing features to output: {output_file}")
        return

    print("Step 2: Extracting sentiment features...")
    sentiment_data = extract_sentiment_features(news_data)
    print(f"Extracted sentiment data for {len(sentiment_data)} symbols")

    print("Step 3: Aggregating sentiment metrics...")
    news_features = aggregate_sentiment_features(sentiment_data)

    # Preview the features for the first few symbols.
    print("\nSample of extracted news features:")
    for symbol, features in list(news_features.items())[:3]:
        print(f"\n{symbol.upper()}:")
        for key, value in features.items():
            if value is not None:
                if isinstance(value, float):
                    print(f"  {key}: {value:.4f}")
                else:
                    print(f"  {key}: {value}")

    print("\nStep 4: Merging with existing features...")
    try:
        merged_df = merge_with_existing_features(news_features, existing_features_file)

        # Drop a known-unneeded column if present.
        if 'links.pulsex' in merged_df.columns:
            merged_df = merged_df.drop(columns=['links.pulsex'])

        print("Step 5: Saving merged features...")
        merged_df.to_parquet(output_file, index=False)
        print(f"Saved merged features to {output_file}")
        print(f"Final dataset shape: {merged_df.shape}")

        # Report how many rows actually received each news feature.
        news_cols = [col for col in merged_df.columns if col.startswith('news_')]
        print("\nNews feature coverage:")
        for col in news_cols:
            non_null_count = merged_df[col].notna().sum()
            coverage = non_null_count / len(merged_df) * 100
            print(f"  {col}: {non_null_count}/{len(merged_df)} ({coverage:.1f}%)")

    except Exception as e:
        print(f"Error during merging: {e}")
        print("Make sure your merged_features.parquet file exists and is accessible")


if __name__ == "__main__":
    main()