import json
import os
import shutil

import numpy as np
import pandas as pd

def parse_news_data(file_path):
    """Parse the news data file containing multiple JSON objects per line"""
    news_data = []
    
    with open(file_path, 'r') as f:
        content = f.read()
        
    # Split by newlines and parse each JSON object
    lines = content.strip().split('\n')
    
    for line in lines:
        if line.strip():
            try:
                news_item = json.loads(line)
                news_data.append(news_item)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {line[:100]}...")
                print(f"Error: {e}")
                continue
    
    return news_data
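
# Shape of a Marketaux-style article record that extract_sentiment_features
# expects (field names are the ones read below; the values are hypothetical):
#
#   {
#       "published_at": "2024-01-15T09:30:00.000000Z",
#       "entities": [
#           {
#               "type": "equity",
#               "symbol": "AAPL",
#               "sentiment_score": 0.42,
#               "match_score": 12.5,
#               "highlights": []   # only the number of highlights is used
#           }
#       ]
#   }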

def extract_sentiment_features(news_data):
    """Extract sentiment features from news data for each symbol"""
    sentiment_features = {}
    
    for article in news_data:
        # Article-level info: only the timestamp feeds the features below
        published_at = article.get('published_at')
        
        # Process entities (stocks mentioned in the article); rows loaded from
        # parquet may carry None here instead of a list
        entities = article.get('entities')
        if entities is None:
            continue
        
        for entity in entities:
            if entity.get('type') == 'equity':
                symbol = entity.get('symbol', '').lower()  # Convert to lowercase
                
                if symbol:
                    if symbol not in sentiment_features:
                        sentiment_features[symbol] = {
                            'news_sentiment_scores': [],
                            'news_match_scores': [],
                            'news_mentions_count': 0,
                            'news_articles_count': 0,
                            'latest_news_timestamp': None,
                            'news_highlights_count': 0
                        }
                    
                    # Add sentiment and match scores
                    sentiment_score = entity.get('sentiment_score')
                    match_score = entity.get('match_score')
                    
                    if sentiment_score is not None:
                        sentiment_features[symbol]['news_sentiment_scores'].append(sentiment_score)
                    
                    if match_score is not None:
                        sentiment_features[symbol]['news_match_scores'].append(match_score)
                    
                    # Count highlights
                    highlights = entity.get('highlights', [])
                    sentiment_features[symbol]['news_highlights_count'] += len(highlights)
                    
                    # Update latest timestamp
                    if published_at:
                        if (sentiment_features[symbol]['latest_news_timestamp'] is None or 
                            published_at > sentiment_features[symbol]['latest_news_timestamp']):
                            sentiment_features[symbol]['latest_news_timestamp'] = published_at
                    
                    sentiment_features[symbol]['news_mentions_count'] += 1
        
        # Count unique articles per symbol
        mentioned_symbols = set(entity.get('symbol', '').lower() for entity in entities 
                               if entity.get('type') == 'equity' and entity.get('symbol'))
        
        for symbol in mentioned_symbols:
            if symbol in sentiment_features:
                sentiment_features[symbol]['news_articles_count'] += 1
    
    return sentiment_features

def aggregate_sentiment_features(sentiment_data):
    """Aggregate sentiment features into final metrics"""
    aggregated = {}
    
    for symbol, data in sentiment_data.items():
        # Calculate aggregated metrics
        sentiment_scores = data['news_sentiment_scores']
        match_scores = data['news_match_scores']
        
        features = {
            'news_sentiment_mean': np.mean(sentiment_scores) if sentiment_scores else None,
            'news_sentiment_std': np.std(sentiment_scores) if len(sentiment_scores) > 1 else None,
            'news_sentiment_min': np.min(sentiment_scores) if sentiment_scores else None,
            'news_sentiment_max': np.max(sentiment_scores) if sentiment_scores else None,
            'news_match_score_mean': np.mean(match_scores) if match_scores else None,
            'news_match_score_max': np.max(match_scores) if match_scores else None,
            'news_mentions_count': data['news_mentions_count'],
            'news_articles_count': data['news_articles_count'],
            'news_highlights_count': data['news_highlights_count'],
            'latest_news_timestamp': data['latest_news_timestamp'],
            'news_sentiment_range': (np.max(sentiment_scores) - np.min(sentiment_scores)) if len(sentiment_scores) > 0 else None,
            'news_activity_score': data['news_mentions_count'] * np.mean(match_scores) if match_scores else 0
        }
        
        aggregated[symbol] = features
    
    return aggregated
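
# Minimal smoke test for the extract + aggregate pair (hypothetical record;
# run ad hoc in a REPL, not part of the pipeline):
#
#   >>> toy = [{"published_at": "2024-01-15T09:30:00Z",
#   ...         "entities": [{"type": "equity", "symbol": "AAPL",
#   ...                       "sentiment_score": 0.4, "match_score": 10.0,
#   ...                       "highlights": []}]}]
#   >>> feats = aggregate_sentiment_features(extract_sentiment_features(toy))
#   >>> feats["aapl"]["news_mentions_count"]
#   1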

def merge_with_existing_features(news_features, existing_features_file):
    """Merge news features with existing market data features"""
    
    # Load existing features
    if existing_features_file.endswith('.parquet'):
        df_existing = pd.read_parquet(existing_features_file)
    else:
        df_existing = pd.read_csv(existing_features_file)
    
    print(f"Loaded existing features: {df_existing.shape}")
    print(f"News features available for {len(news_features)} symbols")
    
    # Add news features as new columns
    news_columns = [
        'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', 
        'news_sentiment_max', 'news_match_score_mean', 'news_match_score_max',
        'news_mentions_count', 'news_articles_count', 'news_highlights_count',
        'latest_news_timestamp', 'news_sentiment_range', 'news_activity_score'
    ]
    
    # Initialize the numeric news columns with NaN; the timestamp column holds
    # ISO strings, so start it as object-typed None to avoid dtype clashes
    for col in news_columns:
        df_existing[col] = None if col == 'latest_news_timestamp' else np.nan
    
    # Fill in news features where available; news_features keys are lowercase,
    # so normalize the symbol before the lookup
    symbols_matched = 0
    for idx, row in df_existing.iterrows():
        symbol = str(row['symbol']).lower()
        if symbol in news_features:
            for col in news_columns:
                # The keys in news_features already have the correct names
                df_existing.loc[idx, col] = news_features[symbol].get(col, None)
            symbols_matched += 1
    
    print(f"Matched news features for {symbols_matched} symbols out of {len(df_existing)} total records")
    
    return df_existing
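
# A vectorized alternative to the per-row loop above (a sketch, assuming the
# 'symbol' column is string-typed; missing symbols simply map to None):
#
#   sym = df_existing['symbol'].str.lower()
#   for col in news_columns:
#       df_existing[col] = sym.map(lambda s: news_features.get(s, {}).get(col))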

def main():
    # Configuration: news comes from the Marketaux parquet export and the
    # merged features file is updated in place
    news_file = os.path.join('data', 'marketaux', 'news', 'news_latest.parquet')
    existing_features_file = os.path.join('data', 'merged', 'features', 'merged_features.parquet')
    output_file = existing_features_file  # merged features are written back in place

    # Check if news file exists
    if not os.path.exists(news_file):
        print(f"WARNING: News file not found: {news_file}")
        print("This usually happens when MarketAux API keys are exhausted.")
        print("Skipping news sentiment merge and keeping existing features unchanged.")
        
        # Preserve existing features if they exist; skip the copy when input
        # and output are the same path (shutil.copy2 would raise SameFileError)
        if os.path.exists(existing_features_file):
            if os.path.abspath(existing_features_file) != os.path.abspath(output_file):
                shutil.copy2(existing_features_file, output_file)
            print(f"Existing features preserved at: {output_file}")
        else:
            print(f"WARNING: No existing features file found at {existing_features_file}")
        return

    print("Step 1: Loading news data from parquet...")
    try:
        news_df = pd.read_parquet(news_file)
        news_data = news_df.to_dict(orient='records')
        print(f"Loaded {len(news_data)} news articles from {news_file}")
    except Exception as e:
        print(f"ERROR: Failed to load news data: {e}")
        print("Skipping news sentiment merge.")
        
        # Fall back to the existing features; skip the self-copy case
        if os.path.exists(existing_features_file):
            if os.path.abspath(existing_features_file) != os.path.abspath(output_file):
                shutil.copy2(existing_features_file, output_file)
            print(f"Existing features preserved at: {output_file}")
        return
    
    print("Step 2: Extracting sentiment features...")
    sentiment_data = extract_sentiment_features(news_data)
    print(f"Extracted sentiment data for {len(sentiment_data)} symbols")
    
    print("Step 3: Aggregating sentiment metrics...")
    news_features = aggregate_sentiment_features(sentiment_data)
    
    # Display sample of extracted features
    print("\nSample of extracted news features:")
    for symbol, features in list(news_features.items())[:3]:
        print(f"\n{symbol.upper()}:")
        for key, value in features.items():
            if value is not None:
                if isinstance(value, float):
                    print(f"  {key}: {value:.4f}")
                else:
                    print(f"  {key}: {value}")
    
    print(f"\nStep 4: Merging with existing features...")
    try:
        merged_df = merge_with_existing_features(news_features, existing_features_file)

        # Remove 'links.pulsex' column if present
        if 'links.pulsex' in merged_df.columns:
            merged_df = merged_df.drop(columns=['links.pulsex'])

        print(f"Step 5: Saving merged features...")
        merged_df.to_parquet(output_file, index=False)
        print(f"Saved merged features to {output_file}")
        print(f"Final dataset shape: {merged_df.shape}")

        # Show summary of news feature coverage
        news_cols = [col for col in merged_df.columns if col.startswith('news_')]
        print(f"\nNews feature coverage:")
        for col in news_cols:
            non_null_count = merged_df[col].notna().sum()
            coverage = non_null_count / len(merged_df) * 100
            print(f"  {col}: {non_null_count}/{len(merged_df)} ({coverage:.1f}%)")
            
    except Exception as e:
        print(f"Error during merging: {e}")
        print("Make sure your merged_features.parquet file exists and is accessible")

if __name__ == "__main__":
    main()