File size: 15,042 Bytes
c49b21b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
import sys
import os
import numpy as np
import pandas as pd
from datetime import datetime

# Ensure src/merge is in the path for import
sys.path.append(os.path.dirname(__file__))

from alpaca_features import build_features, save

def create_symbol_mapping():
    """
    Create mapping between crypto full names and ticker symbols.

    Returns:
        tuple[dict, dict, dict]:
            - crypto_mapping: full (CoinGecko-style) name -> UPPERCASE ticker
            - reverse_mapping: lowercase ticker -> full name
            - forward_mapping: full name -> lowercase ticker

    NOTE(review): several full names share one ticker (e.g.
    'matic-network'/'polygon' -> MATIC, 'avalanche-2'/'avalanche' -> AVAX).
    In reverse_mapping the LAST inserted entry wins ('polygon',
    'avalanche'), so dict insertion order below is significant —
    confirm those are the intended canonical names before reordering.
    """
    # Common crypto symbol mappings
    crypto_mapping = {
        # Major cryptocurrencies
        'bitcoin': 'BTC',
        'ethereum': 'ETH',
        'binancecoin': 'BNB',
        'ripple': 'XRP',
        'cardano': 'ADA',
        'solana': 'SOL',
        'dogecoin': 'DOGE',
        'polkadot': 'DOT',
        'matic-network': 'MATIC',
        'polygon': 'MATIC',
        'avalanche-2': 'AVAX',
        'avalanche': 'AVAX',
        'chainlink': 'LINK',
        'litecoin': 'LTC',
        'bitcoin-cash': 'BCH',
        'stellar': 'XLM',
        'vechain': 'VET',
        'ethereum-classic': 'ETC',
        'filecoin': 'FIL',
        'tron': 'TRX',
        'monero': 'XMR',
        'eos': 'EOS',
        'aave': 'AAVE',
        'maker': 'MKR',
        'compound': 'COMP',
        'uniswap': 'UNI',
        'yearn-finance': 'YFI',
        'sushi': 'SUSHI',
        'curve-dao-token': 'CRV',
        'pancakeswap-token': 'CAKE',
        'terra-luna': 'LUNA',
        'fantom': 'FTM',
        'harmony': 'ONE',
        'near': 'NEAR',
        'algorand': 'ALGO',
        'cosmos': 'ATOM',
        'internet-computer': 'ICP',
        'helium': 'HNT',
        'theta-token': 'THETA',
        'chiliz': 'CHZ',
        'decentraland': 'MANA',
        'the-sandbox': 'SAND',
        'axie-infinity': 'AXS',
        'shiba-inu': 'SHIB',
        'apecoin': 'APE',
        'gala': 'GALA',
        'enjincoin': 'ENJ',
        'flow': 'FLOW',
        'basic-attention-token': 'BAT',
        'omg': 'OMG',
        'loopring': 'LRC',
        'immutable-x': 'IMX',
        'render-token': 'RNDR',
        'quant-network': 'QNT',
        'injective-protocol': 'INJ',
        'sei-network': 'SEI',
        'arbitrum': 'ARB',
        'optimism': 'OP',
        'blur': 'BLUR',
        'pepe': 'PEPE',
        'bonk': 'BONK',
        'wormhole': 'W',
        'jupiter-exchange-solana': 'JUP',
        'worldcoin-wld': 'WLD',
        'pyth-network': 'PYTH',
        'jito': 'JTO',
        'tensor': 'TNSR',
        'meme': 'MEME',
        'cat-in-a-dogs-world': 'MEW',
        'book-of-meme': 'BOME',
        'dogwifhat': 'WIF',
        'popcat': 'POPCAT',
        'goatseus-maximus': 'GOAT',
        'peanut-the-squirrel': 'PNUT',
        'act-i-the-ai-prophecy': 'ACT',
        'fartcoin': 'FARTCOIN',
        'ai16z': 'AI16Z',
        'virtual-protocol': 'VIRTUAL',
        'zerebro': 'ZEREBRO',
        'griffain': 'GRIFFAIN',
        'aixbt-by-virtuals': 'AIXBT',
        'marc-and-ethan-are-based': 'BASED',
        'pudgy-penguins': 'PENGU',
        'hyperliquid': 'HYPE',
        'move-movement': 'MOVE',
        'usual': 'USUAL',
        'reserve-rights': 'RSR',
        'ondo-finance': 'ONDO',
        'ethena': 'ENA',
        'eigenlayer': 'EIGEN',
        'grass': 'GRASS',
        'io': 'IO',
        'notcoin': 'NOT',
        'turbo': 'TURBO',
        'jasmy': 'JASMY',
        'neo': 'NEO',
        'iota': 'IOTA',
        'dash': 'DASH',
        'zcash': 'ZEC',
        'waves': 'WAVES',
    }

    # Create reverse mapping (ticker -> full name); on duplicate tickers
    # the last full name inserted above overwrites earlier ones.
    reverse_mapping = {v.lower(): k for k, v in crypto_mapping.items()}

    # Also add the forward mapping (full name -> lowercase ticker)
    forward_mapping = {k: v.lower() for k, v in crypto_mapping.items()}

    return crypto_mapping, reverse_mapping, forward_mapping

def normalize_symbols(df, symbol_col, is_alpaca=False):
    """
    Normalize symbols to handle crypto name/ticker differences and stock symbols.

    Args:
        df: DataFrame containing a symbol column.
        symbol_col: Name of the column holding the symbols.
        is_alpaca: If True, the symbols are Alpaca-style tickers (BTC, ETH
            for crypto; NVDA, AAPL for stocks) and crypto tickers are mapped
            back to full names so they line up with the merged dataset.
            If False, symbols are only lowercased.

    Returns:
        A copy of ``df`` with the symbol column lowercased (and crypto
        tickers replaced by full names when ``is_alpaca`` is True).
    """
    df = df.copy()
    # Only the ticker -> full-name direction is needed here; the other
    # two mappings returned by create_symbol_mapping() are unused.
    _, reverse_mapping, _ = create_symbol_mapping()

    # Lowercase once so both data sources use a consistent case.
    # (The original implementation lowercased a second time inside the
    # per-row mapper; that was redundant and has been removed.)
    df[symbol_col] = df[symbol_col].str.lower()

    if is_alpaca:
        # Crypto tickers are translated to full names to match the merged
        # data; anything not in the mapping (i.e. stock tickers) passes
        # through unchanged in lowercase.
        df[symbol_col] = df[symbol_col].map(lambda s: reverse_mapping.get(s, s))
    # Merged data already uses full crypto names and lowercase stock
    # tickers, so the lowercase pass above is sufficient for it.

    return df

def merge_alpaca_features():
    """
    Merge Alpaca features with existing merged features.
    Handles timestamp alignment, column conflicts, and symbol mapping.

    Reads  : <DATA_DIR>/merged/features/merged_features.parquet
    Writes : the same parquet path, overwritten with the merged result.
    Returns: the final merged DataFrame.

    Timestamps: ``interval_timestamp`` (merged data) and ``timestamp``
    (Alpaca data) are treated as epoch milliseconds — see the
    ``pd.to_datetime(..., unit="ms")`` conversions below.
    """

    # Step 1: Build the fresh Alpaca feature frame and persist it.
    alpaca_df = build_features()
    save(alpaca_df)

    # Step 2: Load merged features. Prefer the app's configured DATA_DIR;
    # fall back to the DATA_DIR env var, then to /data.
    try:
        from src import config as app_config
        base_dir = app_config.DATA_DIR
    except Exception:
        base_dir = os.environ.get("DATA_DIR", "/data")
    merged_path = os.path.join(base_dir, "merged", "features", "merged_features.parquet")

    merged_df = pd.read_parquet(merged_path)

    # Normalize symbols on both sides so they join on the same key space
    # (full crypto names + lowercase stock tickers).
    alpaca_df_normalized = normalize_symbols(alpaca_df, "symbol", is_alpaca=True)
    merged_df_normalized = normalize_symbols(merged_df, "symbol", is_alpaca=False)

    # Find overlapping symbols.
    # NOTE(review): `overlapping_symbols` is computed but never used below —
    # likely leftover debug scaffolding.
    alpaca_normalized = set(alpaca_df_normalized["symbol"].unique())
    merged_normalized = set(merged_df_normalized["symbol"].unique())
    overlapping_symbols = alpaca_normalized.intersection(merged_normalized)
    missing_in_merged = alpaca_normalized - merged_normalized


    # Step 6: Handle symbols that exist only in Alpaca data by synthesizing
    # merged-schema rows for them (one per Alpaca observation).
    if missing_in_merged:

        new_symbol_rows = []
        for missing_symbol in missing_in_merged:
            # Get actual data for this symbol from Alpaca.
            symbol_data = alpaca_df_normalized[alpaca_df_normalized["symbol"] == missing_symbol]
            if len(symbol_data) == 0:
                continue


            # Create rows based on Alpaca timestamps, not merged timestamps.
            # NOTE(review): the stock-ticker whitelist here is hard-coded and
            # duplicated further down in the alpaca_only_mask handling —
            # keep the two lists in sync.
            for _, alpaca_row in symbol_data.iterrows():
                new_row = {
                    "symbol": missing_symbol,
                    "interval_timestamp": alpaca_row["timestamp"],  # Use Alpaca timestamp
                    "is_stock": True if missing_symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] else False,
                    "is_crypto": False if missing_symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] else True,
                    "stock_market": "NASDAQ" if missing_symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] else None,
                    "feature_timestamp": pd.Timestamp.now().value // 1000000,  # ns -> ms
                }

                # Copy all Alpaca feature columns into the new row.
                for col in alpaca_row.index:
                    if col not in new_row:
                        new_row[col] = alpaca_row[col]

                # Add all other columns from merged_df with NaN values
                # (except the ones we set above).
                for col in merged_df_normalized.columns:
                    if col not in new_row:
                        new_row[col] = np.nan

                new_symbol_rows.append(new_row)

        if new_symbol_rows:
            new_symbols_df = pd.DataFrame(new_symbol_rows)
            merged_df_normalized = pd.concat([merged_df_normalized, new_symbols_df], ignore_index=True)

    # Step 7: Check for overlapping columns.
    # NOTE(review): `overlapping_cols` is computed but never used — the
    # actual conflict handling is delegated to the merge suffixes below.
    join_keys = ["symbol", "timestamp", "interval_timestamp"]
    alpaca_cols = set(alpaca_df_normalized.columns) - set(join_keys)
    merged_cols = set(merged_df_normalized.columns) - set(join_keys)
    overlapping_cols = alpaca_cols.intersection(merged_cols)

    # Convert epoch-ms timestamps to datetime for processing
    # (use pd.concat to avoid DataFrame fragmentation warnings).
    timestamp_columns = {}

    if "timestamp" in alpaca_df_normalized.columns:
        timestamp_columns["timestamp_dt"] = pd.to_datetime(alpaca_df_normalized["timestamp"], unit="ms")

    if "interval_timestamp" in merged_df_normalized.columns:
        timestamp_columns["interval_timestamp_dt"] = pd.to_datetime(merged_df_normalized["interval_timestamp"], unit="ms")

    # Add timestamp columns efficiently using pd.concat.
    if timestamp_columns:
        for col_name, col_data in timestamp_columns.items():
            if col_name == "timestamp_dt" and "timestamp" in alpaca_df_normalized.columns:
                alpaca_df_normalized = pd.concat([alpaca_df_normalized, col_data.to_frame(col_name)], axis=1)
            elif col_name == "interval_timestamp_dt" and "interval_timestamp" in merged_df_normalized.columns:
                merged_df_normalized = pd.concat([merged_df_normalized, col_data.to_frame(col_name)], axis=1)

    # Perform an OUTER merge to capture all data from both sources.
    # Conflicting non-key columns keep the merged-data name; the Alpaca
    # copy gets an "_alpaca" suffix.
    final_merge = pd.merge(
        merged_df_normalized,
        alpaca_df_normalized,
        left_on=["symbol", "interval_timestamp"],
        right_on=["symbol", "timestamp"],
        how="outer",  # Changed from "left" to "outer"
        suffixes=("", "_alpaca")
    )

    # For rows that came only from Alpaca (new symbols), copy the timestamp
    # to interval_timestamp so the row has a valid join key.
    alpaca_only_mask = final_merge["interval_timestamp"].isna() & final_merge["timestamp"].notna()
    if alpaca_only_mask.any():
        final_merge.loc[alpaca_only_mask, "interval_timestamp"] = final_merge.loc[alpaca_only_mask, "timestamp"]

        # Set basic metadata for these new rows (feature_timestamp in ms).
        final_merge.loc[alpaca_only_mask, "feature_timestamp"] = pd.Timestamp.now().value // 1000000

        # Set stock/crypto flags based on symbol (same hard-coded NASDAQ
        # whitelist as in Step 6 above).
        for symbol in final_merge.loc[alpaca_only_mask, "symbol"].unique():
            symbol_mask = alpaca_only_mask & (final_merge["symbol"] == symbol)
            is_stock = symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"]
            final_merge.loc[symbol_mask, "is_stock"] = is_stock
            final_merge.loc[symbol_mask, "is_crypto"] = not is_stock
            if is_stock:
                final_merge.loc[symbol_mask, "stock_market"] = "NASDAQ"

        # Copy _alpaca-suffixed columns into the base columns for
        # Alpaca-only rows, so downstream code reads one column name.
        feature_cols = [
            "open", "high", "low", "close", "volume", "trade_count", "vwap",
            "symbol_quote", "bid_price", "bid_size", "bid_exchange", "ask_price", "ask_size", "ask_exchange",
            "conditions", "tape", "symbol_trade", "exchange", "price", "size", "id", "conditions_trade", "tape_trade"
        ]
        for col in feature_cols:
            alpaca_col = f"{col}_alpaca"
            if alpaca_col in final_merge.columns and col in final_merge.columns:
                final_merge.loc[alpaca_only_mask, col] = final_merge.loc[alpaca_only_mask, alpaca_col]

    # Step 11: Calculate merge statistics.
    total_merged_rows = len(merged_df_normalized)
    total_alpaca_rows = len(alpaca_df_normalized)
    total_final_rows = len(final_merge)

    # Count matches from original merged data.
    # NOTE(review): these two counts use `interval_timestamp != timestamp`
    # as a proxy for "pre-existing merged row"; a matched row whose
    # interval_timestamp happens to equal the Alpaca timestamp would be
    # misclassified as Alpaca-only. Confirm this heuristic is acceptable.
    original_matched_rows = final_merge[
        final_merge["timestamp"].notna() & 
        final_merge["interval_timestamp"].notna() & 
        (final_merge["interval_timestamp"] != final_merge["timestamp"])
    ].shape[0]

    # Count new rows from Alpaca-only symbols (same heuristic as above).
    alpaca_only_rows = final_merge[
        final_merge["timestamp"].notna() & 
        (final_merge["interval_timestamp"] == final_merge["timestamp"])
    ].shape[0]

    # Total rows with Alpaca data.
    total_alpaca_matched = final_merge[final_merge["timestamp"].notna()].shape[0]

    original_match_rate = original_matched_rows / total_merged_rows if total_merged_rows > 0 else 0
    overall_match_rate = total_alpaca_matched / total_final_rows if total_final_rows > 0 else 0


    # Step 12: Debug successful matches and new symbols.
    # NOTE(review): `successful_matches` / `available_cols` are computed but
    # never printed or returned — leftover debug scaffolding.
    if total_alpaca_matched > 0:
        successful_matches = final_merge[final_merge["timestamp"].notna()]
        sample_cols = ["symbol", "interval_timestamp", "timestamp", "open", "high", "low", "close", "volume"]
        available_cols = [col for col in sample_cols if col in successful_matches.columns]

    # Step 13: Add merge metadata.
    final_merge["alpaca_merge_timestamp"] = pd.Timestamp.now().value // 1000000  # ns -> ms
    final_merge["alpaca_data_available"] = final_merge["timestamp"].notna()
    final_merge["alpaca_match_rate"] = overall_match_rate
    # Same equality heuristic as Step 11 — see the NOTE there.
    final_merge["is_new_symbol"] = final_merge["interval_timestamp"] == final_merge["timestamp"]

    # Step 14: Drop duplicate columns (keep first occurrence) before saving.
    duplicate_cols = final_merge.columns[final_merge.columns.duplicated()].tolist()
    if duplicate_cols:
        final_merge = final_merge.loc[:, ~final_merge.columns.duplicated()]

    # Save the merged features, overwriting the input file.
    out_path = os.path.join(base_dir, "merged", "features", "merged_features.parquet")

    final_merge.to_parquet(out_path, index=False)

    # Generate detailed summary report.
    print(f"Total final rows: {len(final_merge)}")
    print(f"Rows with Alpaca data: {total_alpaca_matched}")
    print(f"New symbols added: {alpaca_only_rows}")
    print(f"Overall match rate: {overall_match_rate:.2%}")
    print(f"Total columns: {len(final_merge.columns)}")

    # Per-symbol breakdown of coverage.
    symbol_summary = final_merge.groupby("symbol").agg({
        "alpaca_data_available": ["count", "sum"],
        "is_new_symbol": "sum"
    }).round(2)

    symbol_summary.columns = ["total_rows", "alpaca_matches", "new_symbol_rows"]
    symbol_summary["match_rate"] = symbol_summary["alpaca_matches"] / symbol_summary["total_rows"]
    symbol_summary["is_new_symbol"] = symbol_summary["new_symbol_rows"] > 0

    # Show which symbols have majority Alpaca coverage.
    complete_symbols = symbol_summary[symbol_summary["match_rate"] > 0.5]
    if len(complete_symbols) > 0:
        print(complete_symbols[["total_rows", "alpaca_matches", "match_rate"]])

    # NOTE(review): `sample_cols` below is assigned but never used.
    sample_cols = ["symbol", "interval_timestamp", "alpaca_data_available", "is_new_symbol", "open", "high", "low", "close", "volume"]

    return final_merge

if __name__ == "__main__":
    try:
        merge_alpaca_features()
    except Exception:
        import traceback
        traceback.print_exc()
        # Exit non-zero so schedulers/CI see the failure instead of a
        # silent success (the original swallowed the error and exited 0).
        sys.exit(1)