|
import sys |
|
import os |
|
import numpy as np |
|
import pandas as pd |
|
from datetime import datetime |
|
|
|
|
|
sys.path.append(os.path.dirname(__file__)) |
|
|
|
from alpaca_features import build_features, save |
|
|
|
def create_symbol_mapping():
    """Build lookup tables linking CoinGecko-style crypto names to tickers.

    Returns
    -------
    tuple of (dict, dict, dict)
        name_to_ticker: full name -> upper-case ticker, e.g. 'bitcoin' -> 'BTC'.
        ticker_to_name: lower-case ticker -> full name. Several names share a
            ticker (e.g. 'matic-network'/'polygon' -> MATIC); the entry that
            appears LAST in the table wins.
        name_to_lower_ticker: full name -> lower-case ticker.
    """
    name_to_ticker = {
        'bitcoin': 'BTC',
        'ethereum': 'ETH',
        'binancecoin': 'BNB',
        'ripple': 'XRP',
        'cardano': 'ADA',
        'solana': 'SOL',
        'dogecoin': 'DOGE',
        'polkadot': 'DOT',
        'matic-network': 'MATIC',
        'polygon': 'MATIC',
        'avalanche-2': 'AVAX',
        'avalanche': 'AVAX',
        'chainlink': 'LINK',
        'litecoin': 'LTC',
        'bitcoin-cash': 'BCH',
        'stellar': 'XLM',
        'vechain': 'VET',
        'ethereum-classic': 'ETC',
        'filecoin': 'FIL',
        'tron': 'TRX',
        'monero': 'XMR',
        'eos': 'EOS',
        'aave': 'AAVE',
        'maker': 'MKR',
        'compound': 'COMP',
        'uniswap': 'UNI',
        'yearn-finance': 'YFI',
        'sushi': 'SUSHI',
        'curve-dao-token': 'CRV',
        'pancakeswap-token': 'CAKE',
        'terra-luna': 'LUNA',
        'fantom': 'FTM',
        'harmony': 'ONE',
        'near': 'NEAR',
        'algorand': 'ALGO',
        'cosmos': 'ATOM',
        'internet-computer': 'ICP',
        'helium': 'HNT',
        'theta-token': 'THETA',
        'chiliz': 'CHZ',
        'decentraland': 'MANA',
        'the-sandbox': 'SAND',
        'axie-infinity': 'AXS',
        'shiba-inu': 'SHIB',
        'apecoin': 'APE',
        'gala': 'GALA',
        'enjincoin': 'ENJ',
        'flow': 'FLOW',
        'basic-attention-token': 'BAT',
        'omg': 'OMG',
        'loopring': 'LRC',
        'immutable-x': 'IMX',
        'render-token': 'RNDR',
        'quant-network': 'QNT',
        'injective-protocol': 'INJ',
        'sei-network': 'SEI',
        'arbitrum': 'ARB',
        'optimism': 'OP',
        'blur': 'BLUR',
        'pepe': 'PEPE',
        'bonk': 'BONK',
        'wormhole': 'W',
        'jupiter-exchange-solana': 'JUP',
        'worldcoin-wld': 'WLD',
        'pyth-network': 'PYTH',
        'jito': 'JTO',
        'tensor': 'TNSR',
        'meme': 'MEME',
        'cat-in-a-dogs-world': 'MEW',
        'book-of-meme': 'BOME',
        'dogwifhat': 'WIF',
        'popcat': 'POPCAT',
        'goatseus-maximus': 'GOAT',
        'peanut-the-squirrel': 'PNUT',
        'act-i-the-ai-prophecy': 'ACT',
        'fartcoin': 'FARTCOIN',
        'ai16z': 'AI16Z',
        'virtual-protocol': 'VIRTUAL',
        'zerebro': 'ZEREBRO',
        'griffain': 'GRIFFAIN',
        'aixbt-by-virtuals': 'AIXBT',
        'marc-and-ethan-are-based': 'BASED',
        'pudgy-penguins': 'PENGU',
        'hyperliquid': 'HYPE',
        'move-movement': 'MOVE',
        'usual': 'USUAL',
        'reserve-rights': 'RSR',
        'ondo-finance': 'ONDO',
        'ethena': 'ENA',
        'eigenlayer': 'EIGEN',
        'grass': 'GRASS',
        'io': 'IO',
        'notcoin': 'NOT',
        'turbo': 'TURBO',
        'jasmy': 'JASMY',
        'neo': 'NEO',
        'iota': 'IOTA',
        'dash': 'DASH',
        'zcash': 'ZEC',
        'waves': 'WAVES',
    }

    # Invert to ticker -> name; insertion order means later duplicate tickers
    # overwrite earlier ones (e.g. 'avax' resolves to 'avalanche').
    ticker_to_name = {}
    for full_name, ticker in name_to_ticker.items():
        ticker_to_name[ticker.lower()] = full_name

    # Same table with lower-cased tickers on the value side.
    name_to_lower_ticker = {
        full_name: ticker.lower() for full_name, ticker in name_to_ticker.items()
    }

    return name_to_ticker, ticker_to_name, name_to_lower_ticker
|
|
|
def normalize_symbols(df, symbol_col, is_alpaca=False):
    """Lower-case the symbol column and, for Alpaca data, map tickers to names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a symbol column; the input is not modified.
    symbol_col : str
        Name of the column containing the symbols.
    is_alpaca : bool, default False
        When True, known ticker symbols (e.g. 'BTC') are translated to full
        crypto names (e.g. 'bitcoin'); unknown symbols keep their
        lower-cased form.

    Returns
    -------
    pandas.DataFrame
        A copy of *df* with the symbol column normalized.
    """
    normalized = df.copy()
    _, ticker_to_name, _ = create_symbol_mapping()

    # Downstream comparisons are all done in lower case.
    normalized[symbol_col] = normalized[symbol_col].str.lower()

    if is_alpaca:
        # Translate recognized tickers to full names; fall back to the
        # lower-cased symbol itself for anything unknown (e.g. stock tickers).
        normalized[symbol_col] = normalized[symbol_col].apply(
            lambda sym: ticker_to_name.get(sym.lower(), sym.lower())
        )
    # Non-Alpaca frames are assumed to already use full names; lower-casing
    # above is the only normalization applied to them.

    return normalized
|
|
|
def merge_alpaca_features():
    """
    Merge Alpaca features with existing merged features.
    Handles timestamp alignment, column conflicts, and symbol mapping.

    Builds the current Alpaca feature frame, outer-merges it onto the
    persisted merged-features parquet on (symbol, epoch-ms timestamp),
    backfills rows for Alpaca-only symbols, stamps bookkeeping columns,
    and writes the result back to the same parquet path.

    Returns:
        pandas.DataFrame: the merged frame that was written to disk.
    """
    # Build and persist the current Alpaca feature set before merging.
    alpaca_df = build_features()
    save(alpaca_df)

    # Resolve the data directory: project config first, then the DATA_DIR
    # environment variable, then a hard-coded /data fallback.
    try:
        from src import config as app_config
        base_dir = app_config.DATA_DIR
    except Exception:
        base_dir = os.environ.get("DATA_DIR", "/data")
    merged_path = os.path.join(base_dir, "merged", "features", "merged_features.parquet")

    merged_df = pd.read_parquet(merged_path)

    # Normalize both sides to the same symbol vocabulary (full crypto names).
    alpaca_df_normalized = normalize_symbols(alpaca_df, "symbol", is_alpaca=True)
    merged_df_normalized = normalize_symbols(merged_df, "symbol", is_alpaca=False)

    # Symbol-overlap bookkeeping; `overlapping_symbols` is currently unused.
    alpaca_normalized = set(alpaca_df_normalized["symbol"].unique())
    merged_normalized = set(merged_df_normalized["symbol"].unique())
    overlapping_symbols = alpaca_normalized.intersection(merged_normalized)
    missing_in_merged = alpaca_normalized - merged_normalized

    # Pre-seed the merged frame with one row per Alpaca observation for
    # symbols the merged frame has never seen, so the outer merge below
    # has matching rows to land on.
    if missing_in_merged:
        new_symbol_rows = []
        for missing_symbol in missing_in_merged:
            symbol_data = alpaca_df_normalized[alpaca_df_normalized["symbol"] == missing_symbol]
            if len(symbol_data) == 0:
                continue
            for _, alpaca_row in symbol_data.iterrows():
                # Hard-coded NASDAQ ticker list decides stock vs crypto.
                # NOTE(review): crypto symbols were mapped to full names in
                # normalize_symbols, so `.upper()` compares names like
                # 'BITCOIN' against tickers here — confirm this is intended.
                new_row = {
                    "symbol": missing_symbol,
                    "interval_timestamp": alpaca_row["timestamp"],
                    "is_stock": True if missing_symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] else False,
                    "is_crypto": False if missing_symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] else True,
                    "stock_market": "NASDAQ" if missing_symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] else None,
                    # Epoch milliseconds (pandas ns value floor-divided by 1e6).
                    "feature_timestamp": pd.Timestamp.now().value // 1000000,
                }
                # Copy every Alpaca field not already set above.
                for col in alpaca_row.index:
                    if col not in new_row:
                        new_row[col] = alpaca_row[col]
                # Pad remaining merged-frame columns with NaN.
                for col in merged_df_normalized.columns:
                    if col not in new_row:
                        new_row[col] = np.nan
                new_symbol_rows.append(new_row)

        if new_symbol_rows:
            new_symbols_df = pd.DataFrame(new_symbol_rows)
            merged_df_normalized = pd.concat([merged_df_normalized, new_symbols_df], ignore_index=True)

    # Column-overlap bookkeeping; `overlapping_cols` is currently unused.
    join_keys = ["symbol", "timestamp", "interval_timestamp"]
    alpaca_cols = set(alpaca_df_normalized.columns) - set(join_keys)
    merged_cols = set(merged_df_normalized.columns) - set(join_keys)
    overlapping_cols = alpaca_cols.intersection(merged_cols)

    # Add human-readable datetime companions for the epoch-ms key columns.
    timestamp_columns = {}
    if "timestamp" in alpaca_df_normalized.columns:
        timestamp_columns["timestamp_dt"] = pd.to_datetime(alpaca_df_normalized["timestamp"], unit="ms")
    if "interval_timestamp" in merged_df_normalized.columns:
        timestamp_columns["interval_timestamp_dt"] = pd.to_datetime(merged_df_normalized["interval_timestamp"], unit="ms")

    if timestamp_columns:
        for col_name, col_data in timestamp_columns.items():
            if col_name == "timestamp_dt" and "timestamp" in alpaca_df_normalized.columns:
                alpaca_df_normalized = pd.concat([alpaca_df_normalized, col_data.to_frame(col_name)], axis=1)
            elif col_name == "interval_timestamp_dt" and "interval_timestamp" in merged_df_normalized.columns:
                merged_df_normalized = pd.concat([merged_df_normalized, col_data.to_frame(col_name)], axis=1)

    # Outer merge: merged rows keyed on interval_timestamp, Alpaca rows on
    # timestamp (both epoch ms). Conflicting Alpaca columns get "_alpaca".
    final_merge = pd.merge(
        merged_df_normalized,
        alpaca_df_normalized,
        left_on=["symbol", "interval_timestamp"],
        right_on=["symbol", "timestamp"],
        how="outer",
        suffixes=("", "_alpaca")
    )

    # Rows that came only from the Alpaca side have no interval_timestamp;
    # backfill it from the Alpaca timestamp and stamp classification columns.
    alpaca_only_mask = final_merge["interval_timestamp"].isna() & final_merge["timestamp"].notna()
    if alpaca_only_mask.any():
        final_merge.loc[alpaca_only_mask, "interval_timestamp"] = final_merge.loc[alpaca_only_mask, "timestamp"]
        final_merge.loc[alpaca_only_mask, "feature_timestamp"] = pd.Timestamp.now().value // 1000000

        for symbol in final_merge.loc[alpaca_only_mask, "symbol"].unique():
            symbol_mask = alpaca_only_mask & (final_merge["symbol"] == symbol)
            is_stock = symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"]
            final_merge.loc[symbol_mask, "is_stock"] = is_stock
            final_merge.loc[symbol_mask, "is_crypto"] = not is_stock
            if is_stock:
                final_merge.loc[symbol_mask, "stock_market"] = "NASDAQ"

    # For Alpaca-only rows, promote the suffixed "_alpaca" columns into the
    # unsuffixed feature columns so they are populated like matched rows.
    feature_cols = [
        "open", "high", "low", "close", "volume", "trade_count", "vwap",
        "symbol_quote", "bid_price", "bid_size", "bid_exchange", "ask_price", "ask_size", "ask_exchange",
        "conditions", "tape", "symbol_trade", "exchange", "price", "size", "id", "conditions_trade", "tape_trade"
    ]
    for col in feature_cols:
        alpaca_col = f"{col}_alpaca"
        if alpaca_col in final_merge.columns and col in final_merge.columns:
            final_merge.loc[alpaca_only_mask, col] = final_merge.loc[alpaca_only_mask, alpaca_col]

    # --- Reporting metrics ------------------------------------------------
    total_merged_rows = len(merged_df_normalized)
    total_alpaca_rows = len(alpaca_df_normalized)  # currently unused
    total_final_rows = len(final_merge)

    # NOTE(review): the two counts below distinguish "matched" from
    # "alpaca-only" rows purely by whether interval_timestamp equals
    # timestamp. A genuinely matched row can also satisfy that equality
    # (the merge keys are equal by construction), so this split is a
    # heuristic — verify before trusting these numbers.
    original_matched_rows = final_merge[
        final_merge["timestamp"].notna() &
        final_merge["interval_timestamp"].notna() &
        (final_merge["interval_timestamp"] != final_merge["timestamp"])
    ].shape[0]

    alpaca_only_rows = final_merge[
        final_merge["timestamp"].notna() &
        (final_merge["interval_timestamp"] == final_merge["timestamp"])
    ].shape[0]

    # Any row with a non-null Alpaca timestamp carries Alpaca data.
    total_alpaca_matched = final_merge[final_merge["timestamp"].notna()].shape[0]

    original_match_rate = original_matched_rows / total_merged_rows if total_merged_rows > 0 else 0  # currently unused
    overall_match_rate = total_alpaca_matched / total_final_rows if total_final_rows > 0 else 0

    # Sample-inspection scaffolding: computed but never printed —
    # presumably left over from a removed debug print.
    if total_alpaca_matched > 0:
        successful_matches = final_merge[final_merge["timestamp"].notna()]
        sample_cols = ["symbol", "interval_timestamp", "timestamp", "open", "high", "low", "close", "volume"]
        available_cols = [col for col in sample_cols if col in successful_matches.columns]

    # Bookkeeping columns stamped on every row (merge time in epoch ms).
    final_merge["alpaca_merge_timestamp"] = pd.Timestamp.now().value // 1000000
    final_merge["alpaca_data_available"] = final_merge["timestamp"].notna()
    final_merge["alpaca_match_rate"] = overall_match_rate
    # NOTE(review): same equality heuristic as above — rows whose
    # interval_timestamp happens to equal timestamp are flagged "new".
    final_merge["is_new_symbol"] = final_merge["interval_timestamp"] == final_merge["timestamp"]

    # Drop duplicated column labels, keeping the first occurrence.
    duplicate_cols = final_merge.columns[final_merge.columns.duplicated()].tolist()
    if duplicate_cols:
        final_merge = final_merge.loc[:, ~final_merge.columns.duplicated()]

    # Persist back over the original merged-features parquet (in-place update).
    out_path = os.path.join(base_dir, "merged", "features", "merged_features.parquet")

    final_merge.to_parquet(out_path, index=False)

    print(f"Total final rows: {len(final_merge)}")
    print(f"Rows with Alpaca data: {total_alpaca_matched}")
    print(f"New symbols added: {alpaca_only_rows}")
    print(f"Overall match rate: {overall_match_rate:.2%}")
    print(f"Total columns: {len(final_merge.columns)}")

    # Per-symbol summary: row counts, Alpaca match counts, new-symbol rows.
    symbol_summary = final_merge.groupby("symbol").agg({
        "alpaca_data_available": ["count", "sum"],
        "is_new_symbol": "sum"
    }).round(2)

    symbol_summary.columns = ["total_rows", "alpaca_matches", "new_symbol_rows"]
    symbol_summary["match_rate"] = symbol_summary["alpaca_matches"] / symbol_summary["total_rows"]
    symbol_summary["is_new_symbol"] = symbol_summary["new_symbol_rows"] > 0

    # Print symbols where a majority of rows matched Alpaca data.
    complete_symbols = symbol_summary[symbol_summary["match_rate"] > 0.5]
    if len(complete_symbols) > 0:
        print(complete_symbols[["total_rows", "alpaca_matches", "match_rate"]])

    # Currently unused; presumably left over from a removed sample print.
    sample_cols = ["symbol", "interval_timestamp", "alpaca_data_available", "is_new_symbol", "open", "high", "low", "close", "volume"]

    return final_merge
|
|
|
if __name__ == "__main__":
    try:
        merged_df = merge_alpaca_features()
    except Exception:
        # Best-effort run: print the full stack trace and swallow the error,
        # so the process still exits normally on failure.
        import traceback
        traceback.print_exc()