"""
Merge Alpaca bars + quotes + trades into a single feature table.
β’ data/alpaca/*_bars.parquet β master timeline (daily)
β’ data/alpaca/*_quotes.parquet β L1 quotes (intraday ticks)
β’ data/alpaca/*_trades.parquet β raw trades (intraday ticks)
The script logs shapes / null counts so you can eyeball data quality.
"""
from __future__ import annotations

import os
import warnings
from glob import glob

import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
# --------------------------------------------------------------------------- #
# CONFIG
# --------------------------------------------------------------------------- #
# Resolve writable base using central config (fallback to /data)
try:
    from src import config as app_config

    BASE_DATA_DIR = app_config.DATA_DIR
except Exception:
    BASE_DATA_DIR = os.environ.get("DATA_DIR", "/data")

DATA_DIR = os.path.join(BASE_DATA_DIR, "alpaca")
os.makedirs(DATA_DIR, exist_ok=True)
OUT_FILE = "alpaca_features.parquet"
TOLERANCE = 86_400_000  # 1 day in ms for integer timestamps
MERGE_DIR = "nearest"   # ← important change: not merge_asof's default "backward"
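# With direction="backward" each daily bar would only pick up the last tick at
# or before its own timestamp; "nearest" also considers the first tick after it,
# so a bar stamped at midnight can still match a feed that starts later that day.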
# --------------------------------------------------------------------------- #
# HELPERS
# --------------------------------------------------------------------------- #
def log(title: str, char: str = "=", width: int = 60) -> None:
    print(f"\n{title.center(width, char)}")
def load_parquets(suffix: str) -> pd.DataFrame:
    """Read every *{suffix}.parquet in DATA_DIR and concat."""
    paths = glob(os.path.join(DATA_DIR, f"*{suffix}.parquet"))
    if not paths:
        return pd.DataFrame()

    def normalize(df: pd.DataFrame) -> pd.DataFrame:
        # Normalize symbol: "XRP/USD" -> "XRP"
        df["symbol"] = df["symbol"].astype(str).str.replace(
            r"([A-Z]+)[/_][A-Z]+", r"\1", regex=True
        )
        # Convert timestamp to ms since epoch
        df["timestamp"] = pd.to_datetime(df["timestamp"])
        df["timestamp"] = df["timestamp"].astype("int64") // 10**6
        return df

    dfs: list[pd.DataFrame] = []
    for p in paths:
        df = pd.read_parquet(p)
        df = normalize(df)
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)
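# e.g. normalize() maps symbol "BTC/USD" -> "BTC" and a UTC timestamp of
# 2024-01-02 00:00:00 -> 1704153600000 (ms since epoch).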
# --------------------------------------------------------------------------- #
# MAIN LOGIC
# --------------------------------------------------------------------------- #
def build_features() -> pd.DataFrame:
    bars = load_parquets("_bars")
    quotes = load_parquets("_quotes")
    trades = load_parquets("_trades")

    log("Input shapes", "-")
    print(f"bars={bars.shape}  quotes={quotes.shape}  trades={trades.shape}")

    if bars.empty:
        raise RuntimeError(f"No '*_bars.parquet' files found in {DATA_DIR}")
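    # For each daily bar, merge_asof attaches the single quote/trade row whose
    # timestamp lies closest within TOLERANCE; colliding column names from the
    # right-hand frame get the "_quote" / "_trade" suffix.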
    # Merge symbol-by-symbol so each group is already sorted
    features = []
    symbols = sorted(bars["symbol"].unique())
    for sym in symbols:
        bar_df = bars[bars["symbol"] == sym].sort_values("timestamp").reset_index(drop=True)

        # nearest quote merge
        if not quotes.empty:
            q = quotes[quotes["symbol"] == sym].sort_values("timestamp")
            if not q.empty:
                bar_df = pd.merge_asof(
                    bar_df,
                    q,
                    on="timestamp",
                    suffixes=("", "_quote"),
                    tolerance=TOLERANCE,
                    direction=MERGE_DIR,  # ← nearest!
                )

        # nearest trade merge
        if not trades.empty:
            t = trades[trades["symbol"] == sym].sort_values("timestamp")
            if not t.empty:
                bar_df = pd.merge_asof(
                    bar_df,
                    t,
                    on="timestamp",
                    suffixes=("", "_trade"),
                    tolerance=TOLERANCE,
                    direction=MERGE_DIR,  # ← nearest!
                )

        features.append(bar_df)
    feat = pd.concat(features, ignore_index=True)

    # --------------------------------------------------------------------- #
    # Fill remaining holes within each symbol
    # --------------------------------------------------------------------- #
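    # ffill/bfill runs per symbol group, so a bar with no nearby quote/trade is
    # patched from that symbol's neighbouring rows, never from another symbol.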
    feat = (
        feat
        .groupby("symbol", group_keys=False)
        .apply(lambda df: df.ffill().bfill())
        .reset_index(drop=True)
    )

    log("Null counts after fill", "-")
    print(feat.isna().sum().to_string())

    return feat
def save(df: pd.DataFrame) -> None:
    out_path = os.path.join(DATA_DIR, OUT_FILE)
    df.to_parquet(out_path, index=False)
    print(f"\n-> wrote merged features to {out_path}")
# --------------------------------------------------------------------------- #
def main() -> None:
    merged = build_features()
    save(merged)


if __name__ == "__main__":
    log("Merging Alpaca Features")
    main()