"""Merge temp feature files into the merged feature set and build train files.

All merging is keyed on (symbol, interval_timestamp): null cells are filled
from other sources for the same key, new rows are appended, and deduplicated
parquet outputs are written under data/merged/.
"""

import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

DAYS_OLD = 7
MERGED_DIR = "data/merged/features"
TEMP_DIR = "data/merged/temp"


def _cast_value_for_column(target_series: pd.Series, value):
    """Coerce `value` so it can be stored in `target_series` without breaking its dtype."""
    try:
        if pd.api.types.is_datetime64_any_dtype(target_series.dtype):
            v = pd.to_datetime(value, errors='coerce', utc=True)
            if isinstance(v, pd.Timestamp):
                # Drop the timezone so the value fits a naive datetime64 column.
                return v.tz_localize(None)
            return v
        return value
    except Exception:
        # If coercion fails, return the raw value and let the caller handle it.
        return value
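
# Illustrative example (not executed anywhere in this script), assuming
# `df["ts"]` is a naive datetime64[ns] column:
#   _cast_value_for_column(df["ts"], "2024-01-01T00:00:00Z")
#   -> Timestamp("2024-01-01 00:00:00")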

def fill_nulls_from_temp(df_merged, df_temp):
    """
    Fill null values in df_merged using non-null values from df_temp
    for the same symbol + interval_timestamp combination.
    Returns the number of null values filled.
    """
    nulls_filled = 0

    if df_merged.empty or df_temp.empty:
        return nulls_filled

    key_cols = ["symbol", "interval_timestamp"]

    if not all(col in df_merged.columns for col in key_cols):
        print("[WARN] Key columns missing in merged data, skipping null filling")
        return nulls_filled
    if not all(col in df_temp.columns for col in key_cols):
        print("[WARN] Key columns missing in temp data, skipping null filling")
        return nulls_filled

    # Index temp rows by key so each merged row can be matched in O(1).
    temp_lookup = {}
    for _, row in df_temp.iterrows():
        key = (row['symbol'], row['interval_timestamp'])
        temp_lookup[key] = row.to_dict()

    common_cols = [col for col in df_merged.columns
                   if col in df_temp.columns and col not in key_cols]

    if not common_cols:
        print("[WARN] No common columns found for null filling")
        return nulls_filled

    # Only visit columns that actually contain nulls.
    null_cols_before = []
    for col in common_cols:
        if df_merged[col].isnull().any():
            null_cols_before.append(col)

    if not null_cols_before:
        print("[INFO] No null values found in common columns")
        return nulls_filled

    print(f"[INFO] Attempting to fill nulls in {len(null_cols_before)} columns: {null_cols_before}")

    for idx, row in df_merged.iterrows():
        key = (row['symbol'], row['interval_timestamp'])

        if key in temp_lookup:
            temp_row = temp_lookup[key]

            for col in null_cols_before:
                try:
                    row_val = row[col]
                    temp_val = temp_row.get(col)

                    # Cells can hold array-likes; reduce those to a single flag.
                    row_is_null = pd.isnull(row_val)
                    if hasattr(row_is_null, '__len__') and len(row_is_null) > 1:
                        row_is_null = row_is_null.any()

                    # Reduce before negating, so array-valued cells do not
                    # raise "truth value is ambiguous".
                    temp_null = pd.isnull(temp_val)
                    if hasattr(temp_null, '__len__') and len(temp_null) > 1:
                        temp_null = temp_null.any()
                    temp_is_not_null = not temp_null

                    if row_is_null and temp_is_not_null:
                        df_merged.at[idx, col] = _cast_value_for_column(df_merged[col], temp_val)
                        nulls_filled += 1
                except Exception as e:
                    print(f"[WARN] Could not process column '{col}' for null filling: {e}")
                    continue

    if nulls_filled > 0:
        print(f"[INFO] Successfully filled {nulls_filled} null values from temp data")

        for col in null_cols_before:
            nulls_remaining = df_merged[col].isnull().sum()
            print(f"[INFO] Column '{col}': {nulls_remaining} nulls remaining")

    return nulls_filled
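
# The row-by-row loop above is easy to follow but slow on large frames. As a
# hedged sketch (not wired into the pipeline, name is illustrative), an
# equivalent vectorized fill could look like the helper below. It assumes
# (symbol, interval_timestamp) is unique in both frames and that shared
# columns have compatible dtypes.
def _fill_nulls_from_temp_vectorized_sketch(df_merged, df_temp):
    key_cols = ["symbol", "interval_timestamp"]
    left = df_merged.set_index(key_cols)
    right = df_temp.set_index(key_cols)
    # DataFrame.fillna(DataFrame) aligns on index and columns, so only cells
    # with a matching key are filled and no rows or columns are added.
    return left.fillna(right).reset_index()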

def filter_new(df):
    """Return a copy of df. Currently a pass-through; no age filtering is applied."""
    return df.copy()
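
# DAYS_OLD and the datetime imports suggest filter_new was meant to keep only
# recent rows. A hedged sketch of such a filter is shown below (assuming
# interval_timestamp holds datetime-like values that pd.to_datetime can
# parse); it is not called anywhere in this script.
def _filter_recent_sketch(df, days_old=DAYS_OLD):
    cutoff = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=days_old)
    ts = pd.to_datetime(df["interval_timestamp"], errors="coerce", utc=True)
    return df[ts >= cutoff].copy()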

def merge_temp_to_merged(temp_name, merged_name):
    """Fill nulls in the merged file from the temp file and append any new rows."""
    temp_path = os.path.join(TEMP_DIR, temp_name)
    merged_path = os.path.join(MERGED_DIR, merged_name)
    if not os.path.exists(temp_path):
        print(f"[WARN] Temp file missing: {temp_path}")
        return
    if not os.path.exists(merged_path):
        print(f"[WARN] Merged file missing: {merged_path}")
        return

    df_temp = pd.read_parquet(temp_path)
    df_merged = pd.read_parquet(merged_path)

    required_cols = ["symbol", "interval_timestamp"]
    missing_cols_temp = [col for col in required_cols if col not in df_temp.columns]
    missing_cols_merged = [col for col in required_cols if col not in df_merged.columns]

    if missing_cols_temp:
        print(f"[ERROR] Missing columns in temp file {temp_name}: {missing_cols_temp}")
        print(f"[INFO] Available columns in temp: {list(df_temp.columns)}")
        return

    if missing_cols_merged:
        print(f"[ERROR] Missing columns in merged file {merged_name}: {missing_cols_merged}")
        print(f"[INFO] Available columns in merged: {list(df_merged.columns)}")
        return

    new_temp = filter_new(df_temp)

    # Fill nulls in place first, then append rows whose key is not yet present.
    nulls_filled = fill_nulls_from_temp(df_merged, df_temp)

    key_cols = ["symbol", "interval_timestamp"]
    if new_temp.empty:
        new_rows = new_temp
    else:
        merged_keys = set(tuple(row) for row in df_merged[key_cols].values)
        new_rows = new_temp[~new_temp[key_cols].apply(tuple, axis=1).isin(merged_keys)]

    if new_rows.empty and nulls_filled == 0:
        print(f"[INFO] No new records to add from {temp_name} and no nulls filled")
        return

    df_final = pd.concat([df_merged, new_rows], ignore_index=True)
    df_final.to_parquet(merged_path, index=False)
    print(f"[OK] Added {len(new_rows)} new records from {temp_name} to {merged_name}, filled {nulls_filled} null values")

def merge_all_to_train(archive_name, features_name, temp_name, train_name):
    """
    Merge archive, features, and temp files into a deduplicated train file under
    data/merged/train/. Uniqueness is enforced on (symbol, interval_timestamp).
    Also performs null filling between the different sources.
    """
    ARCHIVE_DIR = os.path.join(MERGED_DIR, "archive")
    TRAIN_DIR = os.path.join("data", "merged", "train")
    os.makedirs(TRAIN_DIR, exist_ok=True)
    features_path = os.path.join(MERGED_DIR, features_name)
    temp_path = os.path.join(TEMP_DIR, temp_name)
    train_path = os.path.join(TRAIN_DIR, train_name)

    dfs = []
    df_sources = {}

    # Collect archived snapshots named "<archive_name>_archived_*.parquet".
    archive_dfs = []
    if os.path.isdir(ARCHIVE_DIR):
        for root, dirs, files in os.walk(ARCHIVE_DIR):
            for fname in files:
                if fname.startswith(archive_name.replace('.parquet', '_archived_')) and fname.endswith('.parquet'):
                    fpath = os.path.join(root, fname)
                    try:
                        archive_dfs.append(pd.read_parquet(fpath))
                    except Exception as e:
                        print(f"[WARN] Could not read archive file {fpath}: {e}")
    if archive_dfs:
        df_archive = pd.concat(archive_dfs, ignore_index=True)
        dfs.append(df_archive)
        df_sources['archive'] = df_archive
    else:
        print(f"[WARN] No archive files found for {archive_name}")

    if os.path.exists(features_path):
        df_features = pd.read_parquet(features_path)
        dfs.append(df_features)
        df_sources['features'] = df_features
    else:
        print(f"[WARN] Missing: {features_path}")

    if os.path.exists(temp_path):
        df_temp = pd.read_parquet(temp_path)
        dfs.append(df_temp)
        df_sources['temp'] = df_temp
    else:
        print(f"[WARN] Missing: {temp_path}")

    if not dfs:
        print("[ERROR] No input files found.")
        return

    df_all = pd.concat(dfs, ignore_index=True)

    # Cross-source null filling: build a lookup of every non-null cell per key,
    # then patch null cells in the concatenated frame from that lookup.
    total_nulls_filled = 0
    if len(df_sources) > 1:
        print(f"[INFO] Attempting cross-source null filling for {train_name}")

        all_data_lookup = {}
        for source_name, df_source in df_sources.items():
            for _, row in df_source.iterrows():
                key = (row['symbol'], row['interval_timestamp'])
                if key not in all_data_lookup:
                    all_data_lookup[key] = {}

                for col in df_source.columns:
                    try:
                        col_val = row[col]

                        # Reduce array-valued cells before negating so they do
                        # not raise "truth value is ambiguous".
                        col_null = pd.isnull(col_val)
                        if hasattr(col_null, '__len__') and len(col_null) > 1:
                            col_null = col_null.any()

                        if not col_null:
                            all_data_lookup[key][col] = col_val
                    except Exception as e:
                        print(f"[WARN] Could not process column '{col}' for train lookup: {e}")
                        continue

        for idx, row in df_all.iterrows():
            key = (row['symbol'], row['interval_timestamp'])
            if key in all_data_lookup:
                lookup_row = all_data_lookup[key]
                for col in df_all.columns:
                    try:
                        row_val = row[col]

                        row_is_null = pd.isnull(row_val)
                        if hasattr(row_is_null, '__len__') and len(row_is_null) > 1:
                            row_is_null = row_is_null.any()

                        if row_is_null and col in lookup_row:
                            df_all.at[idx, col] = _cast_value_for_column(df_all[col], lookup_row[col])
                            total_nulls_filled += 1
                    except Exception as e:
                        print(f"[WARN] Could not process column '{col}' for train null filling: {e}")
                        continue

    # Sources were appended in archive -> features -> temp order, so keep="last"
    # lets the most recent source win on duplicate keys.
    df_all = df_all.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last")

    # Drop columns whose first value is a multi-element sequence; parquet
    # serialization of such object columns is unreliable.
    problematic_cols = []
    for col in df_all.columns:
        try:
            sample = df_all[col].iloc[0] if len(df_all) > 0 else None
            if sample is not None and hasattr(sample, '__len__') and not isinstance(sample, str):
                if len(sample) > 1:
                    problematic_cols.append(col)
        except Exception:
            problematic_cols.append(col)

    if problematic_cols:
        print(f"[WARN] Dropping problematic columns that can't be serialized: {problematic_cols}")
        df_all = df_all.drop(columns=problematic_cols)

    df_all.to_parquet(train_path, index=False)

    if total_nulls_filled > 0:
        print(f"[OK] Created train file: {train_path} with {len(df_all)} records, filled {total_nulls_filled} nulls")
    else:
        print(f"[OK] Created train file: {train_path} with {len(df_all)} records.")

def create_merged_features():
    """
    Create the main merged_features.parquet file by combining crypto and stock
    features, with cross-dataset null filling between the two.
    """
    crypto_path = os.path.join(MERGED_DIR, "crypto_features.parquet")
    stocks_path = os.path.join(MERGED_DIR, "stocks_features.parquet")
    merged_path = os.path.join(MERGED_DIR, "merged_features.parquet")

    dfs_to_merge = []

    if os.path.exists(crypto_path):
        df_crypto = pd.read_parquet(crypto_path)
        dfs_to_merge.append(('crypto', df_crypto))
        print(f"[INFO] Loaded crypto features: {len(df_crypto)} rows, {len(df_crypto.columns)} columns")
    else:
        print(f"[WARN] Crypto features not found: {crypto_path}")

    if os.path.exists(stocks_path):
        df_stocks = pd.read_parquet(stocks_path)
        dfs_to_merge.append(('stocks', df_stocks))
        print(f"[INFO] Loaded stock features: {len(df_stocks)} rows, {len(df_stocks.columns)} columns")
    else:
        print(f"[WARN] Stock features not found: {stocks_path}")

    if not dfs_to_merge:
        print("[ERROR] No feature files found to merge")
        return

    if len(dfs_to_merge) == 1:
        df_merged = dfs_to_merge[0][1].copy()
        print(f"[INFO] Only {dfs_to_merge[0][0]} features available")
    else:
        print("[INFO] Merging crypto and stock features with cross-dataset null filling")

        all_dfs = [df for _, df in dfs_to_merge]
        df_merged = pd.concat(all_dfs, ignore_index=True, sort=False)

        # Build a per-key lookup of all non-null cells across both datasets.
        lookup_data = {}
        for dataset_name, df in dfs_to_merge:
            for _, row in df.iterrows():
                key = (row['symbol'], row['interval_timestamp'])
                if key not in lookup_data:
                    lookup_data[key] = {}

                for col in df.columns:
                    try:
                        col_val = row[col]

                        # Reduce array-valued cells before negating so they do
                        # not raise "truth value is ambiguous".
                        col_null = pd.isnull(col_val)
                        if hasattr(col_null, '__len__') and len(col_null) > 1:
                            col_null = col_null.any()

                        if not col_null:
                            lookup_data[key][col] = col_val
                    except Exception as e:
                        print(f"[WARN] Could not process column '{col}' for lookup: {e}")
                        continue

        # Patch null cells in the concatenated frame from the lookup.
        nulls_filled = 0
        for idx, row in df_merged.iterrows():
            key = (row['symbol'], row['interval_timestamp'])
            if key in lookup_data:
                lookup_row = lookup_data[key]
                for col in df_merged.columns:
                    try:
                        row_val = row[col]

                        row_is_null = pd.isnull(row_val)
                        if hasattr(row_is_null, '__len__') and len(row_is_null) > 1:
                            row_is_null = row_is_null.any()

                        if row_is_null and col in lookup_row:
                            df_merged.at[idx, col] = _cast_value_for_column(df_merged[col], lookup_row[col])
                            nulls_filled += 1
                    except Exception as e:
                        print(f"[WARN] Could not process column '{col}' for null filling: {e}")
                        continue

        if nulls_filled > 0:
            print(f"[INFO] Cross-dataset null filling: {nulls_filled} values filled")

    initial_len = len(df_merged)
    df_merged = df_merged.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last")
    final_len = len(df_merged)

    if initial_len != final_len:
        print(f"[INFO] Removed {initial_len - final_len} duplicate records")

    df_merged.to_parquet(merged_path, index=False)
    print(f"[OK] Created merged features: {merged_path} with {len(df_merged)} rows, {len(df_merged.columns)} columns")

    nulls_remaining = df_merged.isnull().sum().sum()
    print(f"[INFO] Merged features null count: {nulls_remaining}")

    if 'symbol' in df_merged.columns:
        symbol_counts = df_merged['symbol'].value_counts()
        print(f"[INFO] Top symbols: {dict(symbol_counts.head(10))}")

def main():
    import sys

    # Optional self-test hook: pass --test-null-filling as the first argument.
    if len(sys.argv) > 1 and sys.argv[1] == '--test-null-filling':
        from test_null_filling_merge import main as test_main
        sys.exit(test_main())

    # Fill and extend the per-asset-class merged files from their temp files.
    merge_temp_to_merged("crypto_features.parquet", "crypto_features.parquet")
    merge_temp_to_merged("stocks_features.parquet", "stocks_features.parquet")

    # Combine crypto and stock features into merged_features.parquet.
    create_merged_features()

    # Build deduplicated train files from archive + features + temp.
    merge_all_to_train("crypto_features.parquet", "crypto_features.parquet", "crypto_features.parquet", "crypto_features_train.parquet")
    merge_all_to_train("stocks_features.parquet", "stocks_features.parquet", "stocks_features.parquet", "stocks_features_train.parquet")


if __name__ == "__main__":
    main()