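"""Merge feature parquet files.

Combines the latest pull in data/merged/temp with the accumulated files in
data/merged/features: new (symbol, interval_timestamp) rows are appended, nulls in
existing rows are back-filled from matching rows in other sources, crypto and stock
features are combined into merged_features.parquet, and deduplicated train files are
written under data/merged/train.
"""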
import pandas as pd
import os
import numpy as np
from datetime import datetime, timedelta
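# NOTE: DAYS_OLD and the datetime imports above are currently unused; they belong to the
# recency filter in filter_new(), which is disabled so that all rows are kept for
# accumulative merging.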
DAYS_OLD = 7
MERGED_DIR = "data/merged/features"
TEMP_DIR = "data/merged/temp"
# Helper: safely cast a value to match a target column's dtype (e.g., drop tz on datetimes)
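# Illustrative (hypothetical value): with a datetime64[ns] target column, a tz-aware string
# such as "2024-01-01T00:00:00Z" comes back as the tz-naive Timestamp 2024-01-01 00:00:00;
# for any other target dtype the value is returned unchanged.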
def _cast_value_for_column(target_series: pd.Series, value):
    try:
        # If target is datetime64[ns], ensure assigned value is tz-naive
        if pd.api.types.is_datetime64_any_dtype(target_series.dtype):
            v = pd.to_datetime(value, errors='coerce', utc=True)
            if isinstance(v, pd.Timestamp):
                return v.tz_localize(None)
            return v
        return value
    except Exception:
        return value
def fill_nulls_from_temp(df_merged, df_temp):
"""
Fill null values in df_merged using non-null values from df_temp
for the same symbol + interval_timestamp combination.
Returns the number of null values filled.
"""
    nulls_filled = 0
    if df_merged.empty or df_temp.empty:
        return nulls_filled
    # Create lookup key for efficient matching
    key_cols = ["symbol", "interval_timestamp"]
    # Check if key columns exist in both dataframes
    if not all(col in df_merged.columns for col in key_cols):
        print("[WARN] Key columns missing in merged data, skipping null filling")
        return nulls_filled
    if not all(col in df_temp.columns for col in key_cols):
        print("[WARN] Key columns missing in temp data, skipping null filling")
        return nulls_filled
    # Create a lookup dictionary from temp data
    # Format: {(symbol, timestamp): {column: value, ...}}
    temp_lookup = {}
    for _, row in df_temp.iterrows():
        key = (row['symbol'], row['interval_timestamp'])
        temp_lookup[key] = row.to_dict()
    # Find common columns between merged and temp (excluding keys)
    common_cols = [col for col in df_merged.columns
                   if col in df_temp.columns and col not in key_cols]
    if not common_cols:
        print("[WARN] No common columns found for null filling")
        return nulls_filled
    # Track columns with null values before processing
    null_cols_before = []
    for col in common_cols:
        if df_merged[col].isnull().any():
            null_cols_before.append(col)
    if not null_cols_before:
        print("[INFO] No null values found in common columns")
        return nulls_filled
    print(f"[INFO] Attempting to fill nulls in {len(null_cols_before)} columns: {null_cols_before}")
    # Fill null values row by row
    for idx, row in df_merged.iterrows():
        key = (row['symbol'], row['interval_timestamp'])
        # Check if we have corresponding temp data for this key
        if key in temp_lookup:
            temp_row = temp_lookup[key]
            # Fill null values for each column
            for col in null_cols_before:
                try:
                    # Use more robust null checking to handle arrays/scalars
                    row_val = row[col]
                    temp_val = temp_row.get(col)
                    # Check if row value is null (handle both scalar and array cases)
                    row_is_null = pd.isnull(row_val)
                    if hasattr(row_is_null, '__len__') and len(row_is_null) > 1:
                        row_is_null = row_is_null.any()  # For arrays, check if any are null
                    # Check if temp value is not null (evaluate isnull first so arrays are handled before negation)
                    temp_null = pd.isnull(temp_val)
                    if hasattr(temp_null, '__len__') and len(temp_null) > 1:
                        temp_is_not_null = not temp_null.any()  # For arrays, all elements must be non-null
                    else:
                        temp_is_not_null = not temp_null
                    if row_is_null and temp_is_not_null:
                        # Fill the null value with dtype-compatible casting
                        df_merged.at[idx, col] = _cast_value_for_column(df_merged[col], temp_val)
                        nulls_filled += 1
                except Exception as e:
                    # Skip problematic columns with a warning
                    print(f"[WARN] Could not process column '{col}' for null filling: {e}")
                    continue
    if nulls_filled > 0:
        print(f"[INFO] Successfully filled {nulls_filled} null values from temp data")
        # Report which columns were improved
        for col in null_cols_before:
            nulls_remaining = df_merged[col].isnull().sum()
            print(f"[INFO] Column '{col}': {nulls_remaining} nulls remaining")
    return nulls_filled
# Helper to filter new records (DISABLED - now keeps ALL data for accumulative merging)
def filter_new(df):
    # IMPORTANT: Return ALL data instead of filtering by days
    # This ensures accumulative merging from day one
    return df.copy()
def merge_temp_to_merged(temp_name, merged_name):
    temp_path = os.path.join(TEMP_DIR, temp_name)
    merged_path = os.path.join(MERGED_DIR, merged_name)
    if not os.path.exists(temp_path):
        print(f"[WARN] Temp file missing: {temp_path}")
        return
    if not os.path.exists(merged_path):
        print(f"[WARN] Merged file missing: {merged_path}")
        return
    df_temp = pd.read_parquet(temp_path)
    df_merged = pd.read_parquet(merged_path)
    # Check if required columns exist
    required_cols = ["symbol", "interval_timestamp"]
    missing_cols_temp = [col for col in required_cols if col not in df_temp.columns]
    missing_cols_merged = [col for col in required_cols if col not in df_merged.columns]
    if missing_cols_temp:
        print(f"[ERROR] Missing columns in temp file {temp_name}: {missing_cols_temp}")
        print(f"[INFO] Available columns in temp: {list(df_temp.columns)}")
        return
    if missing_cols_merged:
        print(f"[ERROR] Missing columns in merged file {merged_name}: {missing_cols_merged}")
        print(f"[INFO] Available columns in merged: {list(df_merged.columns)}")
        return
    new_temp = filter_new(df_temp)
    # Step 1: Fill null values in merged data using temp data for same symbol+timestamp
    nulls_filled = fill_nulls_from_temp(df_merged, df_temp)
    # Step 2: Only add truly new rows (not already in merged)
    key_cols = ["symbol", "interval_timestamp"]
    merged_keys = set(tuple(row) for row in df_merged[key_cols].values)
    new_rows = new_temp[~new_temp[key_cols].apply(tuple, axis=1).isin(merged_keys)]
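    # Illustrative effect (hypothetical key): if df_merged already holds ("BTC", 1700000000),
    # the matching temp row is excluded here; it was only used above to fill nulls in place.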
    if new_rows.empty and nulls_filled == 0:
        print(f"[INFO] No new records to add from {temp_name} and no nulls filled")
        return
    df_final = pd.concat([df_merged, new_rows], ignore_index=True)
    df_final.to_parquet(merged_path, index=False)
    print(f"[OK] Added {len(new_rows)} new records from {temp_name} to {merged_name}, filled {nulls_filled} null values")
def merge_all_to_train(archive_name, features_name, temp_name, train_name):
"""
Merge archive, features, and temp files into a deduplicated train file under merge/train/.
Uniqueness is enforced on (symbol, interval_timestamp).
Also performs null filling between different sources.
"""
    ARCHIVE_DIR = os.path.join(MERGED_DIR, "archive")
    TRAIN_DIR = os.path.join("data", "merged", "train")
    os.makedirs(TRAIN_DIR, exist_ok=True)
    features_path = os.path.join(MERGED_DIR, features_name)
    temp_path = os.path.join(TEMP_DIR, temp_name)
    train_path = os.path.join(TRAIN_DIR, train_name)
    dfs = []
    df_sources = {}  # Track which dataframe came from which source
    # 1. Read all relevant archive files (recursively)
    archive_dfs = []
    if os.path.isdir(ARCHIVE_DIR):
        for root, dirs, files in os.walk(ARCHIVE_DIR):
            for fname in files:
                # Only include files matching the asset (e.g., crypto_features_archived_*.parquet)
                if fname.startswith(archive_name.replace('.parquet', '_archived_')) and fname.endswith('.parquet'):
                    fpath = os.path.join(root, fname)
                    try:
                        archive_dfs.append(pd.read_parquet(fpath))
                    except Exception as e:
                        print(f"[WARN] Could not read archive file {fpath}: {e}")
    if archive_dfs:
        df_archive = pd.concat(archive_dfs, ignore_index=True)
        dfs.append(df_archive)
        df_sources['archive'] = df_archive
    else:
        print(f"[WARN] No archive files found for {archive_name}")
    # 2. Read features and temp
    if os.path.exists(features_path):
        df_features = pd.read_parquet(features_path)
        dfs.append(df_features)
        df_sources['features'] = df_features
    else:
        print(f"[WARN] Missing: {features_path}")
    if os.path.exists(temp_path):
        df_temp = pd.read_parquet(temp_path)
        dfs.append(df_temp)
        df_sources['temp'] = df_temp
    else:
        print(f"[WARN] Missing: {temp_path}")
    if not dfs:
        print("[ERROR] No input files found.")
        return
    # 3. Merge all data
    df_all = pd.concat(dfs, ignore_index=True)
    # 4. Before deduplication, try to fill nulls using data from different sources
    total_nulls_filled = 0
    if len(df_sources) > 1:
        print(f"[INFO] Attempting cross-source null filling for {train_name}")
        # Create a comprehensive lookup from all sources
        all_data_lookup = {}
        for source_name, df_source in df_sources.items():
            for _, row in df_source.iterrows():
                key = (row['symbol'], row['interval_timestamp'])
                if key not in all_data_lookup:
                    all_data_lookup[key] = {}
                # Add non-null values from this source
                for col in df_source.columns:
                    try:
                        # Use more robust null checking to handle arrays/scalars
                        col_val = row[col]
                        # Check if value is not null (evaluate isnull first so arrays are handled before negation)
                        col_null = pd.isnull(col_val)
                        if hasattr(col_null, '__len__') and len(col_null) > 1:
                            is_not_null = not col_null.any()  # For arrays, all elements must be non-null
                        else:
                            is_not_null = not col_null
                        if is_not_null:
                            all_data_lookup[key][col] = col_val
                    except Exception as e:
                        # Skip problematic columns with a warning
                        print(f"[WARN] Could not process column '{col}' for train lookup: {e}")
                        continue
        # Fill nulls in the combined dataframe
        for idx, row in df_all.iterrows():
            key = (row['symbol'], row['interval_timestamp'])
            if key in all_data_lookup:
                lookup_row = all_data_lookup[key]
                for col in df_all.columns:
                    try:
                        # Use more robust null checking
                        row_val = row[col]
                        # Check if row value is null (handle both scalar and array cases)
                        row_is_null = pd.isnull(row_val)
                        if hasattr(row_is_null, '__len__') and len(row_is_null) > 1:
                            row_is_null = row_is_null.any()  # For arrays, check if any are null
                        if row_is_null and col in lookup_row:
                            df_all.at[idx, col] = _cast_value_for_column(df_all[col], lookup_row[col])
                            total_nulls_filled += 1
                    except Exception as e:
                        # Skip problematic columns with a warning
                        print(f"[WARN] Could not process column '{col}' for train null filling: {e}")
                        continue
    # 5. Deduplicate by symbol+interval_timestamp, keeping the last occurrence
    df_all = df_all.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last")
    # 6. Handle problematic columns that can't be serialized to parquet
    problematic_cols = []
    for col in df_all.columns:
        try:
            # Test if column can be converted to parquet-compatible format
            sample = df_all[col].iloc[0] if len(df_all) > 0 else None
            if sample is not None and hasattr(sample, '__len__') and not isinstance(sample, str):
                # If it's an array-like object (but not string), it might cause issues
                if len(sample) > 1:  # Multi-dimensional array
                    problematic_cols.append(col)
        except Exception:
            # If we can't even check the sample, it's definitely problematic
            problematic_cols.append(col)
    if problematic_cols:
        print(f"[WARN] Dropping problematic columns that can't be serialized: {problematic_cols}")
        df_all = df_all.drop(columns=problematic_cols)
    # Save to parquet
    df_all.to_parquet(train_path, index=False)
    if total_nulls_filled > 0:
        print(f"[OK] Created train file: {train_path} with {len(df_all)} records, filled {total_nulls_filled} nulls")
    else:
        print(f"[OK] Created train file: {train_path} with {len(df_all)} records.")
def create_merged_features():
"""
Create the main merged_features.parquet file by combining crypto and stock features
with intelligent null filling between the two datasets.
"""
crypto_path = os.path.join(MERGED_DIR, "crypto_features.parquet")
stocks_path = os.path.join(MERGED_DIR, "stocks_features.parquet")
merged_path = os.path.join(MERGED_DIR, "merged_features.parquet")
    dfs_to_merge = []
    # Read crypto features
    if os.path.exists(crypto_path):
        df_crypto = pd.read_parquet(crypto_path)
        dfs_to_merge.append(('crypto', df_crypto))
        print(f"[INFO] Loaded crypto features: {len(df_crypto)} rows, {len(df_crypto.columns)} columns")
    else:
        print(f"[WARN] Crypto features not found: {crypto_path}")
    # Read stock features
    if os.path.exists(stocks_path):
        df_stocks = pd.read_parquet(stocks_path)
        dfs_to_merge.append(('stocks', df_stocks))
        print(f"[INFO] Loaded stock features: {len(df_stocks)} rows, {len(df_stocks.columns)} columns")
    else:
        print(f"[WARN] Stock features not found: {stocks_path}")
    if not dfs_to_merge:
        print("[ERROR] No feature files found to merge")
        return
    if len(dfs_to_merge) == 1:
        # Only one dataset available, just copy it
        df_merged = dfs_to_merge[0][1].copy()
        print(f"[INFO] Only {dfs_to_merge[0][0]} features available")
    else:
        # Multiple datasets - merge with null filling
        print("[INFO] Merging crypto and stock features with cross-dataset null filling")
        # Combine all dataframes
        all_dfs = [df for _, df in dfs_to_merge]
        df_merged = pd.concat(all_dfs, ignore_index=True, sort=False)
        # Perform cross-dataset null filling
        # Create lookup from all datasets for same symbol+timestamp
        lookup_data = {}
        for dataset_name, df in dfs_to_merge:
            for _, row in df.iterrows():
                key = (row['symbol'], row['interval_timestamp'])
                if key not in lookup_data:
                    lookup_data[key] = {}
                # Add non-null values from this dataset
                for col in df.columns:
                    try:
                        # Use more robust null checking to handle arrays/scalars
                        col_val = row[col]
                        # Check if value is not null (evaluate isnull first so arrays are handled before negation)
                        col_null = pd.isnull(col_val)
                        if hasattr(col_null, '__len__') and len(col_null) > 1:
                            is_not_null = not col_null.any()  # For arrays, all elements must be non-null
                        else:
                            is_not_null = not col_null
                        if is_not_null:
                            lookup_data[key][col] = col_val
                    except Exception as e:
                        # Skip problematic columns with a warning
                        print(f"[WARN] Could not process column '{col}' for lookup: {e}")
                        continue
        # Fill nulls using the comprehensive lookup
        nulls_filled = 0
        for idx, row in df_merged.iterrows():
            key = (row['symbol'], row['interval_timestamp'])
            if key in lookup_data:
                lookup_row = lookup_data[key]
                for col in df_merged.columns:
                    try:
                        # Use more robust null checking
                        row_val = row[col]
                        # Check if row value is null (handle both scalar and array cases)
                        row_is_null = pd.isnull(row_val)
                        if hasattr(row_is_null, '__len__') and len(row_is_null) > 1:
                            row_is_null = row_is_null.any()  # For arrays, check if any are null
                        if row_is_null and col in lookup_row:
                            df_merged.at[idx, col] = _cast_value_for_column(df_merged[col], lookup_row[col])
                            nulls_filled += 1
                    except Exception as e:
                        # Skip problematic columns with a warning
                        print(f"[WARN] Could not process column '{col}' for null filling: {e}")
                        continue
        if nulls_filled > 0:
            print(f"[INFO] Cross-dataset null filling: {nulls_filled} values filled")
    # Remove duplicates if any (keeping last occurrence)
    initial_len = len(df_merged)
    df_merged = df_merged.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last")
    final_len = len(df_merged)
    if initial_len != final_len:
        print(f"[INFO] Removed {initial_len - final_len} duplicate records")
    # Save merged features
    df_merged.to_parquet(merged_path, index=False)
    print(f"[OK] Created merged features: {merged_path} with {len(df_merged)} rows, {len(df_merged.columns)} columns")
    # Report statistics
    nulls_remaining = df_merged.isnull().sum().sum()
    print(f"[INFO] Merged features null count: {nulls_remaining}")
    # Report symbol breakdown
    if 'symbol' in df_merged.columns:
        symbol_counts = df_merged['symbol'].value_counts()
        print(f"[INFO] Top symbols: {dict(symbol_counts.head(10))}")
def main():
    import sys
    # Check if this is being run as a test
    if len(sys.argv) > 1 and sys.argv[1] == '--test-null-filling':
        from test_null_filling_merge import main as test_main
        sys.exit(test_main())
    merge_temp_to_merged("crypto_features.parquet", "crypto_features.parquet")
    merge_temp_to_merged("stocks_features.parquet", "stocks_features.parquet")
    # Create the main merged features file
    create_merged_features()
    merge_all_to_train("crypto_features.parquet", "crypto_features.parquet", "crypto_features.parquet", "crypto_features_train.parquet")
    merge_all_to_train("stocks_features.parquet", "stocks_features.parquet", "stocks_features.parquet", "stocks_features_train.parquet")
if __name__ == "__main__":
    main()