|
import pandas as pd |
|
from pathlib import Path |
|
|
|
def _drop_all_null_columns(frame, name):
    """Return ``frame`` without the columns that are null in every row.

    ``name`` is only used to label the progress messages.
    """
    if frame.empty:
        # isna().all() is vacuously True for every column of a zero-row
        # frame; dropping on that basis would erase the whole schema.
        print(f"{name} has no rows; keeping all columns.")
        return frame

    null_cols = frame.columns[frame.isna().all()]
    if len(null_cols):
        print(f"Dropping {len(null_cols)} all-null columns from {name}:")
        return frame.drop(columns=null_cols)

    print(f"No all-null columns in {name}.")
    return frame


def separate_features(merged_path, crypto_path, stocks_path):
    """
    Split the merged feature table into crypto and stocks feature files.

    Reads the parquet file at ``merged_path``, routes rows by the
    ``is_crypto`` column (1 -> crypto, 0 -> stocks), drops the columns that
    are entirely null within each split, and writes each split to parquet.

    Args:
        merged_path: Location of the merged features parquet file.
        crypto_path: Destination for the rows with ``is_crypto == 1``.
        stocks_path: Destination for the rows with ``is_crypto == 0``.

    Returns:
        None. When ``merged_path`` does not exist a message is printed and
        nothing is written.

    Raises:
        KeyError: If the merged table has no ``is_crypto`` column.
    """
    merged_path = Path(merged_path)
    if not merged_path.exists():
        print(f"File not found: {merged_path}")
        return

    df = pd.read_parquet(merged_path)

    # Validate before the override below: assigning through .loc would
    # silently create a mostly-NaN 'is_crypto' column and hide the problem.
    if 'is_crypto' not in df.columns:
        raise KeyError(
            f"'is_crypto' column not found in {merged_path}; cannot split features"
        )

    # Rows whose symbol is RIPPLE (any letter case) are always treated as crypto.
    if 'symbol' in df.columns:
        xrp_mask = df['symbol'].str.upper() == 'RIPPLE'
        df.loc[xrp_mask, 'is_crypto'] = 1

    crypto_df = df[df['is_crypto'] == 1].copy()
    stocks_df = df[df['is_crypto'] == 0].copy()

    # Rows flagged with anything other than 0 or 1 (e.g. NaN) land in neither
    # output; report them so the loss is visible.
    unclassified = len(df) - len(crypto_df) - len(stocks_df)
    if unclassified:
        print(
            f"Warning: {unclassified} rows have is_crypto not in (0, 1) "
            "and were written to neither output."
        )

    crypto_df = _drop_all_null_columns(crypto_df, "crypto_features")
    stocks_df = _drop_all_null_columns(stocks_df, "stocks_features")

    crypto_df.to_parquet(crypto_path)
    stocks_df.to_parquet(stocks_path)
    print(f"Saved {len(crypto_df)} crypto features to {crypto_path}")
    print(f"Saved {len(stocks_df)} stocks features to {stocks_path}")
|
|
|
|
|
if __name__ == "__main__":
    # Resolve the data root: prefer the project's config module and fall back
    # to the DATA_DIR environment variable (default "/data") if that fails
    # for any reason.
    try:
        from src import config as app_config
        base = Path(app_config.DATA_DIR)
    except Exception:
        from os import environ
        base = Path(environ.get("DATA_DIR", "/data"))

    # The merged input and both outputs live in the same features directory.
    features_dir = base / "merged" / "features"
    separate_features(
        features_dir / "merged_features.parquet",
        features_dir / "crypto_features.parquet",
        features_dir / "stocks_features.parquet",
    )
|
|