Maaroufabousaleh
f
c49b21b
raw
history blame
2.06 kB
import pandas as pd
from pathlib import Path
def _drop_all_null_columns(frame, name):
    """Return ``frame`` without the columns whose values are all null.

    ``name`` is only used in the progress messages printed to stdout.
    """
    if frame.empty:
        # With zero rows every column is vacuously "all null"; dropping them
        # would write a file with no schema, so keep the frame as it is.
        print(f"{name} has no rows; keeping all columns.")
        return frame

    null_cols = frame.columns[frame.isna().all()]
    if len(null_cols):
        print(f"Dropping {len(null_cols)} all-null columns from {name}:")
        return frame.drop(columns=null_cols)

    print(f"No all-null columns in {name}.")
    return frame


def separate_features(merged_path, crypto_path, stocks_path):
    """
    Split the merged feature table into crypto and stock feature tables.

    Reads the parquet file at ``merged_path``, forces rows whose ``symbol``
    equals ``RIPPLE`` (case-insensitive) to ``is_crypto == 1``, then writes
    the rows with ``is_crypto == 1`` to ``crypto_path`` and the rows with
    ``is_crypto == 0`` to ``stocks_path``. Columns that are entirely null
    within a non-empty subset are dropped from that subset before saving.

    Args:
        merged_path: Path of the merged features parquet file.
        crypto_path: Destination parquet path for the crypto rows.
        stocks_path: Destination parquet path for the stock rows.

    Returns:
        None. If ``merged_path`` does not exist, a message is printed and
        nothing is written.

    Raises:
        KeyError: If the merged table has no ``is_crypto`` column.
    """
    merged_path = Path(merged_path)
    if not merged_path.exists():
        print(f"File not found: {merged_path}")
        return

    df = pd.read_parquet(merged_path)

    # Fail loudly instead of letting the ``.loc`` assignment below create a
    # mostly-NaN ``is_crypto`` column, which would drop nearly every row.
    if 'is_crypto' not in df.columns:
        raise KeyError(f"'is_crypto' column not found in {merged_path}")

    # Force rows whose symbol is RIPPLE to be flagged as crypto, whatever
    # flag was stored for them.
    # NOTE(review): an earlier comment here mentioned COIN and XRP, but only
    # 'RIPPLE' is matched -- confirm the intended symbol list.
    if 'symbol' in df.columns:
        xrp_mask = df['symbol'].str.upper() == 'RIPPLE'
        df.loc[xrp_mask, 'is_crypto'] = 1

    # Separate by is_crypto
    crypto_df = df[df['is_crypto'] == 1].copy()
    stocks_df = df[df['is_crypto'] == 0].copy()

    # Rows with any other flag value (e.g. NaN) end up in neither output;
    # report them so the loss is visible.
    unclassified = len(df) - len(crypto_df) - len(stocks_df)
    if unclassified:
        print(
            f"Warning: {unclassified} rows have is_crypto not in (0, 1) "
            "and were written to neither output."
        )

    # Drop columns that are entirely null
    crypto_df = _drop_all_null_columns(crypto_df, "crypto_features")
    stocks_df = _drop_all_null_columns(stocks_df, "stocks_features")

    # Save to parquet
    crypto_df.to_parquet(crypto_path)
    stocks_df.to_parquet(stocks_path)
    print(f"Saved {len(crypto_df)} crypto features to {crypto_path}")
    print(f"Saved {len(stocks_df)} stocks features to {stocks_path}")
if __name__ == "__main__":
    # Resolve the data root from the project config when it can be loaded;
    # otherwise fall back to the DATA_DIR environment variable ("/data" when
    # it is unset).
    try:
        from src import config as app_config
        data_root = Path(app_config.DATA_DIR)
    except Exception:
        from os import getenv
        data_root = Path(getenv("DATA_DIR", "/data"))

    # The input and both outputs live in the same features directory.
    features_dir = data_root / "merged" / "features"
    separate_features(
        features_dir / "merged_features.parquet",
        features_dir / "crypto_features.parquet",
        features_dir / "stocks_features.parquet",
    )