from pathlib import Path

import pandas as pd


def _drop_all_null_columns(frame, name):
    """Return *frame* without the columns whose values are all null.

    A one-line summary mentioning *name* is printed either way so the run
    log shows whether anything was removed.
    """
    null_cols = frame.columns[frame.isna().all()]
    # len() rather than truthiness: the truth value of a pandas Index is
    # ambiguous and raises.
    if len(null_cols):
        print(f"Dropping {len(null_cols)} all-null columns from {name}:")
        return frame.drop(columns=null_cols)
    print(f"No all-null columns in {name}.")
    return frame


def separate_features(merged_path, crypto_path, stocks_path):
    """Split the merged feature table into crypto and stocks parquet files.

    Rows are routed by the ``is_crypto`` column: ``1`` goes to the crypto
    output, ``0`` goes to the stocks output. Columns that end up entirely
    null within one of the two subsets are dropped from that subset before
    it is written.

    Args:
        merged_path: Path of the merged features parquet file to read.
        crypto_path: Destination of the crypto features parquet file.
        stocks_path: Destination of the stocks features parquet file.

    Returns:
        None. If ``merged_path`` does not exist a message is printed and
        nothing is written.

    Raises:
        KeyError: If the merged table has no ``is_crypto`` column.
    """
    merged_path = Path(merged_path)
    if not merged_path.exists():
        print(f"File not found: {merged_path}")
        return

    df = pd.read_parquet(merged_path)

    # Fail loudly here: otherwise the symbol override below could silently
    # create the column and produce a near-empty split.
    if 'is_crypto' not in df.columns:
        raise KeyError(f"'is_crypto' column not found in {merged_path}")

    # Rows whose symbol is 'RIPPLE' (case-insensitive) are always treated as
    # crypto, whatever the upstream flag says.
    # NOTE(review): only 'RIPPLE' is overridden; confirm whether symbols such
    # as 'XRP' or 'COIN' need the same treatment.
    if 'symbol' in df.columns:
        ripple_mask = df['symbol'].str.upper() == 'RIPPLE'
        df.loc[ripple_mask, 'is_crypto'] = 1

    crypto_mask = df['is_crypto'] == 1
    stocks_mask = df['is_crypto'] == 0
    crypto_df = df[crypto_mask].copy()
    stocks_df = df[stocks_mask].copy()

    # Rows with a null or unexpected flag match neither mask; surface them
    # instead of losing them silently.
    unclassified = len(df) - len(crypto_df) - len(stocks_df)
    if unclassified:
        print(
            f"Warning: {unclassified} rows have is_crypto not in (0, 1) "
            "and are written to neither output."
        )

    crypto_df = _drop_all_null_columns(crypto_df, "crypto_features")
    stocks_df = _drop_all_null_columns(stocks_df, "stocks_features")

    crypto_df.to_parquet(crypto_path)
    stocks_df.to_parquet(stocks_path)
    print(f"Saved {len(crypto_df)} crypto features to {crypto_path}")
    print(f"Saved {len(stocks_df)} stocks features to {stocks_path}")


if __name__ == "__main__":
    # Prefer the project's configured data directory. The broad except is a
    # deliberate best-effort: any failure loading the config falls back to
    # the DATA_DIR environment variable (default "/data").
    try:
        from src import config as app_config
        base = Path(app_config.DATA_DIR)
    except Exception:
        from os import getenv
        base = Path(getenv("DATA_DIR", "/data"))

    features_dir = base / "merged" / "features"
    separate_features(
        features_dir / "merged_features.parquet",
        features_dir / "crypto_features.parquet",
        features_dir / "stocks_features.parquet",
    )