File size: 2,059 Bytes
c49b21b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import pandas as pd
from pathlib import Path

def _drop_all_null_columns(frame, name):
    """Return *frame* without the columns whose values are all null.

    Prints how many columns were dropped (or that none were), using *name*
    as the label in the message.
    """
    # With zero rows every column is vacuously "all null"; keep the schema
    # rather than writing a file with no columns at all.
    if frame.empty:
        null_cols = frame.columns[:0]
    else:
        null_cols = frame.columns[frame.isna().all()]

    if len(null_cols):
        print(f"Dropping {len(null_cols)} all-null columns from {name}:")
        return frame.drop(columns=null_cols)

    print(f"No all-null columns in {name}.")
    return frame


def separate_features(merged_path, crypto_path, stocks_path):
    """
    Split the merged features parquet file into crypto and stocks files.

    Rows are routed by the ``is_crypto`` column (1 -> crypto, 0 -> stocks).
    Columns that are entirely null within a subset are dropped from that
    subset before it is saved.

    Args:
        merged_path: Path of the merged parquet file to read.
        crypto_path: Destination parquet path for rows with ``is_crypto == 1``.
        stocks_path: Destination parquet path for rows with ``is_crypto == 0``.

    Returns:
        None. If ``merged_path`` does not exist, a message is printed and
        nothing is written.

    Raises:
        KeyError: If the merged file has no ``is_crypto`` column.
    """
    merged_path = Path(merged_path)
    if not merged_path.exists():
        print(f"File not found: {merged_path}")
        return

    df = pd.read_parquet(merged_path)

    # Fail with a clear message instead of letting the .loc assignment below
    # silently create the column (which would leave the stocks output empty).
    if 'is_crypto' not in df.columns:
        raise KeyError(f"'is_crypto' column not found in {merged_path}")

    # Force rows whose symbol is "RIPPLE" (case-insensitive) to count as crypto.
    # NOTE(review): only the 'RIPPLE' symbol is matched here; tickers such as
    # XRP or COIN are not -- confirm which symbols the upstream data uses.
    if 'symbol' in df.columns:
        xrp_mask = df['symbol'].str.upper() == 'RIPPLE'
        df.loc[xrp_mask, 'is_crypto'] = 1

    # Separate by is_crypto
    crypto_df = df[df['is_crypto'] == 1].copy()
    stocks_df = df[df['is_crypto'] == 0].copy()

    # The two subsets are disjoint, so whatever is left over matched neither
    # value (e.g. a null is_crypto) and would otherwise vanish unnoticed.
    unrouted = len(df) - len(crypto_df) - len(stocks_df)
    if unrouted:
        print(f"Warning: {unrouted} rows have is_crypto not in (0, 1) and are written to neither output")

    crypto_df = _drop_all_null_columns(crypto_df, "crypto_features")
    stocks_df = _drop_all_null_columns(stocks_df, "stocks_features")

    # Save to parquet, creating the destination directories if needed so a
    # fresh data directory does not make the write fail.
    for out_path in (crypto_path, stocks_path):
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)
    crypto_df.to_parquet(crypto_path)
    stocks_df.to_parquet(stocks_path)
    print(f"Saved {len(crypto_df)} crypto features to {crypto_path}")
    print(f"Saved {len(stocks_df)} stocks features to {stocks_path}")


if __name__ == "__main__":
    # Resolve the data root: prefer DATA_DIR from the project's config module,
    # and fall back to the DATA_DIR environment variable (default "/data")
    # if the config cannot be imported or read.
    try:
        from src import config as app_config
        base = Path(app_config.DATA_DIR)
    except Exception:
        import os
        base = Path(os.getenv("DATA_DIR", "/data"))

    # Input and both outputs live side by side in <base>/merged/features.
    features_dir = base / "merged" / "features"
    separate_features(
        features_dir / "merged_features.parquet",
        features_dir / "crypto_features.parquet",
        features_dir / "stocks_features.parquet",
    )