|
|
|
""" |
|
Remove rows with null symbols from crypto and stock features. |
|
This script ensures that all records have valid symbols for downstream processing. |
|
""" |
|
|
|
import pandas as pd |
|
from pathlib import Path |
|
|
|
def _drop_null_symbol_rows(path, label):
    """Rewrite the parquet file at *path* without its null-symbol rows.

    Args:
        path: ``pathlib.Path`` of the feature file to clean in place.
        label: Short tag (e.g. ``"CRYPTO"``) used as the log-line prefix.

    The file is left untouched when it does not exist, when it has no
    ``symbol`` column, or when no row has a null symbol.
    """
    if not path.exists():
        return

    df = pd.read_parquet(path)

    # Without the column there is nothing to filter on; report it and move
    # on so that one malformed file does not block cleaning of the others.
    if 'symbol' not in df.columns:
        print(f"[{label}] No 'symbol' column in {path}; skipping")
        return

    has_symbol = df['symbol'].notnull()
    if has_symbol.all():
        # Nothing to remove: avoid a pointless rewrite of the file.
        return

    # Filtering on ``notnull()`` guarantees the result has no null symbols,
    # so no post-filter verification is required.
    cleaned = df[has_symbol].copy()

    final_count = len(cleaned)
    removed_count = len(df) - final_count
    print(f"[{label}] Removed {removed_count} rows with null symbols ({final_count} remaining)")

    cleaned.to_parquet(path, index=False)


def remove_null_symbols():
    """Remove rows with null symbols from crypto and stock features.

    Each feature file under ``data/merged/features`` (resolved relative to
    the current working directory) is overwritten in place when it contains
    rows whose ``symbol`` is null. Missing files are skipped silently.
    """
    features_dir = Path("data/merged/features")
    targets = (
        ("CRYPTO", "crypto_features.parquet"),
        ("STOCKS", "stocks_features.parquet"),
    )
    for label, filename in targets:
        _drop_null_symbol_rows(features_dir / filename, label)
|
|
# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    remove_null_symbols()
|
|