advisorai-data-enhanced / src /merge /remove_null_symbols.py
Maaroufabousaleh
f
c49b21b
raw
history blame
2.44 kB
#!/usr/bin/env python3
"""
Remove rows with null symbols from crypto and stock features.
This script ensures that all records have valid symbols for downstream processing.
"""
import pandas as pd
from pathlib import Path
def _drop_null_symbol_rows(path: Path, label: str) -> int:
    """Rewrite the parquet file at *path* without its null-symbol rows.

    Args:
        path: Location of a parquet feature file that has a ``symbol`` column.
        label: Tag used in the progress message (e.g. ``"CRYPTO"``).

    Returns:
        The number of rows removed. ``0`` when the file does not exist or
        already has no null symbols; the file is left untouched in both cases.

    Raises:
        KeyError: If the file has no ``symbol`` column.
    """
    if not path.exists():
        return 0

    df = pd.read_parquet(path)
    null_mask = df["symbol"].isnull()
    removed_count = int(null_mask.sum())
    if removed_count == 0:
        # Nothing to drop: skip the rewrite so a clean file is not touched.
        return 0

    # Selecting on the inverted mask guarantees that no null symbols remain,
    # so no post-filter re-check is needed.
    df_clean = df[~null_mask]
    print(f"[{label}] Removed {removed_count} rows with null symbols ({len(df_clean)} remaining)")
    df_clean.to_parquet(path, index=False)
    return removed_count


def remove_null_symbols() -> None:
    """Remove rows with null symbols from crypto and stock features.

    Both feature files live under ``data/merged/features`` (resolved against
    the current working directory) and are cleaned in place. A file that does
    not exist is skipped silently.
    """
    features_dir = Path("data/merged/features")
    targets = (
        ("CRYPTO", "crypto_features.parquet"),
        ("STOCKS", "stocks_features.parquet"),
    )
    for label, filename in targets:
        _drop_null_symbol_rows(features_dir / filename, label)
# Script entry point: clean the feature files only when this module is
# executed directly, not when it is imported.
if __name__ == "__main__":
    remove_null_symbols()