#!/usr/bin/env python3
"""
Remove rows with null symbols from crypto and stock features.

This script ensures that all records have valid symbols for downstream
processing.
"""

from pathlib import Path
from typing import Union

import pandas as pd

# Directory holding the feature files, relative to the working directory.
_DEFAULT_FEATURES_DIR = Path("data/merged/features")

# (log label, file name) for every feature file that gets cleaned.
_FEATURE_FILES = (
    ("CRYPTO", "crypto_features.parquet"),
    ("STOCKS", "stocks_features.parquet"),
)


def _drop_null_symbol_rows(path: Path, label: str) -> int:
    """Rewrite the parquet file at *path* without its null-symbol rows.

    The file is left untouched when it does not exist or when its
    ``symbol`` column contains no nulls.

    Args:
        path: Parquet file to clean in place.
        label: Tag used in the log line (e.g. ``"CRYPTO"``).

    Returns:
        The number of rows removed (0 when nothing was rewritten).

    Raises:
        KeyError: If the file has no ``symbol`` column.
    """
    if not path.exists():
        return 0

    frame = pd.read_parquet(path)
    null_mask = frame["symbol"].isnull()
    removed_count = int(null_mask.sum())
    if removed_count == 0:
        return 0

    cleaned = frame[~null_mask]
    final_count = len(cleaned)
    print(
        f"[{label}] Removed {removed_count} rows with null symbols "
        f"({final_count} remaining)"
    )

    # Write to a sibling temp file and rename it over the original, so a
    # failure part-way through the write cannot truncate the only copy of
    # the data.
    tmp_path = path.with_name(path.name + ".tmp")
    try:
        cleaned.to_parquet(tmp_path, index=False)
        tmp_path.replace(path)
    finally:
        # Only still present if the write or the rename failed.
        if tmp_path.exists():
            tmp_path.unlink()

    return removed_count


def remove_null_symbols(
    features_dir: Union[str, Path] = _DEFAULT_FEATURES_DIR,
) -> None:
    """Remove rows with null symbols from crypto and stock features.

    Each known feature file inside *features_dir* is cleaned in place;
    files that do not exist are skipped.

    Args:
        features_dir: Directory containing ``crypto_features.parquet`` and
            ``stocks_features.parquet``. Defaults to
            ``data/merged/features`` relative to the working directory.

    Raises:
        KeyError: If an existing feature file has no ``symbol`` column.
    """
    base_dir = Path(features_dir)
    for label, file_name in _FEATURE_FILES:
        _drop_null_symbol_rows(base_dir / file_name, label)


if __name__ == "__main__":
    remove_null_symbols()