#!/usr/bin/env python3
"""
Test script for the enhanced symbol-first null handling strategy
"""

import json
import sys
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd

# Add the merge directory to path
sys.path.append(str(Path(__file__).parent.parent))

from final_null_handler import FinalNullValueHandler

# Symbols generated by create_realistic_test_data(), grouped by asset type.
CRYPTO_SYMBOLS = ['bitcoin', 'ethereum']
STOCK_SYMBOLS = ['AAPL', 'GOOGL']


def create_realistic_test_data(seed: Optional[int] = None) -> pd.DataFrame:
    """Create realistic test data with temporal patterns and symbol-specific characteristics.

    Builds 100 hourly records for each of two crypto and two stock symbols.
    Most numeric fields are randomly replaced with NaN at a per-field rate.
    Crypto and stock records carry different keys, so the resulting frame is
    the union of both column sets and each asset type is entirely NaN in the
    columns that belong only to the other.

    Args:
        seed: Optional seed for reproducible output. When ``None`` (default)
            the global ``np.random`` state is used, exactly as before.

    Returns:
        DataFrame with one row per (symbol, timestamp) pair.
    """
    # A dedicated RandomState exposes the same methods as the np.random module,
    # so the draw order (and therefore the stream) is identical either way.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Create timestamps for the last 30 days.  A Timedelta is passed instead of
    # the '1H' alias, which newer pandas versions deprecate.
    timestamps = pd.date_range(
        start='2025-07-01', end='2025-07-30', freq=pd.Timedelta(hours=1)
    )
    # Epoch milliseconds, computed without assuming nanosecond resolution.
    timestamp_ms = (
        (timestamps - pd.Timestamp(0)) // pd.Timedelta(milliseconds=1)
    ).tolist()

    symbols = CRYPTO_SYMBOLS + STOCK_SYMBOLS

    data = []
    for symbol in symbols:
        for i, ts in enumerate(timestamp_ms[:100]):  # 100 records per symbol
            if symbol in CRYPTO_SYMBOLS:
                # Crypto data
                base_price = 50000 if symbol == 'bitcoin' else 3000
                price_trend = i * 10  # Upward trend
                price = base_price + price_trend + rng.normal(0, 500)

                # In every "value if rng.random() > p else nan" expression the
                # keep/drop draw happens first and the value draw only when kept.
                record = {
                    'symbol': symbol,
                    'interval_timestamp': ts,
                    'price': price if rng.random() > 0.2 else np.nan,  # 20% nulls
                    'volume': price * 1000 + rng.normal(0, 100000) if rng.random() > 0.15 else np.nan,
                    'marketcap': price * 19000000 if rng.random() > 0.3 else np.nan,
                    'dominance': (0.4 if symbol == 'bitcoin' else 0.15) + rng.normal(0, 0.02) if rng.random() > 0.25 else np.nan,
                    'rank': 1 if symbol == 'bitcoin' else 2,
                    'performance.day': rng.normal(0, 2) if rng.random() > 0.2 else np.nan,
                    'performance.week': rng.normal(0, 5) if rng.random() > 0.3 else np.nan,
                    'exchangePrices.binance': price * 1.001 if rng.random() > 0.4 else np.nan,
                    'exchangePrices.coinbase': price * 0.999 if rng.random() > 0.4 else np.nan,
                    'rsi': 50 + rng.normal(0, 10) if rng.random() > 0.2 else np.nan,
                    'macd': rng.normal(0, 1) if rng.random() > 0.25 else np.nan,
                    'transaction_count': 1000 + i * 5 + rng.normal(0, 100) if rng.random() > 0.3 else np.nan,
                    'stable': False,
                }
            else:
                # Stock data
                base_price = 150 if symbol == 'AAPL' else 2800
                price_trend = i * 0.5  # Modest upward trend
                price = base_price + price_trend + rng.normal(0, 5)

                record = {
                    'symbol': symbol,
                    'interval_timestamp': ts,
                    'close': price if rng.random() > 0.2 else np.nan,
                    'open': price * 0.995 if rng.random() > 0.2 else np.nan,
                    'high': price * 1.02 if rng.random() > 0.15 else np.nan,
                    'low': price * 0.98 if rng.random() > 0.15 else np.nan,
                    'volume': 1000000 + rng.normal(0, 100000) if rng.random() > 0.2 else np.nan,
                    'prev_close': price * 0.99 if rng.random() > 0.25 else np.nan,
                    'marketCapitalization': price * 15000000000 if rng.random() > 0.3 else np.nan,
                    'shareOutstanding': 15000000000 if rng.random() > 0.1 else np.nan,
                    'rsi': 50 + rng.normal(0, 15) if rng.random() > 0.2 else np.nan,
                    'macd': rng.normal(0, 0.5) if rng.random() > 0.25 else np.nan,
                    'news_sentiment_mean_x': 0.5 + rng.normal(0, 0.2) if rng.random() > 0.4 else np.nan,
                    'buy': rng.randint(3, 8) if rng.random() > 0.3 else np.nan,
                    'hold': rng.randint(8, 15) if rng.random() > 0.3 else np.nan,
                    'sell': rng.randint(1, 4) if rng.random() > 0.3 else np.nan,
                }

            data.append(record)

    return pd.DataFrame(data)


def _process_asset_group(df, mask, label, process, symbols):
    """Run one handler method over the rows selected by *mask* and write the result back into *df*.

    Args:
        df: Full dataset; rows selected by ``mask`` are overwritten in place.
        mask: Boolean Series selecting the rows of this asset group.
        label: Lower-case asset type name used in the progress message.
        process: Handler callable taking and returning a DataFrame.
        symbols: Symbols that make up this asset group.

    Returns:
        Dict with the group's null counts before and after processing.
    """
    print(f"\nProcessing {label} data ({mask.sum()} rows)...")
    subset = df[mask].copy()
    # Count BEFORE calling the handler: if it fills the frame in place, a count
    # taken afterwards would equal nulls_after and report a 0% fill rate.
    nulls_before = subset.isnull().sum().sum()
    processed = process(subset)
    # NOTE(review): assumes the handler returns a frame aligned to the input
    # index/columns; extra columns it adds would not be written back - confirm.
    df.loc[mask] = processed
    return {
        'nulls_before': nulls_before,
        'nulls_after': processed.isnull().sum().sum(),
        'symbols': list(symbols),
    }


def test_symbol_first_strategy(seed: Optional[int] = None) -> bool:
    """Test the symbol-first null handling strategy.

    Generates synthetic data, runs the crypto and stock processing methods of
    ``FinalNullValueHandler`` on their respective rows, prints a before/after
    null report, and evaluates the outcome.

    Args:
        seed: Optional seed forwarded to ``create_realistic_test_data`` for a
            reproducible run. ``None`` (default) keeps the data random.

    Returns:
        True when no nulls and no infinite values remain and every processed
        asset group ended with fewer nulls than it started with; else False.
    """
    print("="*70)
    print("TESTING ENHANCED SYMBOL-FIRST NULL HANDLING STRATEGY")
    print("="*70)

    # Create realistic test data
    print("Creating realistic test data with temporal patterns...")
    df = create_realistic_test_data(seed)

    print(f"Created dataset with {len(df)} rows and {len(df.columns)} columns")
    print(f"Symbols: {df['symbol'].unique()}")
    interval_times = pd.to_datetime(df['interval_timestamp'], unit='ms')
    print(f"Date range: {interval_times.min()} to {interval_times.max()}")

    # Snapshot the dtypes now so the quality check compares against the real
    # input rather than a freshly regenerated random dataset.
    dtypes_before = df.dtypes.to_dict()

    # Analyze null patterns before processing
    print("\nNULL ANALYSIS BEFORE PROCESSING:")
    total_nulls_before = df.isnull().sum().sum()
    print(f"Total nulls: {total_nulls_before}")

    symbol_nulls_before = {}
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol]
        symbol_nulls = symbol_data.isnull().sum().sum()
        symbol_nulls_before[symbol] = symbol_nulls
        print(f" {symbol}: {symbol_nulls} nulls ({symbol_nulls/len(symbol_data)/len(df.columns)*100:.1f}% of symbol data)")

    # Test the enhanced handler
    print("\nTESTING ENHANCED NULL HANDLER...")
    handler = FinalNullValueHandler()

    # Separate crypto and stock data for targeted processing
    crypto_mask = df['symbol'].isin(CRYPTO_SYMBOLS)
    stock_mask = df['symbol'].isin(STOCK_SYMBOLS)

    results = {}

    if crypto_mask.any():
        results['crypto'] = _process_asset_group(
            df, crypto_mask, 'crypto', handler.process_crypto_features, CRYPTO_SYMBOLS
        )

    if stock_mask.any():
        results['stock'] = _process_asset_group(
            df, stock_mask, 'stock', handler.process_stock_features, STOCK_SYMBOLS
        )

    # Analyze results
    print("\nRESULTS ANALYSIS:")
    total_nulls_after = df.isnull().sum().sum()
    print(f"Total nulls after: {total_nulls_after} (reduced by {total_nulls_before - total_nulls_after})")

    for asset_type, result in results.items():
        nulls_filled = result['nulls_before'] - result['nulls_after']
        fill_rate = (nulls_filled / result['nulls_before'] * 100) if result['nulls_before'] > 0 else 0
        print(f" {asset_type.upper()}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

    # Symbol-level analysis
    print("\nSYMBOL-LEVEL ANALYSIS:")
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol]
        nulls_after = symbol_data.isnull().sum().sum()
        nulls_filled = symbol_nulls_before[symbol] - nulls_after
        fill_rate = (nulls_filled / symbol_nulls_before[symbol] * 100) if symbol_nulls_before[symbol] > 0 else 0
        print(f" {symbol}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

    # Quality checks
    print("\nQUALITY CHECKS:")
    infinite_values = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
    print(f" Infinite values: {infinite_values}")
    # Compare the actual per-column dtypes, not merely the number of columns.
    print(f" Data types preserved: {df.dtypes.to_dict() == dtypes_before}")

    # Test temporal interpolation effectiveness
    print("\nTEMPORAL INTERPOLATION TEST:")
    if 'price' in df.columns:
        for symbol in df['symbol'].unique():
            symbol_data = df[df['symbol'] == symbol].sort_values('interval_timestamp')
            known_prices = symbol_data['price'].dropna()
            if len(known_prices) >= 2:
                # Check if we have reasonable price progression
                price_diff = known_prices.diff().abs().mean()
                print(f" {symbol}: Average price change = {price_diff:.2f} (reasonable interpolation)")

    # Overall success assessment
    success = (total_nulls_after == 0 and
               infinite_values == 0 and
               all(result['nulls_after'] < result['nulls_before'] for result in results.values()))

    if success:
        print("\n✅ ENHANCED SYMBOL-FIRST STRATEGY TEST PASSED!")
        print(" - All nulls handled successfully")
        print(" - No infinite values introduced")
        print(" - Symbol-specific patterns preserved")
        print(" - Temporal interpolation working")
        return True

    print("\n❌ Test failed - review results above")
    return False


def main() -> int:
    """Main test function.

    Returns:
        Process exit code: 0 when the test passes, 1 when it fails or raises.
    """
    try:
        success = test_symbol_first_strategy()
        return 0 if success else 1
    except Exception as e:
        # Top-level boundary of the script: report the error and signal failure
        # through the exit code instead of propagating.
        print(f"❌ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return 1


if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)