advisorai-data-enhanced / src /merge /test_enhanced_null_handling.py
Maaroufabousaleh
f
c49b21b
raw
history blame
9.78 kB
#!/usr/bin/env python3
"""
Test script for the enhanced symbol-first null handling strategy
"""
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import json
# Add the parent of this file's directory to the import path
sys.path.append(str(Path(__file__).parent.parent))
from final_null_handler import FinalNullValueHandler
def create_realistic_test_data(*, seed=None, records_per_symbol=100):
    """Create realistic test data with temporal patterns and symbol-specific characteristics.

    Builds hourly records for two crypto symbols ('bitcoin', 'ethereum') and
    two stock symbols ('AAPL', 'GOOGL'). Prices follow a linear upward trend
    plus Gaussian noise, and most feature values are independently replaced
    by NaN with a per-feature probability so a null handler has gaps to fill.

    Args:
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` drives all randomness so the
            output is reproducible. When ``None`` (the default) the global
            ``np.random`` state is used, exactly as before.
        records_per_symbol: Maximum number of hourly records generated per
            symbol (default 100). It is capped by the number of hourly
            points in the fixed 2025-07-01 .. 2025-07-30 window.

    Returns:
        pandas.DataFrame with one row per (symbol, timestamp). Crypto and
        stock rows carry different feature keys, so the frame holds the
        union of both column sets and a column that does not apply to a
        row's asset class is NaN for that row.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    def _present(null_probability):
        # True when the value should be kept, False when it becomes a null.
        return rng.random() > null_probability

    # Fixed hourly window. A Timedelta frequency avoids the deprecated
    # upper-case 'H' alias.
    timestamps = pd.date_range(
        start='2025-07-01', end='2025-07-30', freq=pd.Timedelta(hours=1)
    )
    # Epoch milliseconds computed by subtraction so the result does not
    # depend on the index's internal resolution (ns, us, ...).
    epoch = pd.Timestamp('1970-01-01')
    timestamp_ms = ((timestamps - epoch) // pd.Timedelta(milliseconds=1)).tolist()
    timestamp_ms = timestamp_ms[:max(records_per_symbol, 0)]

    crypto_symbols = ('bitcoin', 'ethereum')
    symbols = ['bitcoin', 'ethereum', 'AAPL', 'GOOGL']
    data = []

    for symbol in symbols:
        for i, ts in enumerate(timestamp_ms):
            if symbol in crypto_symbols:
                # Crypto data
                base_price = 50000 if symbol == 'bitcoin' else 3000
                price_trend = i * 10  # Upward trend
                price = base_price + price_trend + rng.normal(0, 500)
                record = {
                    'symbol': symbol,
                    'interval_timestamp': ts,
                    'price': price if _present(0.2) else np.nan,  # 20% nulls
                    'volume': price * 1000 + rng.normal(0, 100000) if _present(0.15) else np.nan,
                    'marketcap': price * 19000000 if _present(0.3) else np.nan,
                    'dominance': (0.4 if symbol == 'bitcoin' else 0.15) + rng.normal(0, 0.02) if _present(0.25) else np.nan,
                    'rank': 1 if symbol == 'bitcoin' else 2,
                    'performance.day': rng.normal(0, 2) if _present(0.2) else np.nan,
                    'performance.week': rng.normal(0, 5) if _present(0.3) else np.nan,
                    'exchangePrices.binance': price * 1.001 if _present(0.4) else np.nan,
                    'exchangePrices.coinbase': price * 0.999 if _present(0.4) else np.nan,
                    'rsi': 50 + rng.normal(0, 10) if _present(0.2) else np.nan,
                    'macd': rng.normal(0, 1) if _present(0.25) else np.nan,
                    'transaction_count': 1000 + i * 5 + rng.normal(0, 100) if _present(0.3) else np.nan,
                    'stable': False,
                }
            else:
                # Stock data
                base_price = 150 if symbol == 'AAPL' else 2800
                price_trend = i * 0.5  # Modest upward trend
                price = base_price + price_trend + rng.normal(0, 5)
                record = {
                    'symbol': symbol,
                    'interval_timestamp': ts,
                    'close': price if _present(0.2) else np.nan,
                    'open': price * 0.995 if _present(0.2) else np.nan,
                    'high': price * 1.02 if _present(0.15) else np.nan,
                    'low': price * 0.98 if _present(0.15) else np.nan,
                    'volume': 1000000 + rng.normal(0, 100000) if _present(0.2) else np.nan,
                    'prev_close': price * 0.99 if _present(0.25) else np.nan,
                    'marketCapitalization': price * 15000000000 if _present(0.3) else np.nan,
                    'shareOutstanding': 15000000000 if _present(0.1) else np.nan,
                    'rsi': 50 + rng.normal(0, 15) if _present(0.2) else np.nan,
                    'macd': rng.normal(0, 0.5) if _present(0.25) else np.nan,
                    'news_sentiment_mean_x': 0.5 + rng.normal(0, 0.2) if _present(0.4) else np.nan,
                    'buy': rng.randint(3, 8) if _present(0.3) else np.nan,
                    'hold': rng.randint(8, 15) if _present(0.3) else np.nan,
                    'sell': rng.randint(1, 4) if _present(0.3) else np.nan,
                }
            data.append(record)

    return pd.DataFrame(data)
def test_symbol_first_strategy():
    """Test the symbol-first null handling strategy.

    Generates synthetic crypto and stock data, runs the crypto rows through
    ``FinalNullValueHandler.process_crypto_features`` and the stock rows
    through ``process_stock_features``, writes the processed rows back into
    the combined frame, and prints a before/after null report.

    Returns:
        bool: True when the combined frame has no nulls left, no infinite
        numeric values are present, and every processed asset group ended
        with strictly fewer nulls than it started with; False otherwise.
    """

    def _count_nulls(frame):
        # Total number of null cells across every column of the frame.
        return frame.isnull().sum().sum()

    banner = "=" * 70
    print(banner)
    print("TESTING ENHANCED SYMBOL-FIRST NULL HANDLING STRATEGY")
    print(banner)

    # Create realistic test data
    print("Creating realistic test data with temporal patterns...")
    df = create_realistic_test_data()
    print(f"Created dataset with {len(df)} rows and {len(df.columns)} columns")
    print(f"Symbols: {df['symbol'].unique()}")
    interval_times = pd.to_datetime(df['interval_timestamp'], unit='ms')
    print(f"Date range: {interval_times.min()} to {interval_times.max()}")

    # Snapshot the column dtypes now so the quality check below compares
    # against this exact dataset rather than a regenerated one.
    dtypes_before = df.dtypes.copy()

    # Analyze null patterns before processing
    print("\nNULL ANALYSIS BEFORE PROCESSING:")
    total_nulls_before = _count_nulls(df)
    print(f"Total nulls: {total_nulls_before}")
    symbol_nulls_before = {}
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol]
        symbol_nulls = _count_nulls(symbol_data)
        symbol_nulls_before[symbol] = symbol_nulls
        print(f" {symbol}: {symbol_nulls} nulls ({symbol_nulls/len(symbol_data)/len(df.columns)*100:.1f}% of symbol data)")

    # Test the enhanced handler
    print("\nTESTING ENHANCED NULL HANDLER...")
    handler = FinalNullValueHandler()

    # Separate crypto and stock data for targeted processing. Both masks are
    # built before any write-back so they reflect the untouched symbol column.
    asset_groups = [
        ('crypto', ['bitcoin', 'ethereum'], df['symbol'].isin(['bitcoin', 'ethereum'])),
        ('stock', ['AAPL', 'GOOGL'], df['symbol'].isin(['AAPL', 'GOOGL'])),
    ]
    results = {}
    for asset_type, group_symbols, mask in asset_groups:
        if not mask.any():
            continue
        print(f"\nProcessing {asset_type} data ({mask.sum()} rows)...")
        subset = df[mask].copy()
        # Measure BEFORE calling the handler: if the handler fills nulls in
        # place on the frame it receives, counting afterwards would report
        # before == after and wrongly fail the success check.
        nulls_before = _count_nulls(subset)
        if asset_type == 'crypto':
            processed = handler.process_crypto_features(subset)
        else:
            processed = handler.process_stock_features(subset)
        df.loc[mask] = processed
        results[asset_type] = {
            'nulls_before': nulls_before,
            'nulls_after': _count_nulls(processed),
            'symbols': group_symbols,
        }

    # Analyze results
    print("\nRESULTS ANALYSIS:")
    total_nulls_after = _count_nulls(df)
    print(f"Total nulls after: {total_nulls_after} (reduced by {total_nulls_before - total_nulls_after})")
    for asset_type, result in results.items():
        nulls_filled = result['nulls_before'] - result['nulls_after']
        fill_rate = (nulls_filled / result['nulls_before'] * 100) if result['nulls_before'] > 0 else 0
        print(f" {asset_type.upper()}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

    # Symbol-level analysis
    print("\nSYMBOL-LEVEL ANALYSIS:")
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol]
        nulls_after = _count_nulls(symbol_data)
        nulls_filled = symbol_nulls_before[symbol] - nulls_after
        fill_rate = (nulls_filled / symbol_nulls_before[symbol] * 100) if symbol_nulls_before[symbol] > 0 else 0
        print(f" {symbol}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

    # Quality checks
    print("\nQUALITY CHECKS:")
    infinite_values = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
    print(f" Infinite values: {infinite_values}")
    # Compare actual per-column dtypes against the pre-processing snapshot;
    # comparing only the number of columns would always report True.
    print(f" Data types preserved: {df.dtypes.equals(dtypes_before)}")

    # Test temporal interpolation effectiveness
    print("\nTEMPORAL INTERPOLATION TEST:")
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol].sort_values('interval_timestamp')
        if 'price' in symbol_data.columns:
            known_prices = symbol_data['price'].dropna()
            if len(known_prices) >= 2:
                # Mean absolute step between consecutive known prices
                price_diff = known_prices.diff().abs().mean()
                print(f" {symbol}: Average price change = {price_diff:.2f} (reasonable interpolation)")

    # Overall success assessment
    success = (total_nulls_after == 0 and
               infinite_values == 0 and
               all(result['nulls_after'] < result['nulls_before'] for result in results.values()))
    if success:
        print("\n✅ ENHANCED SYMBOL-FIRST STRATEGY TEST PASSED!")
        print(" - All nulls handled successfully")
        print(" - No infinite values introduced")
        print(" - Symbol-specific patterns preserved")
        print(" - Temporal interpolation working")
        return True
    else:
        print("\n❌ Test failed - review results above")
        return False
def main():
    """Run the strategy test and translate its outcome into an exit code.

    Returns:
        int: 0 when ``test_symbol_first_strategy`` reports success, 1 when it
        reports failure or raises any ``Exception`` (the error message and
        traceback are printed in that case).
    """
    import traceback

    try:
        if test_symbol_first_strategy():
            return 0
        return 1
    except Exception as exc:
        print(f"❌ Test failed with error: {exc}")
        traceback.print_exc()
        return 1
# Script entry point: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())