File size: 9,783 Bytes
c49b21b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 |
#!/usr/bin/env python3
"""
Test script for the enhanced symbol-first null handling strategy
"""
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import json
# Add the merge directory to path
sys.path.append(str(Path(__file__).parent.parent))
from final_null_handler import FinalNullValueHandler
def create_realistic_test_data(seed=None, records_per_symbol=100):
    """Build a synthetic crypto/stock frame with randomly injected nulls.

    Two crypto symbols (``bitcoin``, ``ethereum``) and two stock symbols
    (``AAPL``, ``GOOGL``) each receive ``records_per_symbol`` hourly records
    starting at 2025-07-01 00:00. Prices follow a linear upward trend plus
    Gaussian noise, and most feature values are replaced by ``NaN`` with a
    per-field probability. Crypto and stock records use different key sets,
    so fields that a record does not define come out as ``NaN`` in the
    combined frame.

    Args:
        seed: Optional integer seed. When given, values are drawn from a
            private ``numpy.random.RandomState(seed)`` so the output is
            reproducible and the global NumPy RNG is left untouched. When
            ``None`` (the default) the global ``numpy.random`` state is used.
        records_per_symbol: Number of hourly records generated per symbol.

    Returns:
        pandas.DataFrame: ``4 * records_per_symbol`` rows; the
        ``interval_timestamp`` column holds epoch milliseconds.
    """
    # ``np.random`` and ``RandomState`` both expose random/normal/randint,
    # so the rest of the function is agnostic to which one is in use.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def maybe_null(value, null_rate):
        """Return ``value``, or NaN with probability ``null_rate``."""
        return value if rng.random() > null_rate else np.nan

    # 'h' replaces the 'H' alias, which pandas 2.2 deprecated.
    timestamps = pd.date_range(
        start='2025-07-01', periods=records_per_symbol, freq='h'
    )
    # Timedelta division gives epoch milliseconds regardless of the
    # resolution (ns/us/...) the index happens to be stored in.
    epoch = pd.Timestamp('1970-01-01')
    timestamp_ms = ((timestamps - epoch) // pd.Timedelta(milliseconds=1)).tolist()

    crypto_symbols = ('bitcoin', 'ethereum')
    symbols = ['bitcoin', 'ethereum', 'AAPL', 'GOOGL']

    data = []
    for symbol in symbols:
        for i, ts in enumerate(timestamp_ms):
            if symbol in crypto_symbols:
                # Crypto data: steep upward trend with large noise.
                is_bitcoin = symbol == 'bitcoin'
                base_price = 50000 if is_bitcoin else 3000
                price = base_price + i * 10 + rng.normal(0, 500)
                record = {
                    'symbol': symbol,
                    'interval_timestamp': ts,
                    'price': maybe_null(price, 0.2),
                    'volume': maybe_null(price * 1000 + rng.normal(0, 100000), 0.15),
                    'marketcap': maybe_null(price * 19000000, 0.3),
                    'dominance': maybe_null(
                        (0.4 if is_bitcoin else 0.15) + rng.normal(0, 0.02), 0.25
                    ),
                    'rank': 1 if is_bitcoin else 2,
                    'performance.day': maybe_null(rng.normal(0, 2), 0.2),
                    'performance.week': maybe_null(rng.normal(0, 5), 0.3),
                    'exchangePrices.binance': maybe_null(price * 1.001, 0.4),
                    'exchangePrices.coinbase': maybe_null(price * 0.999, 0.4),
                    'rsi': maybe_null(50 + rng.normal(0, 10), 0.2),
                    'macd': maybe_null(rng.normal(0, 1), 0.25),
                    'transaction_count': maybe_null(
                        1000 + i * 5 + rng.normal(0, 100), 0.3
                    ),
                    'stable': False,
                }
            else:
                # Stock data: modest upward trend with small noise.
                base_price = 150 if symbol == 'AAPL' else 2800
                price = base_price + i * 0.5 + rng.normal(0, 5)
                record = {
                    'symbol': symbol,
                    'interval_timestamp': ts,
                    'close': maybe_null(price, 0.2),
                    'open': maybe_null(price * 0.995, 0.2),
                    'high': maybe_null(price * 1.02, 0.15),
                    'low': maybe_null(price * 0.98, 0.15),
                    'volume': maybe_null(1000000 + rng.normal(0, 100000), 0.2),
                    'prev_close': maybe_null(price * 0.99, 0.25),
                    'marketCapitalization': maybe_null(price * 15000000000, 0.3),
                    'shareOutstanding': maybe_null(15000000000, 0.1),
                    'rsi': maybe_null(50 + rng.normal(0, 15), 0.2),
                    'macd': maybe_null(rng.normal(0, 0.5), 0.25),
                    'news_sentiment_mean_x': maybe_null(
                        0.5 + rng.normal(0, 0.2), 0.4
                    ),
                    'buy': maybe_null(rng.randint(3, 8), 0.3),
                    'hold': maybe_null(rng.randint(8, 15), 0.3),
                    'sell': maybe_null(rng.randint(1, 4), 0.3),
                }
            data.append(record)
    return pd.DataFrame(data)
def test_symbol_first_strategy():
    """Run the null handler on synthetic data and print a result report.

    Generates a frame with ``create_realistic_test_data()``, passes copies of
    the crypto rows and the stock rows to
    ``FinalNullValueHandler.process_crypto_features`` and
    ``process_stock_features`` respectively, writes the processed rows back
    into the frame, and prints null counts before and after, per asset type
    and per symbol, plus a few sanity checks.

    Returns:
        bool: True when no nulls and no infinite values remain in the frame
        and every processed asset type ended with strictly fewer nulls than
        it started with; False otherwise. ``main()`` maps this to the
        process exit code.
    """
    banner = "=" * 70
    print(banner)
    print("TESTING ENHANCED SYMBOL-FIRST NULL HANDLING STRATEGY")
    print(banner)

    # Create realistic test data
    print("Creating realistic test data with temporal patterns...")
    df = create_realistic_test_data()
    print(f"Created dataset with {len(df)} rows and {len(df.columns)} columns")
    print(f"Symbols: {df['symbol'].unique()}")
    observed_at = pd.to_datetime(df['interval_timestamp'], unit='ms')
    print(f"Date range: {observed_at.min()} to {observed_at.max()}")

    # Snapshot the dtypes now so the quality check below compares against
    # the real pre-processing state instead of a regenerated frame.
    original_dtypes = list(df.dtypes)

    # Analyze null patterns before processing
    print("\nNULL ANALYSIS BEFORE PROCESSING:")
    total_nulls_before = df.isnull().sum().sum()
    print(f"Total nulls: {total_nulls_before}")
    symbol_nulls_before = {}
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol]
        symbol_nulls = symbol_data.isnull().sum().sum()
        symbol_nulls_before[symbol] = symbol_nulls
        # Share of null cells among all cells belonging to this symbol.
        null_pct = symbol_nulls / len(symbol_data) / len(df.columns) * 100
        print(f" {symbol}: {symbol_nulls} nulls ({null_pct:.1f}% of symbol data)")

    # Test the enhanced handler
    print("\nTESTING ENHANCED NULL HANDLER...")
    handler = FinalNullValueHandler()

    # Crypto and stock rows are processed separately, each by its own
    # handler method.
    asset_groups = (
        ('crypto', ['bitcoin', 'ethereum'], handler.process_crypto_features),
        ('stock', ['AAPL', 'GOOGL'], handler.process_stock_features),
    )
    results = {}
    for asset_type, group_symbols, process in asset_groups:
        mask = df['symbol'].isin(group_symbols)
        if not mask.any():
            continue
        print(f"\nProcessing {asset_type} data ({mask.sum()} rows)...")
        subset = df[mask].copy()
        # Count before the call: the handler may fill the copy in place,
        # which would otherwise make "before" equal "after".
        nulls_before = subset.isnull().sum().sum()
        processed = process(subset)
        df.loc[mask] = processed
        results[asset_type] = {
            'nulls_before': nulls_before,
            'nulls_after': processed.isnull().sum().sum(),
            'symbols': group_symbols,
        }

    # Analyze results
    print("\nRESULTS ANALYSIS:")
    total_nulls_after = df.isnull().sum().sum()
    print(f"Total nulls after: {total_nulls_after} (reduced by {total_nulls_before - total_nulls_after})")
    for asset_type, result in results.items():
        nulls_filled = result['nulls_before'] - result['nulls_after']
        fill_rate = (nulls_filled / result['nulls_before'] * 100) if result['nulls_before'] > 0 else 0
        print(f" {asset_type.upper()}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

    # Symbol-level analysis
    print("\nSYMBOL-LEVEL ANALYSIS:")
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol]
        nulls_after = symbol_data.isnull().sum().sum()
        nulls_filled = symbol_nulls_before[symbol] - nulls_after
        fill_rate = (nulls_filled / symbol_nulls_before[symbol] * 100) if symbol_nulls_before[symbol] > 0 else 0
        print(f" {symbol}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

    # Quality checks
    print("\nQUALITY CHECKS:")
    infinite_values = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
    print(f" Infinite values: {infinite_values}")
    dtypes_preserved = list(df.dtypes) == original_dtypes
    print(f" Data types preserved: {dtypes_preserved}")

    # Test temporal interpolation effectiveness
    print("\nTEMPORAL INTERPOLATION TEST:")
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol].sort_values('interval_timestamp')
        if 'price' not in symbol_data.columns:
            continue
        known_prices = symbol_data['price'].dropna()
        if len(known_prices) >= 2:
            # Mean absolute step between consecutive known prices.
            price_diff = known_prices.diff().abs().mean()
            print(f" {symbol}: Average price change = {price_diff:.2f} (reasonable interpolation)")

    # Overall success assessment
    success = (
        total_nulls_after == 0
        and infinite_values == 0
        and all(result['nulls_after'] < result['nulls_before'] for result in results.values())
    )
    if success:
        print("\n✅ ENHANCED SYMBOL-FIRST STRATEGY TEST PASSED!")
        print(" - All nulls handled successfully")
        print(" - No infinite values introduced")
        print(" - Symbol-specific patterns preserved")
        print(" - Temporal interpolation working")
        return True

    print("\n❌ Test failed - review results above")
    return False
def main():
    """Run the strategy test and translate its outcome into an exit code.

    Returns:
        int: 0 when ``test_symbol_first_strategy()`` reports success; 1 when
        it reports failure or raises.
    """
    import traceback

    try:
        passed = test_symbol_first_strategy()
    except Exception as e:
        # Top-level boundary: report the error with its traceback and signal
        # failure through the exit code rather than propagating.
        print(f"❌ Test failed with error: {str(e)}")
        traceback.print_exc()
        return 1

    if passed:
        return 0
    return 1
if __name__ == "__main__":
    # Hand the test outcome to the shell as the process exit status.
    sys.exit(main())
|