|
|
|
""" |
|
Test script for the enhanced symbol-first null handling strategy |
|
""" |
|
|
|
import pandas as pd |
|
import numpy as np |
|
import sys |
|
from pathlib import Path |
|
import json |
|
|
|
|
|
sys.path.append(str(Path(__file__).parent.parent)) |
|
|
|
from final_null_handler import FinalNullValueHandler |
|
|
|
def create_realistic_test_data(seed=None):
    """Create synthetic crypto and stock rows with randomly injected NaNs.

    For each of four symbols ('bitcoin', 'ethereum', 'AAPL', 'GOOGL') this
    emits 100 hourly records starting at 2025-07-01 00:00. Crypto and stock
    records use different field sets, so the resulting frame holds the union
    of both column sets and every row is NaN in the other asset type's
    columns. Prices follow a linear upward trend plus Gaussian noise, and most
    fields are independently replaced by NaN with a per-field probability.

    Args:
        seed: Optional seed. When given, all randomness comes from a private
            ``np.random.RandomState(seed)``, so the output is reproducible and
            the global NumPy random state is left untouched. When None (the
            default), the global ``np.random`` state is used.

    Returns:
        pandas.DataFrame: 400 rows, one per (symbol, timestamp) pair, with
        'interval_timestamp' holding integer milliseconds since the Unix epoch.
    """
    # The np.random module and RandomState expose the same normal/random/randint
    # functions, so either can back the draws below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # A Timedelta step avoids the 'H' frequency alias, which pandas 2.2 deprecated.
    timestamps = pd.date_range(
        start='2025-07-01', end='2025-07-30', freq=pd.Timedelta(hours=1)
    )
    # Epoch milliseconds via timedelta division, so the result does not depend
    # on the resolution the index happens to be stored in.
    epoch = pd.Timestamp('1970-01-01')
    timestamp_ms = ((timestamps - epoch) // pd.Timedelta(milliseconds=1)).tolist()

    symbols = ['bitcoin', 'ethereum', 'AAPL', 'GOOGL']
    crypto_symbols = ('bitcoin', 'ethereum')
    data = []

    # NOTE: each conditional expression draws its keep/drop coin first and only
    # then (when kept) draws the value; keep that order so a given seed keeps
    # producing the same stream.
    for symbol in symbols:
        for i, ts in enumerate(timestamp_ms[:100]):
            if symbol in crypto_symbols:
                base_price = 50000 if symbol == 'bitcoin' else 3000
                price_trend = i * 10
                price = base_price + price_trend + rng.normal(0, 500)

                record = {
                    'symbol': symbol,
                    'interval_timestamp': ts,
                    'price': price if rng.random() > 0.2 else np.nan,
                    'volume': price * 1000 + rng.normal(0, 100000) if rng.random() > 0.15 else np.nan,
                    'marketcap': price * 19000000 if rng.random() > 0.3 else np.nan,
                    'dominance': (0.4 if symbol == 'bitcoin' else 0.15) + rng.normal(0, 0.02) if rng.random() > 0.25 else np.nan,
                    'rank': 1 if symbol == 'bitcoin' else 2,
                    'performance.day': rng.normal(0, 2) if rng.random() > 0.2 else np.nan,
                    'performance.week': rng.normal(0, 5) if rng.random() > 0.3 else np.nan,
                    'exchangePrices.binance': price * 1.001 if rng.random() > 0.4 else np.nan,
                    'exchangePrices.coinbase': price * 0.999 if rng.random() > 0.4 else np.nan,
                    'rsi': 50 + rng.normal(0, 10) if rng.random() > 0.2 else np.nan,
                    'macd': rng.normal(0, 1) if rng.random() > 0.25 else np.nan,
                    'transaction_count': 1000 + i * 5 + rng.normal(0, 100) if rng.random() > 0.3 else np.nan,
                    'stable': False,
                }
            else:
                base_price = 150 if symbol == 'AAPL' else 2800
                price_trend = i * 0.5
                price = base_price + price_trend + rng.normal(0, 5)

                record = {
                    'symbol': symbol,
                    'interval_timestamp': ts,
                    'close': price if rng.random() > 0.2 else np.nan,
                    'open': price * 0.995 if rng.random() > 0.2 else np.nan,
                    'high': price * 1.02 if rng.random() > 0.15 else np.nan,
                    'low': price * 0.98 if rng.random() > 0.15 else np.nan,
                    'volume': 1000000 + rng.normal(0, 100000) if rng.random() > 0.2 else np.nan,
                    'prev_close': price * 0.99 if rng.random() > 0.25 else np.nan,
                    'marketCapitalization': price * 15000000000 if rng.random() > 0.3 else np.nan,
                    'shareOutstanding': 15000000000 if rng.random() > 0.1 else np.nan,
                    'rsi': 50 + rng.normal(0, 15) if rng.random() > 0.2 else np.nan,
                    'macd': rng.normal(0, 0.5) if rng.random() > 0.25 else np.nan,
                    'news_sentiment_mean_x': 0.5 + rng.normal(0, 0.2) if rng.random() > 0.4 else np.nan,
                    'buy': rng.randint(3, 8) if rng.random() > 0.3 else np.nan,
                    'hold': rng.randint(8, 15) if rng.random() > 0.3 else np.nan,
                    'sell': rng.randint(1, 4) if rng.random() > 0.3 else np.nan,
                }

            data.append(record)

    return pd.DataFrame(data)
|
|
|
def test_symbol_first_strategy():
    """Run the symbol-first null handler over synthetic data and report the outcome.

    Builds a frame with ``create_realistic_test_data``, sends the crypto rows
    through ``FinalNullValueHandler.process_crypto_features`` and the stock
    rows through ``process_stock_features``, writes each result back into the
    frame, and prints null statistics before and after.

    Returns:
        bool: True when the combined frame has no nulls and no infinite
        values left and every processed asset group ended with fewer nulls
        than it started with; False otherwise.
    """
    crypto_symbols = ['bitcoin', 'ethereum']
    stock_symbols = ['AAPL', 'GOOGL']

    banner = "=" * 70
    print(banner)
    print("TESTING ENHANCED SYMBOL-FIRST NULL HANDLING STRATEGY")
    print(banner)

    print("Creating realistic test data with temporal patterns...")
    df = create_realistic_test_data()
    # Snapshot taken now so the dtype check below compares before vs. after.
    dtypes_before = df.dtypes.copy()

    print(f"Created dataset with {len(df)} rows and {len(df.columns)} columns")
    print(f"Symbols: {df['symbol'].unique()}")
    interval_times = pd.to_datetime(df['interval_timestamp'], unit='ms')
    print(f"Date range: {interval_times.min()} to {interval_times.max()}")

    print("\nNULL ANALYSIS BEFORE PROCESSING:")
    total_nulls_before = df.isnull().sum().sum()
    print(f"Total nulls: {total_nulls_before}")

    symbol_nulls_before = {}
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol]
        symbol_nulls = symbol_data.isnull().sum().sum()
        symbol_nulls_before[symbol] = symbol_nulls
        print(f" {symbol}: {symbol_nulls} nulls ({symbol_nulls/len(symbol_data)/len(df.columns)*100:.1f}% of symbol data)")

    print("\nTESTING ENHANCED NULL HANDLER...")
    handler = FinalNullValueHandler()

    # Both masks are computed up front, before any processed rows are written back.
    asset_groups = (
        ('crypto', crypto_symbols, df['symbol'].isin(crypto_symbols), 'process_crypto_features'),
        ('stock', stock_symbols, df['symbol'].isin(stock_symbols), 'process_stock_features'),
    )

    results = {}
    for asset_type, group_symbols, mask, method_name in asset_groups:
        if not mask.any():
            continue

        print(f"\nProcessing {asset_type} data ({mask.sum()} rows)...")
        subset = df[mask].copy()
        # Count before calling the handler: if it fills its input in place,
        # counting afterwards would under-report the starting nulls.
        nulls_before = subset.isnull().sum().sum()
        processed = getattr(handler, method_name)(subset)
        df.loc[mask] = processed

        results[asset_type] = {
            'nulls_before': nulls_before,
            'nulls_after': processed.isnull().sum().sum(),
            'symbols': list(group_symbols),
        }

    print("\nRESULTS ANALYSIS:")
    total_nulls_after = df.isnull().sum().sum()
    print(f"Total nulls after: {total_nulls_after} (reduced by {total_nulls_before - total_nulls_after})")

    for asset_type, result in results.items():
        nulls_filled = result['nulls_before'] - result['nulls_after']
        fill_rate = (nulls_filled / result['nulls_before'] * 100) if result['nulls_before'] > 0 else 0
        print(f" {asset_type.upper()}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

    print("\nSYMBOL-LEVEL ANALYSIS:")
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol]
        nulls_after = symbol_data.isnull().sum().sum()
        nulls_filled = symbol_nulls_before[symbol] - nulls_after
        fill_rate = (nulls_filled / symbol_nulls_before[symbol] * 100) if symbol_nulls_before[symbol] > 0 else 0
        print(f" {symbol}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

    print("\nQUALITY CHECKS:")
    infinite_values = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
    print(f" Infinite values: {infinite_values}")
    # Compare actual per-column dtypes against the pre-processing snapshot;
    # comparing only the number of columns would report True even if every
    # dtype had changed.
    print(f" Data types preserved: {df.dtypes.equals(dtypes_before)}")

    print("\nTEMPORAL INTERPOLATION TEST:")
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol].sort_values('interval_timestamp')
        if 'price' not in symbol_data.columns:
            continue
        observed_prices = symbol_data['price'].dropna()
        if len(observed_prices) >= 2:
            price_diff = observed_prices.diff().abs().mean()
            print(f" {symbol}: Average price change = {price_diff:.2f} (reasonable interpolation)")

    # NOTE(review): the frame holds the union of crypto and stock columns, so
    # total_nulls_after == 0 only holds if the handler also fills columns that
    # are entirely NaN for a group — confirm that is the intended contract.
    success = (total_nulls_after == 0 and
               infinite_values == 0 and
               all(result['nulls_after'] < result['nulls_before'] for result in results.values()))

    if success:
        print("\n✅ ENHANCED SYMBOL-FIRST STRATEGY TEST PASSED!")
        print(" - All nulls handled successfully")
        print(" - No infinite values introduced")
        print(" - Symbol-specific patterns preserved")
        print(" - Temporal interpolation working")
        return True
    else:
        print("\n❌ Test failed - review results above")
        return False
|
|
|
def main():
    """Run the strategy test and map its outcome to a process exit code.

    Returns:
        int: 0 when ``test_symbol_first_strategy`` returns a truthy value,
        1 when it returns a falsy value or raises any ``Exception`` (the
        error message and traceback are printed in that case).
    """
    try:
        passed = test_symbol_first_strategy()
    except Exception as exc:
        # Top-level boundary: report the failure instead of letting it propagate.
        print("❌ Test failed with error: " + str(exc))
        import traceback
        traceback.print_exc()
        return 1

    if passed:
        return 0
    return 1
|
|
|
# Script entry point: hand main()'s status code to the shell.
if __name__ == "__main__":
    sys.exit(main())
|
|