Spaces:

maaroufabousaleh
/

advisorai-data-enhanced

Sleeping

File size: 9,783 Bytes

c49b21b

#!/usr/bin/env python3
"""
Test script for the enhanced symbol-first null handling strategy
"""

import pandas as pd
import numpy as np
import sys
from pathlib import Path
import json

# Add the merge directory to path
sys.path.append(str(Path(__file__).parent.parent))

from final_null_handler import FinalNullValueHandler

def create_realistic_test_data(seed=None):
    """Build a synthetic crypto/stock DataFrame with randomly injected nulls.

    Produces 100 hourly records, starting at 2025-07-01 00:00, for each of the
    symbols ``bitcoin``, ``ethereum``, ``AAPL`` and ``GOOGL``.  Crypto and
    stock rows use different column sets, so the combined frame contains NaN
    wherever a column does not exist for a row's asset class, on top of the
    nulls injected at random per field.

    Args:
        seed: Optional seed for the random generator.  ``None`` (the default)
            draws fresh entropy on every call; pass an integer to get a
            reproducible dataset.

    Returns:
        pandas.DataFrame with one row per (symbol, timestamp) pair.  The
        ``interval_timestamp`` column holds epoch milliseconds.
    """
    records_per_symbol = 100

    # A local generator keeps the data reproducible when seeded and avoids
    # touching NumPy's global random state.
    rng = np.random.default_rng(seed)

    def keep(null_rate):
        """Return True when a field should keep its value (not become NaN)."""
        return rng.random() > null_rate

    # A Timedelta frequency avoids the deprecated uppercase 'H' alias.
    timestamps = pd.date_range(
        start='2025-07-01',
        periods=records_per_symbol,
        freq=pd.Timedelta(hours=1),
    )
    # Epoch milliseconds computed independently of the index's internal
    # resolution (casting to int64 silently assumes nanoseconds).
    epoch = pd.Timestamp('1970-01-01')
    timestamp_ms = ((timestamps - epoch) // pd.Timedelta(milliseconds=1)).tolist()

    symbols = ['bitcoin', 'ethereum', 'AAPL', 'GOOGL']
    data = []

    for symbol in symbols:
        for i, ts in enumerate(timestamp_ms):

            if symbol in ['bitcoin', 'ethereum']:
                # Crypto data
                base_price = 50000 if symbol == 'bitcoin' else 3000
                price_trend = i * 10  # Upward trend
                price = base_price + price_trend + rng.normal(0, 500)

                record = {
                    'symbol': symbol,
                    'interval_timestamp': ts,
                    'price': price if keep(0.2) else np.nan,  # 20% nulls
                    'volume': price * 1000 + rng.normal(0, 100000) if keep(0.15) else np.nan,
                    'marketcap': price * 19000000 if keep(0.3) else np.nan,
                    'dominance': (0.4 if symbol == 'bitcoin' else 0.15) + rng.normal(0, 0.02) if keep(0.25) else np.nan,
                    'rank': 1 if symbol == 'bitcoin' else 2,
                    'performance.day': rng.normal(0, 2) if keep(0.2) else np.nan,
                    'performance.week': rng.normal(0, 5) if keep(0.3) else np.nan,
                    'exchangePrices.binance': price * 1.001 if keep(0.4) else np.nan,
                    'exchangePrices.coinbase': price * 0.999 if keep(0.4) else np.nan,
                    'rsi': 50 + rng.normal(0, 10) if keep(0.2) else np.nan,
                    'macd': rng.normal(0, 1) if keep(0.25) else np.nan,
                    'transaction_count': 1000 + i * 5 + rng.normal(0, 100) if keep(0.3) else np.nan,
                    'stable': False,
                }
            else:
                # Stock data
                base_price = 150 if symbol == 'AAPL' else 2800
                price_trend = i * 0.5  # Modest upward trend
                price = base_price + price_trend + rng.normal(0, 5)

                record = {
                    'symbol': symbol,
                    'interval_timestamp': ts,
                    'close': price if keep(0.2) else np.nan,
                    'open': price * 0.995 if keep(0.2) else np.nan,
                    'high': price * 1.02 if keep(0.15) else np.nan,
                    'low': price * 0.98 if keep(0.15) else np.nan,
                    'volume': 1000000 + rng.normal(0, 100000) if keep(0.2) else np.nan,
                    'prev_close': price * 0.99 if keep(0.25) else np.nan,
                    'marketCapitalization': price * 15000000000 if keep(0.3) else np.nan,
                    'shareOutstanding': 15000000000 if keep(0.1) else np.nan,
                    'rsi': 50 + rng.normal(0, 15) if keep(0.2) else np.nan,
                    'macd': rng.normal(0, 0.5) if keep(0.25) else np.nan,
                    'news_sentiment_mean_x': 0.5 + rng.normal(0, 0.2) if keep(0.4) else np.nan,
                    # integers(low, high) excludes `high`, matching randint.
                    'buy': rng.integers(3, 8) if keep(0.3) else np.nan,
                    'hold': rng.integers(8, 15) if keep(0.3) else np.nan,
                    'sell': rng.integers(1, 4) if keep(0.3) else np.nan,
                }

            data.append(record)

    return pd.DataFrame(data)

def test_symbol_first_strategy():
    """Run FinalNullValueHandler over synthetic data and report the outcome.

    Generates the synthetic dataset, counts nulls overall and per symbol,
    passes the crypto rows to ``process_crypto_features`` and the stock rows
    to ``process_stock_features``, writes the processed rows back into the
    combined frame, and prints before/after statistics.

    Returns:
        True when no nulls and no infinite values remain in the combined
        frame and every processed asset group ended with fewer nulls than it
        started with; False otherwise.
    """
    banner = "=" * 70
    print(banner)
    print("TESTING ENHANCED SYMBOL-FIRST NULL HANDLING STRATEGY")
    print(banner)

    # Create realistic test data
    print("Creating realistic test data with temporal patterns...")
    df = create_realistic_test_data()

    # Snapshot the schema before processing so the dtype check below compares
    # against the real starting point instead of a regenerated random dataset.
    dtypes_before = df.dtypes.to_dict()

    print(f"Created dataset with {len(df)} rows and {len(df.columns)} columns")
    print(f"Symbols: {df['symbol'].unique()}")
    interval_times = pd.to_datetime(df['interval_timestamp'], unit='ms')
    print(f"Date range: {interval_times.min()} to {interval_times.max()}")

    # Analyze null patterns before processing
    print("\nNULL ANALYSIS BEFORE PROCESSING:")
    total_nulls_before = df.isnull().sum().sum()
    print(f"Total nulls: {total_nulls_before}")

    symbol_nulls_before = {}
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol]
        symbol_nulls = symbol_data.isnull().sum().sum()
        symbol_nulls_before[symbol] = symbol_nulls
        # Share of null cells among all cells (rows x columns) of this symbol.
        null_pct = symbol_nulls / len(symbol_data) / len(df.columns) * 100
        print(f"  {symbol}: {symbol_nulls} nulls ({null_pct:.1f}% of symbol data)")

    # Test the enhanced handler
    print("\nTESTING ENHANCED NULL HANDLER...")
    handler = FinalNullValueHandler()

    # Separate crypto and stock data for targeted processing
    asset_groups = [
        ('crypto', ['bitcoin', 'ethereum'], handler.process_crypto_features),
        ('stock', ['AAPL', 'GOOGL'], handler.process_stock_features),
    ]
    # Masks are built up-front, before any processed rows are written back.
    masks = {
        asset_type: df['symbol'].isin(group_symbols)
        for asset_type, group_symbols, _ in asset_groups
    }

    results = {}
    for asset_type, group_symbols, process in asset_groups:
        mask = masks[asset_type]
        if not mask.any():
            continue

        print(f"\nProcessing {asset_type} data ({mask.sum()} rows)...")
        subset = df[mask].copy()
        # Take the baseline BEFORE calling the handler: if the handler fills
        # its input in place, counting afterwards would understate it.
        nulls_before = subset.isnull().sum().sum()
        processed = process(subset)
        df.loc[mask] = processed

        results[asset_type] = {
            'nulls_before': nulls_before,
            'nulls_after': processed.isnull().sum().sum(),
            'symbols': group_symbols,
        }

    # Analyze results
    print("\nRESULTS ANALYSIS:")
    total_nulls_after = df.isnull().sum().sum()
    print(f"Total nulls after: {total_nulls_after} (reduced by {total_nulls_before - total_nulls_after})")

    for asset_type, result in results.items():
        nulls_filled = result['nulls_before'] - result['nulls_after']
        fill_rate = (nulls_filled / result['nulls_before'] * 100) if result['nulls_before'] > 0 else 0
        print(f"  {asset_type.upper()}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

    # Symbol-level analysis
    print("\nSYMBOL-LEVEL ANALYSIS:")
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol]
        nulls_after = symbol_data.isnull().sum().sum()
        nulls_filled = symbol_nulls_before[symbol] - nulls_after
        fill_rate = (nulls_filled / symbol_nulls_before[symbol] * 100) if symbol_nulls_before[symbol] > 0 else 0
        print(f"  {symbol}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

    # Quality checks
    print("\nQUALITY CHECKS:")
    infinite_values = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
    print(f"  Infinite values: {infinite_values}")
    # Compares column names and their dtypes against the pre-processing
    # snapshot (the previous check only compared column counts).
    print(f"  Data types preserved: {df.dtypes.to_dict() == dtypes_before}")

    # Test temporal interpolation effectiveness
    print("\nTEMPORAL INTERPOLATION TEST:")
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol].sort_values('interval_timestamp')
        if 'price' in symbol_data.columns:
            known_prices = symbol_data['price'].dropna()
            if len(known_prices) >= 2:
                # Mean absolute step between consecutive non-null prices;
                # reported for manual inspection only, not part of pass/fail.
                price_diff = known_prices.diff().abs().mean()
                print(f"  {symbol}: Average price change = {price_diff:.2f} (reasonable interpolation)")

    # Overall success assessment
    success = (total_nulls_after == 0 and
               infinite_values == 0 and
               all(result['nulls_after'] < result['nulls_before'] for result in results.values()))

    if success:
        print("\n✅ ENHANCED SYMBOL-FIRST STRATEGY TEST PASSED!")
        print("   - All nulls handled successfully")
        print("   - No infinite values introduced")
        print("   - Symbol-specific patterns preserved")
        print("   - Temporal interpolation working")
        return True
    else:
        print("\n❌ Test failed - review results above")
        return False

def main():
    """Run the strategy test and map its outcome to a process exit code.

    Returns:
        0 when the test reports success; 1 when it reports failure or raises.
    """
    try:
        passed = test_symbol_first_strategy()
    except Exception as e:
        # Top-level boundary: report the error with its traceback and signal
        # failure through the exit code instead of propagating.
        print(f"❌ Test failed with error: {str(e)}")
        import traceback
        traceback.print_exc()
        return 1
    else:
        if passed:
            return 0
        return 1

if __name__ == "__main__":
    # Hand main()'s status code straight to the shell.
    sys.exit(main())