File size: 9,783 Bytes
c49b21b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
#!/usr/bin/env python3
"""
Test script for the enhanced symbol-first null handling strategy
"""

import pandas as pd
import numpy as np
import sys
from pathlib import Path
import json

# Add the merge directory to path
sys.path.append(str(Path(__file__).parent.parent))

from final_null_handler import FinalNullValueHandler

def create_realistic_test_data(seed=None, records_per_symbol=100):
    """Create realistic test data with temporal patterns and symbol-specific characteristics.

    Builds hourly records for two crypto symbols (``bitcoin``, ``ethereum``)
    and two stock symbols (``AAPL``, ``GOOGL``). Each symbol gets an upward
    price trend with Gaussian noise, and most feature values are randomly
    replaced with NaN at a per-column rate so null handling can be exercised.

    Args:
        seed: Optional integer seed. When given, every random draw comes from
            a private ``numpy.random.RandomState(seed)`` so the output is
            reproducible. When ``None`` (default), draws come from numpy's
            global random state, exactly as before.
        records_per_symbol: How many of the leading hourly timestamps to use
            for each symbol (default 100). Values larger than the number of
            timestamps in the 2025-07-01..2025-07-30 window are capped by it.

    Returns:
        pandas.DataFrame with one row per (symbol, timestamp). Crypto and
        stock records carry different keys, so the frame holds the union of
        both column sets with NaN where a column does not apply to a row.

    Raises:
        ValueError: If ``records_per_symbol`` is negative.
    """
    if records_per_symbol < 0:
        raise ValueError("records_per_symbol must be non-negative")

    # Same call surface (normal / random_sample / randint) for both sources,
    # so the generation code below does not care which one is in use.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def keep(null_rate):
        """Return True when a value should be kept, False when it becomes NaN."""
        return rng.random_sample() > null_rate

    # Hourly timestamps; a Timedelta frequency avoids the deprecated 'H' alias.
    timestamps = pd.date_range(start='2025-07-01', end='2025-07-30', freq=pd.Timedelta(hours=1))
    # Epoch milliseconds via timedelta division, which does not depend on the
    # index's internal resolution (unlike casting to int64 and dividing).
    epoch = pd.Timestamp('1970-01-01')
    timestamp_ms = ((timestamps - epoch) // pd.Timedelta(milliseconds=1)).tolist()

    symbols = ['bitcoin', 'ethereum', 'AAPL', 'GOOGL']
    data = []

    for symbol in symbols:
        for i, ts in enumerate(timestamp_ms[:records_per_symbol]):

            if symbol in ['bitcoin', 'ethereum']:
                # Crypto data
                base_price = 50000 if symbol == 'bitcoin' else 3000
                price_trend = i * 10  # Upward trend
                price = base_price + price_trend + rng.normal(0, 500)

                record = {
                    'symbol': symbol,
                    'interval_timestamp': ts,
                    'price': price if keep(0.2) else np.nan,  # 20% nulls
                    'volume': price * 1000 + rng.normal(0, 100000) if keep(0.15) else np.nan,
                    'marketcap': price * 19000000 if keep(0.3) else np.nan,
                    'dominance': (0.4 if symbol == 'bitcoin' else 0.15) + rng.normal(0, 0.02) if keep(0.25) else np.nan,
                    'rank': 1 if symbol == 'bitcoin' else 2,
                    'performance.day': rng.normal(0, 2) if keep(0.2) else np.nan,
                    'performance.week': rng.normal(0, 5) if keep(0.3) else np.nan,
                    'exchangePrices.binance': price * 1.001 if keep(0.4) else np.nan,
                    'exchangePrices.coinbase': price * 0.999 if keep(0.4) else np.nan,
                    'rsi': 50 + rng.normal(0, 10) if keep(0.2) else np.nan,
                    'macd': rng.normal(0, 1) if keep(0.25) else np.nan,
                    'transaction_count': 1000 + i * 5 + rng.normal(0, 100) if keep(0.3) else np.nan,
                    'stable': False
                }
            else:
                # Stock data
                base_price = 150 if symbol == 'AAPL' else 2800
                price_trend = i * 0.5  # Modest upward trend
                price = base_price + price_trend + rng.normal(0, 5)

                record = {
                    'symbol': symbol,
                    'interval_timestamp': ts,
                    'close': price if keep(0.2) else np.nan,
                    'open': price * 0.995 if keep(0.2) else np.nan,
                    'high': price * 1.02 if keep(0.15) else np.nan,
                    'low': price * 0.98 if keep(0.15) else np.nan,
                    'volume': 1000000 + rng.normal(0, 100000) if keep(0.2) else np.nan,
                    'prev_close': price * 0.99 if keep(0.25) else np.nan,
                    'marketCapitalization': price * 15000000000 if keep(0.3) else np.nan,
                    'shareOutstanding': 15000000000 if keep(0.1) else np.nan,
                    'rsi': 50 + rng.normal(0, 15) if keep(0.2) else np.nan,
                    'macd': rng.normal(0, 0.5) if keep(0.25) else np.nan,
                    'news_sentiment_mean_x': 0.5 + rng.normal(0, 0.2) if keep(0.4) else np.nan,
                    'buy': rng.randint(3, 8) if keep(0.3) else np.nan,
                    'hold': rng.randint(8, 15) if keep(0.3) else np.nan,
                    'sell': rng.randint(1, 4) if keep(0.3) else np.nan,
                }

            data.append(record)

    return pd.DataFrame(data)

def _process_asset_group(df, mask, label, symbols, processor):
    """Run *processor* on the rows of *df* selected by *mask* and write the result back.

    Args:
        df: Full dataset; the selected rows are overwritten with the processed values.
        mask: Boolean row selector for this asset group.
        label: Lower-case group name used in the progress message.
        symbols: Symbols belonging to the group, echoed into the result.
        processor: Callable taking the selected rows and returning the processed frame.

    Returns:
        dict with ``nulls_before``, ``nulls_after`` and ``symbols``.
    """
    print(f"\nProcessing {label} data ({mask.sum()} rows)...")
    subset = df[mask].copy()
    # Count nulls before calling the processor so the figure stays correct
    # even if the processor fills the frame it is given in place.
    nulls_before = subset.isnull().sum().sum()
    processed = processor(subset)
    # Label-aligned assignment: only rows/columns already present in df are updated.
    df.loc[mask] = processed

    return {
        'nulls_before': nulls_before,
        'nulls_after': processed.isnull().sum().sum(),
        'symbols': symbols
    }


def test_symbol_first_strategy():
    """Test the symbol-first null handling strategy.

    Generates a synthetic dataset, runs the crypto and stock rows through
    ``FinalNullValueHandler`` separately, and prints null counts before and
    after (overall, per asset type and per symbol) together with basic
    quality checks.

    Returns:
        bool: True when no nulls and no infinite values remain and every
        processed asset group ended with fewer nulls than it started with;
        False otherwise.
    """
    print("="*70)
    print("TESTING ENHANCED SYMBOL-FIRST NULL HANDLING STRATEGY")
    print("="*70)

    # Create realistic test data
    print("Creating realistic test data with temporal patterns...")
    df = create_realistic_test_data()

    # Snapshot of the schema, used later to verify the handler kept the dtypes.
    dtypes_before = df.dtypes.to_dict()

    interval_times = pd.to_datetime(df['interval_timestamp'], unit='ms')
    print(f"Created dataset with {len(df)} rows and {len(df.columns)} columns")
    print(f"Symbols: {df['symbol'].unique()}")
    print(f"Date range: {interval_times.min()} to {interval_times.max()}")

    # Analyze null patterns before processing
    print("\nNULL ANALYSIS BEFORE PROCESSING:")
    total_nulls_before = df.isnull().sum().sum()
    print(f"Total nulls: {total_nulls_before}")

    symbol_nulls_before = {}
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol]
        symbol_nulls = symbol_data.isnull().sum().sum()
        symbol_nulls_before[symbol] = symbol_nulls
        print(f"  {symbol}: {symbol_nulls} nulls ({symbol_nulls/len(symbol_data)/len(df.columns)*100:.1f}% of symbol data)")

    # Test the enhanced handler
    print("\nTESTING ENHANCED NULL HANDLER...")
    handler = FinalNullValueHandler()

    # Separate crypto and stock data for targeted processing
    crypto_symbols = ['bitcoin', 'ethereum']
    stock_symbols = ['AAPL', 'GOOGL']
    crypto_mask = df['symbol'].isin(crypto_symbols)
    stock_mask = df['symbol'].isin(stock_symbols)

    results = {}

    if crypto_mask.any():
        results['crypto'] = _process_asset_group(
            df, crypto_mask, 'crypto', crypto_symbols, handler.process_crypto_features)

    if stock_mask.any():
        results['stock'] = _process_asset_group(
            df, stock_mask, 'stock', stock_symbols, handler.process_stock_features)

    # Analyze results
    print("\nRESULTS ANALYSIS:")
    total_nulls_after = df.isnull().sum().sum()
    print(f"Total nulls after: {total_nulls_after} (reduced by {total_nulls_before - total_nulls_after})")

    for asset_type, result in results.items():
        nulls_filled = result['nulls_before'] - result['nulls_after']
        fill_rate = (nulls_filled / result['nulls_before'] * 100) if result['nulls_before'] > 0 else 0
        print(f"  {asset_type.upper()}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

    # Symbol-level analysis
    print("\nSYMBOL-LEVEL ANALYSIS:")
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol]
        nulls_after = symbol_data.isnull().sum().sum()
        nulls_filled = symbol_nulls_before[symbol] - nulls_after
        fill_rate = (nulls_filled / symbol_nulls_before[symbol] * 100) if symbol_nulls_before[symbol] > 0 else 0
        print(f"  {symbol}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

    # Quality checks
    print("\nQUALITY CHECKS:")
    infinite_values = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
    print(f"  Infinite values: {infinite_values}")
    # Compare against the pre-processing snapshot (same columns, same dtype per
    # column) rather than merely counting the columns of a freshly generated frame.
    dtypes_preserved = df.dtypes.to_dict() == dtypes_before
    print(f"  Data types preserved: {dtypes_preserved}")

    # Test temporal interpolation effectiveness
    print("\nTEMPORAL INTERPOLATION TEST:")
    for symbol in df['symbol'].unique():
        symbol_data = df[df['symbol'] == symbol].sort_values('interval_timestamp')
        if 'price' in symbol_data.columns:
            known_prices = symbol_data['price'].dropna()
            if len(known_prices) >= 2:
                # Mean absolute step between consecutive known prices
                price_diff = known_prices.diff().abs().mean()
                print(f"  {symbol}: Average price change = {price_diff:.2f} (reasonable interpolation)")

    # Overall success assessment
    success = (total_nulls_after == 0 and
               infinite_values == 0 and
               all(result['nulls_after'] < result['nulls_before'] for result in results.values()))

    if success:
        print("\n✅ ENHANCED SYMBOL-FIRST STRATEGY TEST PASSED!")
        print("   - All nulls handled successfully")
        print("   - No infinite values introduced")
        print("   - Symbol-specific patterns preserved")
        print("   - Temporal interpolation working")
        return True
    else:
        print("\n❌ Test failed - review results above")
        return False

def main():
    """Run the strategy test and map its outcome to a process exit code.

    Returns:
        int: 0 when the test reports success; 1 when it reports failure or
        raises, in which case the error and its traceback are printed.
    """
    exit_code = 1  # pessimistic default; only a passing run clears it
    try:
        if test_symbol_first_strategy():
            exit_code = 0
    except Exception as e:
        print(f"❌ Test failed with error: {e!s}")
        import traceback
        traceback.print_exc()
    return exit_code

if __name__ == "__main__":
    # Script entry point: propagate main()'s result as the process exit status.
    sys.exit(main())