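"""Final null-value handler for merged crypto and stock feature tables.

Fills remaining nulls with a symbol-first strategy: temporal interpolation
within each symbol, then per-symbol historical statistics, then similar
symbols in the same asset class, and finally intelligent defaults with
symbol-specific variation. Run directly to process the crypto and stock
feature parquet files under data/merged/features/.
"""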
import pandas as pd
import numpy as np
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

class FinalNullValueHandler:
    """
    Advanced final null value handler with symbol-first temporal interpolation.
    
    Strategy Priority:
    1. Same symbol, nearby timestamps (interpolation/extrapolation)
    2. Same symbol, historical mean/median
    3. Similar symbols (same asset class)
    4. Global defaults with symbol-specific variation
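
    Example (illustrative sketch; assumes `df` contains 'symbol' and
    'interval_timestamp' columns):

        handler = FinalNullValueHandler()
        filled_df = handler.process_crypto_features(df)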
    """
    
    def __init__(self):
        self.crypto_column_defaults = self._define_crypto_defaults()
        self.stock_column_defaults = self._define_stock_defaults()
        self.symbol_profiles = {}
        self.symbol_stats = {}  # Historical statistics per symbol
        
    def _analyze_symbol_statistics(self, df):
        """Analyze historical statistics for each symbol to guide intelligent filling"""
        stats = {}
        
        # Sort by timestamp for proper temporal analysis
        if 'interval_timestamp' in df.columns:
            df_sorted = df.sort_values(['symbol', 'interval_timestamp'])
        else:
            df_sorted = df.sort_values('symbol')
        
        for symbol in df['symbol'].unique():
            symbol_data = df_sorted[df_sorted['symbol'] == symbol].copy()
            
            symbol_stats = {
                'symbol': symbol,
                'total_records': len(symbol_data),
                'date_range': None,
                'typical_values': {},
                'volatility': {},
                'trends': {},
                'seasonal_patterns': {}
            }
            
            # Calculate date range if timestamp available
            if 'interval_timestamp' in symbol_data.columns:
                timestamps = pd.to_datetime(symbol_data['interval_timestamp'], unit='ms')
                symbol_stats['date_range'] = {
                    'start': timestamps.min(),
                    'end': timestamps.max(),
                    'duration_days': (timestamps.max() - timestamps.min()).days
                }
            
            # Calculate typical values, volatility, and trends for numerical columns
            numerical_cols = symbol_data.select_dtypes(include=[np.number]).columns
            for col in numerical_cols:
                if col in ['interval_timestamp', 'backup_id']:
                    continue
                
                col_data = symbol_data[col].dropna()
                if len(col_data) > 0:
                    symbol_stats['typical_values'][col] = {
                        'mean': col_data.mean(),
                        'median': col_data.median(),
                        'std': col_data.std(),
                        'min': col_data.min(),
                        'max': col_data.max(),
                        'q25': col_data.quantile(0.25),
                        'q75': col_data.quantile(0.75),
                        'recent_mean': col_data.tail(min(10, len(col_data))).mean(),  # Last 10 values
                        'data_points': len(col_data)
                    }
                    
                    # Volatility as coefficient of variation (std/mean); epsilon avoids division by zero
                    if len(col_data) > 1:
                        symbol_stats['volatility'][col] = col_data.std() / (col_data.mean() + 1e-8)
                    
                    # Calculate trend if we have timestamp data
                    if 'interval_timestamp' in symbol_data.columns and len(col_data) >= 3:
                        # Simple linear trend
                        valid_rows = symbol_data[col].notna()
                        if valid_rows.sum() >= 3:
                            x = np.arange(len(symbol_data[valid_rows]))
                            y = symbol_data.loc[valid_rows, col].values
                            try:
                                trend_slope = np.polyfit(x, y, 1)[0]
                                symbol_stats['trends'][col] = trend_slope
                            except Exception:  # np.polyfit can fail on degenerate data
                                symbol_stats['trends'][col] = 0
            
            stats[symbol] = symbol_stats
        
        return stats
    
    def _temporal_interpolation_fill(self, df, symbol, column):
        """
        Fill nulls using temporal interpolation within the same symbol
        
        Priority:
        1. Linear interpolation between known values
        2. Forward fill from last known value
        3. Backward fill from next known value
        4. Trend-based extrapolation from per-symbol historical statistics
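
        Illustrative example (assumed input): for a column holding
        [NaN, 1.0, NaN, 3.0] in timestamp order, step 1 interpolates the
        interior gap to 2.0 and step 3 back-fills the leading NaN to 1.0.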
        """
        try:
            symbol_mask = df['symbol'] == symbol
            symbol_data = df.loc[symbol_mask].copy()
            
            if column not in symbol_data.columns or symbol_data[column].notna().sum() == 0:
                return None
            
            # Sort by timestamp if available and remove duplicates
            if 'interval_timestamp' in symbol_data.columns:
                symbol_data = symbol_data.sort_values('interval_timestamp')
                # Drop duplicate timestamps for this symbol to avoid reindex issues
                symbol_data = symbol_data.drop_duplicates(subset=['interval_timestamp'], keep='first')
            
            # Reset index to avoid any index issues
            symbol_data = symbol_data.reset_index(drop=True)
            filled_series = symbol_data[column].copy()
            
            # 1. Linear interpolation (works best with timestamp ordering)
            if 'interval_timestamp' in symbol_data.columns and len(symbol_data) > 1:
                # Try time-based interpolation with safe fallback
                try:
                    original_index = filled_series.index
                    datetime_index = pd.to_datetime(symbol_data['interval_timestamp'], unit='ms')
                    
                    # Ensure unique datetime index
                    if datetime_index.duplicated().any():
                        # Add microseconds to make unique
                        for i, is_dup in enumerate(datetime_index.duplicated(keep='first')):
                            if is_dup:
                                datetime_index.iloc[i] += pd.Timedelta(microseconds=i+1)
                    
                    filled_series.index = datetime_index
                    filled_series = filled_series.interpolate(method='time')
                    filled_series.index = original_index  # Restore original index
                except Exception:
                    # Fallback to linear interpolation if time interpolation fails
                    filled_series = filled_series.interpolate(method='linear')
            else:
                filled_series = filled_series.interpolate(method='linear')
            
            # 2. Forward fill
            filled_series = filled_series.ffill()
            
            # 3. Backward fill  
            filled_series = filled_series.bfill()
            
            # 4. If still has nulls, use trend extrapolation
            if filled_series.isna().any() and symbol in self.symbol_stats:
                symbol_stat = self.symbol_stats[symbol]
                if column in symbol_stat.get('typical_values', {}):
                    typical_val = symbol_stat['typical_values'][column]['recent_mean']
                    trend = symbol_stat.get('trends', {}).get(column, 0)
                    
                    # Apply trend-based extrapolation for remaining nulls
                    for idx in filled_series[filled_series.isna()].index:
                        # Simple trend continuation
                        filled_series.loc[idx] = typical_val + trend * (idx % 10)  # damped trend continuation
            
            return filled_series
            
        except Exception as e:
            # If all else fails, return None to trigger fallback behavior
            print(f"Warning: Temporal interpolation failed for {symbol} {column}: {e}")
            return None
    
    def _similar_symbol_fill(self, df, symbol, column, asset_type):
        """
        Fill nulls using similar symbols in the same asset class
        """
        if asset_type == 'crypto':
            # For crypto, use symbols with similar rank or market cap
            target_stats = self.symbol_stats.get(symbol, {})
            target_rank = target_stats.get('typical_values', {}).get('rank', {}).get('median', 999)
            
            similar_symbols = []
            for sym, stats in self.symbol_stats.items():
                if sym == symbol:
                    continue
                    
                sym_rank = stats.get('typical_values', {}).get('rank', {}).get('median', 999)
                if abs(sym_rank - target_rank) <= 50:  # Similar rank range
                    similar_symbols.append(sym)
            
        else:  # stock
            # For stocks, use symbols with similar market cap or sector
            target_stats = self.symbol_stats.get(symbol, {})
            target_mcap = target_stats.get('typical_values', {}).get('marketCapitalization', {}).get('median', 0)
            
            similar_symbols = []
            for sym, stats in self.symbol_stats.items():
                if sym == symbol:
                    continue
                    
                sym_mcap = stats.get('typical_values', {}).get('marketCapitalization', {}).get('median', 0)
                if target_mcap > 0 and sym_mcap > 0:
                    ratio = max(sym_mcap, target_mcap) / min(sym_mcap, target_mcap)
                    if ratio <= 5:  # Within 5x market cap
                        similar_symbols.append(sym)
        
        if not similar_symbols:
            return None
        
        # Get values from similar symbols
        similar_data = df[df['symbol'].isin(similar_symbols)][column].dropna()
        if len(similar_data) > 0:
            # Use the median across similar symbols: a robust central tendency
            return similar_data.median()
        
        return None
    
    def _intelligent_symbol_fill(self, df, symbol, column):
        """
        Intelligent filling strategy prioritizing symbol-specific data
        
        Returns the best estimate for null values in the specified column for the given symbol
        """
        # Strategy 1: Temporal interpolation within same symbol
        temporal_result = self._temporal_interpolation_fill(df, symbol, column)
        if temporal_result is not None and temporal_result.notna().any():
            return temporal_result
        
        # Strategy 2: Use historical statistics from same symbol
        if symbol in self.symbol_stats and column in self.symbol_stats[symbol]['typical_values']:
            stats = self.symbol_stats[symbol]['typical_values'][column]
            
            # Choose appropriate central tendency based on data characteristics
            if stats['data_points'] >= 10:
                # Use recent mean for frequently updated data
                return stats['recent_mean']
            elif stats['data_points'] >= 3:
                # Use median for small datasets (more robust)
                return stats['median']
            else:
                # Use mean for very small datasets
                return stats['mean']
        
        # Strategy 3: Use similar symbols
        asset_type = 'crypto' if any(
            col in df.columns for col in ['rank', 'dominance', 'performance.day']
        ) else 'stock'
        
        similar_fill = self._similar_symbol_fill(df, symbol, column, asset_type)
        if similar_fill is not None:
            return similar_fill
        
        # Strategy 4: Global fallback with symbol variation
        return None  # Will be handled by existing default logic
        
    def _define_crypto_defaults(self):
        """Define intelligent defaults for crypto-specific columns"""
        return {
            # Crypto market data
            'dominance': 0.001,  # Very small dominance for minor cryptos
            'rank': 999,  # Low rank for unknown cryptos
            'stable': 0,  # Most cryptos are not stablecoins (use 0 instead of False)
            'marketcap': 1000000,  # $1M default market cap
            'transaction_count': 100,  # Minimal transaction count
            'transaction_volume': 10000,  # Minimal transaction volume
            'tx_price_correlation': 0.5,  # Neutral correlation
            
            # Exchange prices (use main price as baseline)
            'exchangePrices.binance': None,  # Will be filled with main price
            'exchangePrices.coinbase': None,
            'exchangePrices.kraken': None,
            'exchangePrices.bybit': None,
            'exchangePrices.kucoin': None,
            'exchangePrices.okx': None,
            'exchangePrices.mexc': None,
            'exchangePrices.gateio': None,
            'exchangePrices.bitget': None,
            'exchangePrices.bitmart': None,
            'exchangePrices.bingx': None,
            'exchangePrices.cryptocom': None,
            
            # Exchange symbols (use main symbol as baseline)
            'symbols.binance': None,  # Will be filled with main symbol
            'symbols.coinbase': None,
            'symbols.kraken': None,
            'symbols.bybit': None,
            'symbols.kucoin': None,
            'symbols.okx': None,
            'symbols.mexc': None,
            'symbols.gateio': None,
            'symbols.bitget': None,
            'symbols.bitmart': None,
            'symbols.bingx': None,
            'symbols.cryptocom': None,
            
            # Performance metrics (neutral/small changes)
            'performance.day': 0.0,
            'performance.hour': 0.0,
            'performance.hour4': 0.0,
            'performance.min1': 0.0,
            'performance.min15': 0.0,
            'performance.min5': 0.0,
            'performance.month': 0.0,
            'performance.month3': 0.0,
            'performance.week': 0.0,
            'performance.year': 0.0,
            
            # Rank differences (no change)
            'rankDiffs.day': 0,
            'rankDiffs.hour': 0,
            'rankDiffs.hour4': 0,
            'rankDiffs.min1': 0,
            'rankDiffs.min15': 0,
            'rankDiffs.min5': 0,
            'rankDiffs.month': 0,
            'rankDiffs.month3': 0,
            'rankDiffs.week': 0,
            'rankDiffs.year': 0,
            
            # Technical indicators
            'bb_width': 0.02,  # Small bollinger band width
            'cg_id': None,  # Will be derived from symbol
        }
    
    def _define_stock_defaults(self):
        """Define intelligent defaults for stock-specific columns"""
        return {
            # Stock market data
            'stock_market': 'NASDAQ',  # Default market
            'marketCapitalization': 1000000000,  # $1B default
            'shareOutstanding': 100000000,  # 100M shares default
            'mspr': 0,  # Neutral momentum
            
            # News and sentiment data
            'news_activity_score_x': 0,
            'news_activity_score_y': 0,
            'news_articles_count_x': 0,
            'news_articles_count_y': 0,
            'news_highlights_count_x': 0,
            'news_highlights_count_y': 0,
            'news_match_score_max_x': 0,
            'news_match_score_max_y': 0,
            'news_match_score_mean_x': 0,
            'news_match_score_mean_y': 0,
            'news_mentions_count_x': 0,
            'news_mentions_count_y': 0,
            'news_sentiment_max_x': 0.5,  # Neutral sentiment
            'news_sentiment_max_y': 0.5,
            'news_sentiment_mean_x': 0.5,
            'news_sentiment_mean_y': 0.5,
            'news_sentiment_min_x': 0.5,
            'news_sentiment_min_y': 0.5,
            'news_sentiment_range_x': 0,
            'news_sentiment_range_y': 0,
            'news_sentiment_std': 0,
            'news_sentiment_std_x': 0,
            'news_sentiment_std_y': 0,
            
            # Analyst ratings
            'buy': 5,  # Moderate buy recommendations
            'hold': 10,  # More hold recommendations
            'sell': 2,  # Few sell recommendations
            'strongBuy': 3,
            'strongSell': 1,
            
            # Technical indicators
            'volume_price_momentum': 0.0,  # Neutral momentum
        }
    
    def _create_symbol_profiles(self, df):
        """Create profiles for each symbol to guide intelligent filling"""
        profiles = {}
        
        for symbol in df['symbol'].unique():
            symbol_data = df[df['symbol'] == symbol]
            
            # Determine if it's crypto or stock
            is_crypto = 'rank' in symbol_data.columns and symbol_data['rank'].notna().any()
            if not is_crypto:
                is_crypto = any(col.startswith('performance.') for col in symbol_data.columns)
            
            # Calculate key statistics
            profile = {
                'symbol': symbol,
                'is_crypto': is_crypto,
                'total_records': len(symbol_data),
                'data_density': symbol_data.notna().mean().mean(),
                'has_price_data': 'price' in symbol_data.columns and symbol_data['price'].notna().any(),
                'typical_price': symbol_data.get('price', pd.Series([100])).median(),
                'typical_volume': symbol_data.get('volume', pd.Series([1000000])).median(),
                'typical_marketcap': symbol_data.get('marketcap', symbol_data.get('marketCapitalization', pd.Series([1000000000]))).median()
            }
            
            profiles[symbol] = profile
            
        return profiles
    
    def _intelligent_fill_value(self, df, symbol, column, default_value):
        """Generate intelligent fill value based on symbol context"""
        profile = self.symbol_profiles.get(symbol, {})
        
        # Add symbol-specific variation to prevent homogenization
        symbol_hash = hash(f"{symbol}_{column}") % 1000
        variation_factor = (symbol_hash / 1000.0 - 0.5) * 0.1  # ±5% variation
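        # Python's str hash is salted per interpreter run unless PYTHONHASHSEED
        # is fixed, so this variation is stable within a run but not across runs.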
        
        if default_value is None:
            return None
        elif isinstance(default_value, (int, float)):
            if default_value == 0:
                return 0  # Keep zeros as zeros
            else:
                return default_value * (1 + variation_factor)
        else:
            return default_value
    
    def _fill_exchange_prices_advanced(self, df):
        """Advanced exchange price filling using symbol-first strategy"""
        exchange_price_cols = [col for col in df.columns if col.startswith('exchangePrices.')]
        
        if not exchange_price_cols or 'price' not in df.columns:
            return df
        
        df_result = df.copy()
        
        for symbol in df['symbol'].unique():
            symbol_mask = df['symbol'] == symbol
            symbol_data = df.loc[symbol_mask]
            
            # First try to get main price from symbol's own data
            main_price_series = self._intelligent_symbol_fill(df, symbol, 'price')
            if main_price_series is None or (isinstance(main_price_series, pd.Series) and main_price_series.isna().all()):
                continue
            
            if isinstance(main_price_series, pd.Series):
                main_price = main_price_series.median()
            else:
                main_price = main_price_series
            
            if pd.isna(main_price):
                continue
            
            # Fill exchange prices for this symbol
            for exchange_col in exchange_price_cols:
                if symbol_data[exchange_col].isna().any():
                    # First try temporal interpolation for this exchange
                    exchange_filled = self._intelligent_symbol_fill(df, symbol, exchange_col)
                    
                    if exchange_filled is not None:
                        if isinstance(exchange_filled, pd.Series):
                            df_result.loc[symbol_mask, exchange_col] = exchange_filled
                        else:
                            null_mask = df_result[exchange_col].isna()
                            df_result.loc[symbol_mask & null_mask, exchange_col] = exchange_filled
                    else:
                        # Fallback: use main price with small exchange-specific variation
                        exchange_hash = hash(f"{symbol}_{exchange_col}") % 100
                        variation = (exchange_hash / 100.0 - 0.5) * 0.01  # ±0.5%
                        exchange_price = main_price * (1 + variation)
                        null_mask = df_result[exchange_col].isna()
                        df_result.loc[symbol_mask & null_mask, exchange_col] = exchange_price
        
        return df_result
    
    def _fill_exchange_symbols(self, df):
        """Fill exchange symbols with main symbol + exchange-specific formatting"""
        exchange_symbol_cols = [col for col in df.columns if col.startswith('symbols.')]
        
        if not exchange_symbol_cols or 'symbol' not in df.columns:
            return df
        
        df_result = df.copy()
        
        # Exchange-specific symbol formatting
        exchange_formats = {
            'symbols.binance': lambda s: f"{s.upper()}USDT" if s.lower() != 'bitcoin' else "BTCUSDT",
            'symbols.coinbase': lambda s: f"{s.upper()}-USD",
            'symbols.kraken': lambda s: f"{s.upper()}USD" if len(s) <= 3 else f"{s.upper()}/USD",
            'symbols.bybit': lambda s: f"{s.upper()}USDT",
            'symbols.kucoin': lambda s: f"{s.upper()}-USDT",
            'symbols.okx': lambda s: f"{s.upper()}-USDT",
            'symbols.mexc': lambda s: f"{s.upper()}_USDT",
            'symbols.gateio': lambda s: f"{s.upper()}_USDT",
            'symbols.bitget': lambda s: f"{s.upper()}USDT",
            'symbols.bitmart': lambda s: f"{s.upper()}_USDT",
            'symbols.bingx': lambda s: f"{s.upper()}-USDT",
            'symbols.cryptocom': lambda s: f"{s.upper()}_USDT"
        }
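        # e.g. 'eth' maps to 'ETHUSDT' on Binance, 'ETH-USD' on Coinbase,
        # and 'ETH_USDT' on MEXC.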
        
        for symbol in df['symbol'].unique():
            symbol_mask = df['symbol'] == symbol
            
            for exchange_col in exchange_symbol_cols:
                if df.loc[symbol_mask, exchange_col].isna().all():
                    formatter = exchange_formats.get(exchange_col, lambda s: s.upper())
                    try:
                        exchange_symbol = formatter(symbol)
                        df_result.loc[symbol_mask, exchange_col] = exchange_symbol
                    except Exception:
                        df_result.loc[symbol_mask, exchange_col] = symbol.upper()
        
        return df_result
    
    def _fill_cg_id(self, df):
        """Fill CoinGecko ID based on symbol"""
        if 'cg_id' not in df.columns:
            return df
        
        df_result = df.copy()
        
        # Common CoinGecko ID mappings
        cg_id_mapping = {
            'bitcoin': 'bitcoin',
            'btc': 'bitcoin',
            'ethereum': 'ethereum',
            'eth': 'ethereum',
            'binancecoin': 'binancecoin',
            'bnb': 'binancecoin',
            'cardano': 'cardano',
            'ada': 'cardano',
            'solana': 'solana',
            'sol': 'solana',
            'xrp': 'ripple',
            'ripple': 'ripple',
            'dogecoin': 'dogecoin',
            'doge': 'dogecoin',
            'polkadot': 'polkadot',
            'dot': 'polkadot',
            'avalanche-2': 'avalanche-2',
            'avax': 'avalanche-2',
            'chainlink': 'chainlink',
            'link': 'chainlink',
            'polygon': 'matic-network',
            'matic': 'matic-network',
            'litecoin': 'litecoin',
            'ltc': 'litecoin',
            'uniswap': 'uniswap',
            'uni': 'uniswap'
        }
        
        for symbol in df['symbol'].unique():
            symbol_mask = df['symbol'] == symbol
            
            if df.loc[symbol_mask, 'cg_id'].isna().all():
                cg_id = cg_id_mapping.get(symbol.lower(), symbol.lower())
                df_result.loc[symbol_mask, 'cg_id'] = cg_id
        
        return df_result
    
    def process_crypto_features(self, df):
        """Process crypto features with advanced symbol-first null handling"""
        print("Processing crypto features with symbol-first strategy...")
        df_result = df.copy()
        
        # Step 1: Analyze symbol statistics for intelligent filling
        print("Analyzing symbol statistics...")
        self.symbol_stats = self._analyze_symbol_statistics(df_result)
        print(f"Analyzed {len(self.symbol_stats)} symbols")
        
        # Step 2: Create symbol profiles
        self.symbol_profiles = self._create_symbol_profiles(df_result)
        
        # Step 3: Symbol-first null handling for key columns
        priority_columns = [
            'price', 'volume', 'marketcap', 'dominance', 'rank',
            'performance.day', 'performance.week', 'performance.month',
            'rsi', 'macd', 'transaction_count', 'transaction_volume'
        ]
        
        for column in priority_columns:
            if column in df_result.columns and df_result[column].isna().any():
                print(f"Processing {column} with symbol-first strategy...")
                
                for symbol in df_result['symbol'].unique():
                    symbol_mask = df_result['symbol'] == symbol
                    null_mask = df_result[column].isna()
                    fill_mask = symbol_mask & null_mask
                    
                    if fill_mask.any():
                        # Use intelligent symbol-first filling
                        fill_result = self._intelligent_symbol_fill(df_result, symbol, column)
                        
                        if fill_result is not None:
                            if isinstance(fill_result, pd.Series):
                                # If we got a series back (from temporal interpolation)
                                # Make sure the series aligns with the symbol mask
                                symbol_indices = df_result[symbol_mask].index
                                if len(fill_result) == len(symbol_indices):
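                                    # NOTE: assumes the symbol's rows are stored in
                                    # timestamp order, since the interpolated series
                                    # comes back sorted by time.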
                                    # Map the series values to the correct indices
                                    for i, idx in enumerate(symbol_indices):
                                        if pd.notna(fill_result.iloc[i]):
                                            df_result.loc[idx, column] = fill_result.iloc[i]
                                else:
                                    # Fallback: use median of the series
                                    fill_value = fill_result.median()
                                    if pd.notna(fill_value):
                                        df_result.loc[fill_mask, column] = fill_value
                            else:
                                # If we got a scalar value
                                df_result.loc[fill_mask, column] = fill_result
        
        # Step 4: Handle exchange prices with cross-reference to main price
        df_result = self._fill_exchange_prices_advanced(df_result)
        
        # Step 5: Handle exchange symbols with proper formatting
        df_result = self._fill_exchange_symbols(df_result)
        
        # Step 6: Handle CoinGecko IDs
        df_result = self._fill_cg_id(df_result)
        
        # Step 7: Fill remaining columns with intelligent defaults
        for column in df_result.columns:
            if df_result[column].isna().any():
                default_value = self.crypto_column_defaults.get(column)
                
                if default_value is not None:
                    for symbol in df_result['symbol'].unique():
                        symbol_mask = df_result['symbol'] == symbol
                        null_mask = df_result[column].isna()
                        fill_mask = symbol_mask & null_mask
                        
                        if fill_mask.any():
                            try:
                                fill_value = self._intelligent_fill_value(
                                    df_result, symbol, column, default_value
                                )
                                df_result.loc[fill_mask, column] = fill_value
                            except Exception as e:
                                print(f"Warning: Failed to fill {column} for {symbol}: {e}")
                                # Skip this column for this symbol
                                continue
        
        return df_result
    
    def process_stock_features(self, df):
        """Process stock features with advanced symbol-first null handling"""
        print("Processing stock features with symbol-first strategy...")
        df_result = df.copy()
        
        # Step 1: Analyze symbol statistics for intelligent filling
        print("Analyzing symbol statistics...")
        self.symbol_stats = self._analyze_symbol_statistics(df_result)
        print(f"Analyzed {len(self.symbol_stats)} symbols")
        
        # Step 2: Create symbol profiles
        self.symbol_profiles = self._create_symbol_profiles(df_result)
        
        # Step 3: Symbol-first null handling for key columns
        priority_columns = [
            'close', 'open', 'high', 'low', 'volume', 'prev_close',
            'marketCapitalization', 'shareOutstanding',
            'rsi', 'macd', 'atr', 'bb_position',
            'news_sentiment_mean_x', 'news_sentiment_mean_y',
            'buy', 'sell', 'hold', 'strongBuy', 'strongSell'
        ]
        
        for column in priority_columns:
            if column in df_result.columns and df_result[column].isna().any():
                print(f"Processing {column} with symbol-first strategy...")
                
                for symbol in df_result['symbol'].unique():
                    symbol_mask = df_result['symbol'] == symbol
                    null_mask = df_result[column].isna()
                    fill_mask = symbol_mask & null_mask
                    
                    if fill_mask.any():
                        # Use intelligent symbol-first filling
                        fill_result = self._intelligent_symbol_fill(df_result, symbol, column)
                        
                        if fill_result is not None:
                            if isinstance(fill_result, pd.Series):
                                # If we got a series back (from temporal interpolation)
                                # Make sure the series aligns with the symbol mask
                                symbol_indices = df_result[symbol_mask].index
                                if len(fill_result) == len(symbol_indices):
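                                    # NOTE: assumes the symbol's rows are stored in
                                    # timestamp order, since the interpolated series
                                    # comes back sorted by time.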
                                    # Map the series values to the correct indices
                                    for i, idx in enumerate(symbol_indices):
                                        if pd.notna(fill_result.iloc[i]):
                                            df_result.loc[idx, column] = fill_result.iloc[i]
                                else:
                                    # Fallback: use median of the series
                                    fill_value = fill_result.median()
                                    if pd.notna(fill_value):
                                        df_result.loc[fill_mask, column] = fill_value
                            else:
                                # If we got a scalar value
                                df_result.loc[fill_mask, column] = fill_result
        
        # Step 4: Fill remaining columns with intelligent defaults
        for column in df_result.columns:
            if df_result[column].isna().any():
                default_value = self.stock_column_defaults.get(column)
                
                if default_value is not None:
                    for symbol in df_result['symbol'].unique():
                        symbol_mask = df_result['symbol'] == symbol
                        null_mask = df_result[column].isna()
                        fill_mask = symbol_mask & null_mask
                        
                        if fill_mask.any():
                            try:
                                fill_value = self._intelligent_fill_value(
                                    df_result, symbol, column, default_value
                                )
                                df_result.loc[fill_mask, column] = fill_value
                            except Exception as e:
                                print(f"Warning: Failed to fill {column} for {symbol}: {e}")
                                # Skip this column for this symbol
                                continue
        
        return df_result
    
    def generate_report(self, df_before, df_after, feature_type):
        """Generate a comprehensive report of null value handling with symbol-first strategy details"""
        before_nulls = df_before.isnull().sum()
        after_nulls = df_after.isnull().sum()
        
        null_reduction = before_nulls - after_nulls
        columns_fixed = null_reduction[null_reduction > 0]
        
        # Analyze symbol coverage
        symbol_analysis = {}
        if 'symbol' in df_before.columns:
            for symbol in df_before['symbol'].unique():
                symbol_before = int(df_before[df_before['symbol'] == symbol].isnull().sum().sum())
                symbol_after = int(df_after[df_after['symbol'] == symbol].isnull().sum().sum())
                symbol_analysis[symbol] = {
                    'nulls_before': symbol_before,
                    'nulls_after': symbol_after,
                    'nulls_filled': symbol_before - symbol_after,
                    'records': int(len(df_before[df_before['symbol'] == symbol]))
                }
        
        # Analyze temporal coverage if timestamp available
        temporal_analysis = {}
        if 'interval_timestamp' in df_before.columns:
            df_before_ts = df_before.copy()
            df_after_ts = df_after.copy()
            df_before_ts['date'] = pd.to_datetime(df_before_ts['interval_timestamp'], unit='ms').dt.date
            df_after_ts['date'] = pd.to_datetime(df_after_ts['interval_timestamp'], unit='ms').dt.date
            
            for date in df_before_ts['date'].unique():
                date_before = int(df_before_ts[df_before_ts['date'] == date].isnull().sum().sum())
                date_after = int(df_after_ts[df_after_ts['date'] == date].isnull().sum().sum())
                temporal_analysis[str(date)] = {
                    'nulls_before': date_before,
                    'nulls_after': date_after,
                    'nulls_filled': date_before - date_after
                }
        
        report = {
            'feature_type': feature_type,
            'timestamp': pd.Timestamp.now().isoformat(),
            'strategy': 'symbol-first-temporal-interpolation',
            'total_rows': int(len(df_after)),
            'total_columns': int(len(df_after.columns)),
            'unique_symbols': int(len(df_after['symbol'].unique())) if 'symbol' in df_after.columns else 0,
            'columns_with_nulls_before': int((before_nulls > 0).sum()),
            'columns_with_nulls_after': int((after_nulls > 0).sum()),
            'total_nulls_before': int(before_nulls.sum()),
            'total_nulls_after': int(after_nulls.sum()),
            'total_nulls_filled': int(null_reduction.sum()),
            'columns_fixed': int(len(columns_fixed)),
            'null_reduction_rate': float((null_reduction.sum() / before_nulls.sum()) if before_nulls.sum() > 0 else 0),
            'remaining_null_columns': {str(k): int(v) for k, v in after_nulls[after_nulls > 0].to_dict().items()},
            'fixed_columns_detail': {str(k): int(v) for k, v in null_reduction[null_reduction > 0].to_dict().items()},
            'symbol_analysis': symbol_analysis,
            'temporal_analysis': temporal_analysis,
            'strategy_details': {
                'symbol_stats_analyzed': len(self.symbol_stats),
                'temporal_interpolation_used': True,
                'similar_symbol_fallback': True,
                'intelligent_defaults': True
            }
        }
        
        return report


def process_crypto_features_file(input_path, output_path=None):
    """Process crypto features file"""
    if output_path is None:
        output_path = input_path
    
    print(f"Loading crypto features from {input_path}...")
    df = pd.read_parquet(input_path)
    
    print(f"Loaded {len(df)} rows with {len(df.columns)} columns")
    print(f"Null values before processing: {df.isnull().sum().sum()}")
    
    handler = FinalNullValueHandler()
    df_processed = handler.process_crypto_features(df)
    
    print(f"Null values after processing: {df_processed.isnull().sum().sum()}")
    
    # Generate report
    report = handler.generate_report(df, df_processed, 'crypto')
    
    # Save processed data
    df_processed.to_parquet(output_path, index=False)
    print(f"Saved processed crypto features to {output_path}")
    
    # Save report
    report_path = str(output_path).replace('.parquet', '_null_handling_report.json')
    with open(report_path, 'w') as f:
        json.dump(report, f, indent=2)
    print(f"Saved processing report to {report_path}")
    
    return df_processed, report
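
# Example usage (illustrative sketch; the cleaned output path is hypothetical):
#   df_clean, report = process_crypto_features_file(
#       "data/merged/features/crypto_features.parquet",
#       output_path="data/cleaned/crypto_features.parquet",
#   )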


def process_stock_features_file(input_path, output_path=None):
    """Process stock features file"""
    if output_path is None:
        output_path = input_path
    
    print(f"Loading stock features from {input_path}...")
    df = pd.read_parquet(input_path)
    
    print(f"Loaded {len(df)} rows with {len(df.columns)} columns")
    print(f"Null values before processing: {df.isnull().sum().sum()}")
    
    handler = FinalNullValueHandler()
    df_processed = handler.process_stock_features(df)
    
    print(f"Null values after processing: {df_processed.isnull().sum().sum()}")
    
    # Generate report
    report = handler.generate_report(df, df_processed, 'stock')
    
    # Save processed data
    df_processed.to_parquet(output_path, index=False)
    print(f"Saved processed stock features to {output_path}")
    
    # Save report
    report_path = str(output_path).replace('.parquet', '_null_handling_report.json')
    with open(report_path, 'w') as f:
        json.dump(report, f, indent=2)
    print(f"Saved processing report to {report_path}")
    
    return df_processed, report


def main():
    """Main function to process both crypto and stock features"""
    crypto_path = Path("data/merged/features/crypto_features.parquet")
    stocks_path = Path("data/merged/features/stocks_features.parquet")
    
    processed_files = []
    
    # Process crypto features
    if crypto_path.exists():
        try:
            df_crypto, report_crypto = process_crypto_features_file(crypto_path)
            processed_files.append(('crypto', crypto_path, report_crypto))
            print(f"βœ“ Crypto features processed: {report_crypto['total_nulls_filled']} nulls filled")
        except Exception as e:
            print(f"βœ— Error processing crypto features: {e}")
    else:
        print(f"Warning: {crypto_path} not found")
    
    # Process stock features
    if stocks_path.exists():
        try:
            df_stocks, report_stocks = process_stock_features_file(stocks_path)
            processed_files.append(('stocks', stocks_path, report_stocks))
            print(f"βœ“ Stock features processed: {report_stocks['total_nulls_filled']} nulls filled")
        except Exception as e:
            print(f"βœ— Error processing stock features: {e}")
    else:
        print(f"Warning: {stocks_path} not found")
    
    # Summary report
    if processed_files:
        print("\n" + "="*60)
        print("FINAL NULL VALUE HANDLING SUMMARY")
        print("="*60)
        
        total_nulls_filled = 0
        for file_type, file_path, report in processed_files:
            total_nulls_filled += report['total_nulls_filled']
            print(f"\n{file_type.upper()} FEATURES:")
            print(f"  File: {file_path}")
            print(f"  Rows: {report['total_rows']:,}")
            print(f"  Columns: {report['total_columns']}")
            print(f"  Nulls filled: {report['total_nulls_filled']:,}")
            print(f"  Columns fixed: {report['columns_fixed']}")
            print(f"  Remaining null columns: {len(report['remaining_null_columns'])}")
            
            if report['remaining_null_columns']:
                print(f"  Still have nulls: {list(report['remaining_null_columns'].keys())}")
        
        print(f"\nTOTAL NULLS FILLED ACROSS ALL FILES: {total_nulls_filled:,}")
        print("="*60)
    else:
        print("No files were processed successfully.")


if __name__ == "__main__":
    main()