Spaces:

maaroufabousaleh
/

advisorai-data-enhanced

Sleeping

File size: 40,990 Bytes

c49b21b

import pandas as pd
import numpy as np
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

class FinalNullValueHandler:
    """
    Advanced final null value handler with symbol-first temporal interpolation.
    
    Strategy Priority:
    1. Same symbol, nearby timestamps (interpolation/extrapolation)
    2. Same symbol, historical mean/median
    3. Similar symbols (same asset class)
    4. Global defaults with symbol-specific variation
    """
    
    def __init__(self):
        self.crypto_column_defaults = self._define_crypto_defaults()
        self.stock_column_defaults = self._define_stock_defaults()
        self.symbol_profiles = {}
        self.symbol_stats = {}  # Historical statistics per symbol
        
    def _analyze_symbol_statistics(self, df):
        """Analyze historical statistics for each symbol to guide intelligent filling"""
        stats = {}
        
        # Sort by timestamp for proper temporal analysis
        if 'interval_timestamp' in df.columns:
            df_sorted = df.sort_values(['symbol', 'interval_timestamp'])
        else:
            df_sorted = df.sort_values('symbol')
        
        for symbol in df['symbol'].unique():
            symbol_data = df_sorted[df_sorted['symbol'] == symbol].copy()
            
            symbol_stats = {
                'symbol': symbol,
                'total_records': len(symbol_data),
                'date_range': None,
                'typical_values': {},
                'volatility': {},
                'trends': {},
                'seasonal_patterns': {}
            }
            
            # Calculate date range if timestamp available
            if 'interval_timestamp' in symbol_data.columns:
                timestamps = pd.to_datetime(symbol_data['interval_timestamp'], unit='ms')
                symbol_stats['date_range'] = {
                    'start': timestamps.min(),
                    'end': timestamps.max(),
                    'duration_days': (timestamps.max() - timestamps.min()).days
                }
            
            # Calculate typical values, volatility, and trends for numerical columns
            numerical_cols = symbol_data.select_dtypes(include=[np.number]).columns
            for col in numerical_cols:
                if col in ['interval_timestamp', 'backup_id']:
                    continue
                
                col_data = symbol_data[col].dropna()
                if len(col_data) > 0:
                    symbol_stats['typical_values'][col] = {
                        'mean': col_data.mean(),
                        'median': col_data.median(),
                        'std': col_data.std(),
                        'min': col_data.min(),
                        'max': col_data.max(),
                        'q25': col_data.quantile(0.25),
                        'q75': col_data.quantile(0.75),
                        'recent_mean': col_data.tail(min(10, len(col_data))).mean(),  # Last 10 values
                        'data_points': len(col_data)
                    }
                    
                    # Calculate volatility
                    if len(col_data) > 1:
                        symbol_stats['volatility'][col] = col_data.std() / (col_data.mean() + 1e-8)
                    
                    # Calculate trend if we have timestamp data
                    if 'interval_timestamp' in symbol_data.columns and len(col_data) >= 3:
                        # Simple linear trend
                        valid_rows = symbol_data[col].notna()
                        if valid_rows.sum() >= 3:
                            x = np.arange(len(symbol_data[valid_rows]))
                            y = symbol_data.loc[valid_rows, col].values
                            try:
                                trend_slope = np.polyfit(x, y, 1)[0]
                                symbol_stats['trends'][col] = trend_slope
                            except:
                                symbol_stats['trends'][col] = 0
            
            stats[symbol] = symbol_stats
        
        return stats
    
    def _temporal_interpolation_fill(self, df, symbol, column):
        """
        Fill nulls using temporal interpolation within the same symbol
        
        Priority:
        1. Linear interpolation between known values
        2. Forward fill from last known value
        3. Backward fill from next known value
        4. Exponential smoothing for trend continuation
        """
        try:
            symbol_mask = df['symbol'] == symbol
            symbol_data = df.loc[symbol_mask].copy()
            
            if column not in symbol_data.columns or symbol_data[column].notna().sum() == 0:
                return None
            
            # Sort by timestamp if available and remove duplicates
            if 'interval_timestamp' in symbol_data.columns:
                symbol_data = symbol_data.sort_values('interval_timestamp')
                # Drop duplicate timestamps for this symbol to avoid reindex issues
                symbol_data = symbol_data.drop_duplicates(subset=['interval_timestamp'], keep='first')
            
            # Reset index to avoid any index issues
            symbol_data = symbol_data.reset_index(drop=True)
            filled_series = symbol_data[column].copy()
            
            # 1. Linear interpolation (works best with timestamp ordering)
            if 'interval_timestamp' in symbol_data.columns and len(symbol_data) > 1:
                # Try time-based interpolation with safe fallback
                try:
                    original_index = filled_series.index
                    datetime_index = pd.to_datetime(symbol_data['interval_timestamp'], unit='ms')
                    
                    # Ensure unique datetime index
                    if datetime_index.duplicated().any():
                        # Add microseconds to make unique
                        for i, is_dup in enumerate(datetime_index.duplicated(keep='first')):
                            if is_dup:
                                datetime_index.iloc[i] += pd.Timedelta(microseconds=i+1)
                    
                    filled_series.index = datetime_index
                    filled_series = filled_series.interpolate(method='time')
                    filled_series.index = original_index  # Restore original index
                except Exception:
                    # Fallback to linear interpolation if time interpolation fails
                    filled_series = filled_series.interpolate(method='linear')
            else:
                filled_series = filled_series.interpolate(method='linear')
            
            # 2. Forward fill
            filled_series = filled_series.ffill()
            
            # 3. Backward fill  
            filled_series = filled_series.bfill()
            
            # 4. If still has nulls, use trend extrapolation
            if filled_series.isna().any() and symbol in self.symbol_stats:
                symbol_stat = self.symbol_stats[symbol]
                if column in symbol_stat.get('typical_values', {}):
                    typical_val = symbol_stat['typical_values'][column]['recent_mean']
                    trend = symbol_stat.get('trends', {}).get(column, 0)
                    
                    # Apply trend-based extrapolation for remaining nulls
                    for idx in filled_series[filled_series.isna()].index:
                        # Simple trend continuation
                        filled_series[idx] = typical_val + trend * (idx % 10)  # Modest trend application
            
            return filled_series
            
        except Exception as e:
            # If all else fails, return None to trigger fallback behavior
            print(f"Warning: Temporal interpolation failed for {symbol} {column}: {e}")
            return None
    
    def _similar_symbol_fill(self, df, symbol, column, asset_type):
        """
        Fill nulls using similar symbols in the same asset class
        """
        if asset_type == 'crypto':
            # For crypto, use symbols with similar rank or market cap
            target_stats = self.symbol_stats.get(symbol, {})
            target_rank = target_stats.get('typical_values', {}).get('rank', {}).get('median', 999)
            
            similar_symbols = []
            for sym, stats in self.symbol_stats.items():
                if sym == symbol:
                    continue
                    
                sym_rank = stats.get('typical_values', {}).get('rank', {}).get('median', 999)
                if abs(sym_rank - target_rank) <= 50:  # Similar rank range
                    similar_symbols.append(sym)
            
        else:  # stock
            # For stocks, use symbols with similar market cap or sector
            target_stats = self.symbol_stats.get(symbol, {})
            target_mcap = target_stats.get('typical_values', {}).get('marketCapitalization', {}).get('median', 0)
            
            similar_symbols = []
            for sym, stats in self.symbol_stats.items():
                if sym == symbol:
                    continue
                    
                sym_mcap = stats.get('typical_values', {}).get('marketCapitalization', {}).get('median', 0)
                if target_mcap > 0 and sym_mcap > 0:
                    ratio = max(sym_mcap, target_mcap) / min(sym_mcap, target_mcap)
                    if ratio <= 5:  # Within 5x market cap
                        similar_symbols.append(sym)
        
        if not similar_symbols:
            return None
        
        # Get values from similar symbols
        similar_data = df[df['symbol'].isin(similar_symbols)][column].dropna()
        if len(similar_data) > 0:
            # Use weighted average based on similarity
            return similar_data.median()  # Robust central tendency
        
        return None
    
    def _intelligent_symbol_fill(self, df, symbol, column):
        """
        Intelligent filling strategy prioritizing symbol-specific data
        
        Returns the best estimate for null values in the specified column for the given symbol
        """
        # Strategy 1: Temporal interpolation within same symbol
        temporal_result = self._temporal_interpolation_fill(df, symbol, column)
        if temporal_result is not None and temporal_result.notna().any():
            return temporal_result
        
        # Strategy 2: Use historical statistics from same symbol
        if symbol in self.symbol_stats and column in self.symbol_stats[symbol]['typical_values']:
            stats = self.symbol_stats[symbol]['typical_values'][column]
            
            # Choose appropriate central tendency based on data characteristics
            if stats['data_points'] >= 10:
                # Use recent mean for frequently updated data
                return stats['recent_mean']
            elif stats['data_points'] >= 3:
                # Use median for small datasets (more robust)
                return stats['median']
            else:
                # Use mean for very small datasets
                return stats['mean']
        
        # Strategy 3: Use similar symbols
        asset_type = 'crypto' if symbol in df.columns and any(
            col in df.columns for col in ['rank', 'dominance', 'performance.day']
        ) else 'stock'
        
        similar_fill = self._similar_symbol_fill(df, symbol, column, asset_type)
        if similar_fill is not None:
            return similar_fill
        
        # Strategy 4: Global fallback with symbol variation
        return None  # Will be handled by existing default logic
        
    def _define_crypto_defaults(self):
        """Define intelligent defaults for crypto-specific columns"""
        return {
            # Crypto market data
            'dominance': 0.001,  # Very small dominance for minor cryptos
            'rank': 999,  # Low rank for unknown cryptos
            'stable': 0,  # Most cryptos are not stablecoins (use 0 instead of False)
            'marketcap': 1000000,  # $1M default market cap
            'transaction_count': 100,  # Minimal transaction count
            'transaction_volume': 10000,  # Minimal transaction volume
            'tx_price_correlation': 0.5,  # Neutral correlation
            
            # Exchange prices (use main price as baseline)
            'exchangePrices.binance': None,  # Will be filled with main price
            'exchangePrices.coinbase': None,
            'exchangePrices.kraken': None,
            'exchangePrices.bybit': None,
            'exchangePrices.kucoin': None,
            'exchangePrices.okx': None,
            'exchangePrices.mexc': None,
            'exchangePrices.gateio': None,
            'exchangePrices.bitget': None,
            'exchangePrices.bitmart': None,
            'exchangePrices.bingx': None,
            'exchangePrices.cryptocom': None,
            
            # Exchange symbols (use main symbol as baseline)
            'symbols.binance': None,  # Will be filled with main symbol
            'symbols.coinbase': None,
            'symbols.kraken': None,
            'symbols.bybit': None,
            'symbols.kucoin': None,
            'symbols.okx': None,
            'symbols.mexc': None,
            'symbols.gateio': None,
            'symbols.bitget': None,
            'symbols.bitmart': None,
            'symbols.bingx': None,
            'symbols.cryptocom': None,
            
            # Performance metrics (neutral/small changes)
            'performance.day': 0.0,
            'performance.hour': 0.0,
            'performance.hour4': 0.0,
            'performance.min1': 0.0,
            'performance.min15': 0.0,
            'performance.min5': 0.0,
            'performance.month': 0.0,
            'performance.month3': 0.0,
            'performance.week': 0.0,
            'performance.year': 0.0,
            
            # Rank differences (no change)
            'rankDiffs.day': 0,
            'rankDiffs.hour': 0,
            'rankDiffs.hour4': 0,
            'rankDiffs.min1': 0,
            'rankDiffs.min15': 0,
            'rankDiffs.min5': 0,
            'rankDiffs.month': 0,
            'rankDiffs.month3': 0,
            'rankDiffs.week': 0,
            'rankDiffs.year': 0,
            
            # Technical indicators
            'bb_width': 0.02,  # Small bollinger band width
            'cg_id': None,  # Will be derived from symbol
        }
    
    def _define_stock_defaults(self):
        """Define intelligent defaults for stock-specific columns"""
        return {
            # Stock market data
            'stock_market': 'NASDAQ',  # Default market
            'marketCapitalization': 1000000000,  # $1B default
            'shareOutstanding': 100000000,  # 100M shares default
            'mspr': 0,  # Neutral momentum
            
            # News and sentiment data
            'news_activity_score_x': 0,
            'news_activity_score_y': 0,
            'news_articles_count_x': 0,
            'news_articles_count_y': 0,
            'news_highlights_count_x': 0,
            'news_highlights_count_y': 0,
            'news_match_score_max_x': 0,
            'news_match_score_max_y': 0,
            'news_match_score_mean_x': 0,
            'news_match_score_mean_y': 0,
            'news_mentions_count_x': 0,
            'news_mentions_count_y': 0,
            'news_sentiment_max_x': 0.5,  # Neutral sentiment
            'news_sentiment_max_y': 0.5,
            'news_sentiment_mean_x': 0.5,
            'news_sentiment_mean_y': 0.5,
            'news_sentiment_min_x': 0.5,
            'news_sentiment_min_y': 0.5,
            'news_sentiment_range_x': 0,
            'news_sentiment_range_y': 0,
            'news_sentiment_std': 0,
            'news_sentiment_std_x': 0,
            'news_sentiment_std_y': 0,
            
            # Analyst ratings
            'buy': 5,  # Moderate buy recommendations
            'hold': 10,  # More hold recommendations
            'sell': 2,  # Few sell recommendations
            'strongBuy': 3,
            'strongSell': 1,
            
            # Technical indicators
            'volume_price_momentum': 0.0,  # Neutral momentum
        }
    
    def _create_symbol_profiles(self, df):
        """Create profiles for each symbol to guide intelligent filling"""
        profiles = {}
        
        for symbol in df['symbol'].unique():
            symbol_data = df[df['symbol'] == symbol]
            
            # Determine if it's crypto or stock
            is_crypto = 'rank' in symbol_data.columns and symbol_data['rank'].notna().any()
            if not is_crypto:
                is_crypto = any(col.startswith('performance.') for col in symbol_data.columns)
            
            # Calculate key statistics
            profile = {
                'symbol': symbol,
                'is_crypto': is_crypto,
                'total_records': len(symbol_data),
                'data_density': symbol_data.notna().mean().mean(),
                'has_price_data': 'price' in symbol_data.columns and symbol_data['price'].notna().any(),
                'typical_price': symbol_data.get('price', pd.Series([100])).median(),
                'typical_volume': symbol_data.get('volume', pd.Series([1000000])).median(),
                'typical_marketcap': symbol_data.get('marketcap', symbol_data.get('marketCapitalization', pd.Series([1000000000]))).median()
            }
            
            profiles[symbol] = profile
            
        return profiles
    
    def _intelligent_fill_value(self, df, symbol, column, default_value):
        """Generate intelligent fill value based on symbol context"""
        profile = self.symbol_profiles.get(symbol, {})
        
        # Add symbol-specific variation to prevent homogenization
        symbol_hash = hash(f"{symbol}_{column}") % 1000
        variation_factor = (symbol_hash / 1000.0 - 0.5) * 0.1  # ±5% variation
        
        if default_value is None:
            return None
        elif isinstance(default_value, (int, float)):
            if default_value == 0:
                return 0  # Keep zeros as zeros
            else:
                return default_value * (1 + variation_factor)
        else:
            return default_value
    
    def _fill_exchange_prices_advanced(self, df):
        """Advanced exchange price filling using symbol-first strategy"""
        exchange_price_cols = [col for col in df.columns if col.startswith('exchangePrices.')]
        
        if not exchange_price_cols or 'price' not in df.columns:
            return df
        
        df_result = df.copy()
        
        for symbol in df['symbol'].unique():
            symbol_mask = df['symbol'] == symbol
            symbol_data = df.loc[symbol_mask]
            
            # First try to get main price from symbol's own data
            main_price_series = self._intelligent_symbol_fill(df, symbol, 'price')
            if main_price_series is None or (isinstance(main_price_series, pd.Series) and main_price_series.isna().all()):
                continue
            
            if isinstance(main_price_series, pd.Series):
                main_price = main_price_series.median()
            else:
                main_price = main_price_series
            
            if pd.isna(main_price):
                continue
            
            # Fill exchange prices for this symbol
            for exchange_col in exchange_price_cols:
                if symbol_data[exchange_col].isna().any():
                    # First try temporal interpolation for this exchange
                    exchange_filled = self._intelligent_symbol_fill(df, symbol, exchange_col)
                    
                    if exchange_filled is not None:
                        if isinstance(exchange_filled, pd.Series):
                            df_result.loc[symbol_mask, exchange_col] = exchange_filled
                        else:
                            null_mask = df_result.loc[symbol_mask, exchange_col].isna()
                            df_result.loc[symbol_mask & null_mask, exchange_col] = exchange_filled
                    else:
                        # Fallback: use main price with small exchange-specific variation
                        exchange_hash = hash(f"{symbol}_{exchange_col}") % 100
                        variation = (exchange_hash / 100.0 - 0.5) * 0.01  # ±0.5%
                        exchange_price = main_price * (1 + variation)
                        null_mask = df_result.loc[symbol_mask, exchange_col].isna()
                        df_result.loc[symbol_mask & null_mask, exchange_col] = exchange_price
        
        return df_result
    
    def _fill_exchange_symbols(self, df):
        """Fill exchange symbols with main symbol + exchange-specific formatting"""
        exchange_symbol_cols = [col for col in df.columns if col.startswith('symbols.')]
        
        if not exchange_symbol_cols or 'symbol' not in df.columns:
            return df
        
        df_result = df.copy()
        
        # Exchange-specific symbol formatting
        exchange_formats = {
            'symbols.binance': lambda s: f"{s.upper()}USDT" if s.lower() != 'bitcoin' else "BTCUSDT",
            'symbols.coinbase': lambda s: f"{s.upper()}-USD",
            'symbols.kraken': lambda s: f"{s.upper()}USD" if len(s) <= 3 else f"{s.upper()}/USD",
            'symbols.bybit': lambda s: f"{s.upper()}USDT",
            'symbols.kucoin': lambda s: f"{s.upper()}-USDT",
            'symbols.okx': lambda s: f"{s.upper()}-USDT",
            'symbols.mexc': lambda s: f"{s.upper()}_USDT",
            'symbols.gateio': lambda s: f"{s.upper()}_USDT",
            'symbols.bitget': lambda s: f"{s.upper()}USDT",
            'symbols.bitmart': lambda s: f"{s.upper()}_USDT",
            'symbols.bingx': lambda s: f"{s.upper()}-USDT",
            'symbols.cryptocom': lambda s: f"{s.upper()}_USDT"
        }
        
        for symbol in df['symbol'].unique():
            symbol_mask = df['symbol'] == symbol
            
            for exchange_col in exchange_symbol_cols:
                if df.loc[symbol_mask, exchange_col].isna().all():
                    formatter = exchange_formats.get(exchange_col, lambda s: s.upper())
                    try:
                        exchange_symbol = formatter(symbol)
                        df_result.loc[symbol_mask, exchange_col] = exchange_symbol
                    except Exception:
                        df_result.loc[symbol_mask, exchange_col] = symbol.upper()
        
        return df_result
    
    def _fill_cg_id(self, df):
        """Fill CoinGecko ID based on symbol"""
        if 'cg_id' not in df.columns:
            return df
        
        df_result = df.copy()
        
        # Common CoinGecko ID mappings
        cg_id_mapping = {
            'bitcoin': 'bitcoin',
            'btc': 'bitcoin',
            'ethereum': 'ethereum',
            'eth': 'ethereum',
            'binancecoin': 'binancecoin',
            'bnb': 'binancecoin',
            'cardano': 'cardano',
            'ada': 'cardano',
            'solana': 'solana',
            'sol': 'solana',
            'xrp': 'ripple',
            'ripple': 'ripple',
            'dogecoin': 'dogecoin',
            'doge': 'dogecoin',
            'polkadot': 'polkadot',
            'dot': 'polkadot',
            'avalanche-2': 'avalanche-2',
            'avax': 'avalanche-2',
            'chainlink': 'chainlink',
            'link': 'chainlink',
            'polygon': 'matic-network',
            'matic': 'matic-network',
            'litecoin': 'litecoin',
            'ltc': 'litecoin',
            'uniswap': 'uniswap',
            'uni': 'uniswap'
        }
        
        for symbol in df['symbol'].unique():
            symbol_mask = df['symbol'] == symbol
            
            if df.loc[symbol_mask, 'cg_id'].isna().all():
                cg_id = cg_id_mapping.get(symbol.lower(), symbol.lower())
                df_result.loc[symbol_mask, 'cg_id'] = cg_id
        
        return df_result
    
    def process_crypto_features(self, df):
        """Process crypto features with advanced symbol-first null handling"""
        print("Processing crypto features with symbol-first strategy...")
        df_result = df.copy()
        
        # Step 1: Analyze symbol statistics for intelligent filling
        print("Analyzing symbol statistics...")
        self.symbol_stats = self._analyze_symbol_statistics(df_result)
        print(f"Analyzed {len(self.symbol_stats)} symbols")
        
        # Step 2: Create symbol profiles
        self.symbol_profiles = self._create_symbol_profiles(df_result)
        
        # Step 3: Symbol-first null handling for key columns
        priority_columns = [
            'price', 'volume', 'marketcap', 'dominance', 'rank',
            'performance.day', 'performance.week', 'performance.month',
            'rsi', 'macd', 'transaction_count', 'transaction_volume'
        ]
        
        for column in priority_columns:
            if column in df_result.columns and df_result[column].isna().any():
                print(f"Processing {column} with symbol-first strategy...")
                
                for symbol in df_result['symbol'].unique():
                    symbol_mask = df_result['symbol'] == symbol
                    null_mask = df_result[column].isna()
                    fill_mask = symbol_mask & null_mask
                    
                    if fill_mask.any():
                        # Use intelligent symbol-first filling
                        fill_result = self._intelligent_symbol_fill(df_result, symbol, column)
                        
                        if fill_result is not None:
                            if isinstance(fill_result, pd.Series):
                                # If we got a series back (from temporal interpolation)
                                # Make sure the series aligns with the symbol mask
                                symbol_indices = df_result[symbol_mask].index
                                if len(fill_result) == len(symbol_indices):
                                    # Map the series values to the correct indices
                                    for i, idx in enumerate(symbol_indices):
                                        if pd.notna(fill_result.iloc[i]):
                                            df_result.loc[idx, column] = fill_result.iloc[i]
                                else:
                                    # Fallback: use median of the series
                                    fill_value = fill_result.median()
                                    if pd.notna(fill_value):
                                        df_result.loc[fill_mask, column] = fill_value
                            else:
                                # If we got a scalar value
                                df_result.loc[fill_mask, column] = fill_result
        
        # Step 4: Handle exchange prices with cross-reference to main price
        df_result = self._fill_exchange_prices_advanced(df_result)
        
        # Step 5: Handle exchange symbols with proper formatting
        df_result = self._fill_exchange_symbols(df_result)
        
        # Step 6: Handle CoinGecko IDs
        df_result = self._fill_cg_id(df_result)
        
        # Step 7: Fill remaining columns with intelligent defaults
        for column in df_result.columns:
            if df_result[column].isna().any():
                default_value = self.crypto_column_defaults.get(column)
                
                if default_value is not None:
                    for symbol in df_result['symbol'].unique():
                        symbol_mask = df_result['symbol'] == symbol
                        null_mask = df_result[column].isna()
                        fill_mask = symbol_mask & null_mask
                        
                        if fill_mask.any():
                            try:
                                fill_value = self._intelligent_fill_value(
                                    df_result, symbol, column, default_value
                                )
                                df_result.loc[fill_mask, column] = fill_value
                            except Exception as e:
                                print(f"Warning: Failed to fill {column} for {symbol}: {e}")
                                # Skip this column for this symbol
                                continue
        
        return df_result
    
    def process_stock_features(self, df):
        """Process stock features with advanced symbol-first null handling"""
        print("Processing stock features with symbol-first strategy...")
        df_result = df.copy()
        
        # Step 1: Analyze symbol statistics for intelligent filling
        print("Analyzing symbol statistics...")
        self.symbol_stats = self._analyze_symbol_statistics(df_result)
        print(f"Analyzed {len(self.symbol_stats)} symbols")
        
        # Step 2: Create symbol profiles
        self.symbol_profiles = self._create_symbol_profiles(df_result)
        
        # Step 3: Symbol-first null handling for key columns
        priority_columns = [
            'close', 'open', 'high', 'low', 'volume', 'prev_close',
            'marketCapitalization', 'shareOutstanding',
            'rsi', 'macd', 'atr', 'bb_position',
            'news_sentiment_mean_x', 'news_sentiment_mean_y',
            'buy', 'sell', 'hold', 'strongBuy', 'strongSell'
        ]
        
        for column in priority_columns:
            if column in df_result.columns and df_result[column].isna().any():
                print(f"Processing {column} with symbol-first strategy...")
                
                for symbol in df_result['symbol'].unique():
                    symbol_mask = df_result['symbol'] == symbol
                    null_mask = df_result[column].isna()
                    fill_mask = symbol_mask & null_mask
                    
                    if fill_mask.any():
                        # Use intelligent symbol-first filling
                        fill_result = self._intelligent_symbol_fill(df_result, symbol, column)
                        
                        if fill_result is not None:
                            if isinstance(fill_result, pd.Series):
                                # If we got a series back (from temporal interpolation)
                                # Make sure the series aligns with the symbol mask
                                symbol_indices = df_result[symbol_mask].index
                                if len(fill_result) == len(symbol_indices):
                                    # Map the series values to the correct indices
                                    for i, idx in enumerate(symbol_indices):
                                        if pd.notna(fill_result.iloc[i]):
                                            df_result.loc[idx, column] = fill_result.iloc[i]
                                else:
                                    # Fallback: use median of the series
                                    fill_value = fill_result.median()
                                    if pd.notna(fill_value):
                                        df_result.loc[fill_mask, column] = fill_value
                            else:
                                # If we got a scalar value
                                df_result.loc[fill_mask, column] = fill_result
        
        # Step 4: Fill remaining columns with intelligent defaults
        for column in df_result.columns:
            if df_result[column].isna().any():
                default_value = self.stock_column_defaults.get(column)
                
                if default_value is not None:
                    for symbol in df_result['symbol'].unique():
                        symbol_mask = df_result['symbol'] == symbol
                        null_mask = df_result[column].isna()
                        fill_mask = symbol_mask & null_mask
                        
                        if fill_mask.any():
                            try:
                                fill_value = self._intelligent_fill_value(
                                    df_result, symbol, column, default_value
                                )
                                df_result.loc[fill_mask, column] = fill_value
                            except Exception as e:
                                print(f"Warning: Failed to fill {column} for {symbol}: {e}")
                                # Skip this column for this symbol
                                continue
        
        return df_result
    
    def generate_report(self, df_before, df_after, feature_type):
        """Summarise the null-filling outcome by comparing two snapshots of a table.

        Args:
            df_before: DataFrame as it looked before null handling.
            df_after: DataFrame after null handling.
            feature_type: Label stored verbatim under ``'feature_type'``.

        Returns:
            dict: Overall totals, per-column details, a per-symbol breakdown
            (only when ``df_before`` has a ``'symbol'`` column), a per-day
            breakdown (only when ``df_before`` has ``'interval_timestamp'``,
            read as epoch milliseconds) and fixed strategy flags. Counts are
            converted to built-in ``int``/``float`` values.
        """
        def _missing_cells(frame):
            # Total number of null cells in the frame, as a built-in int.
            return int(frame.isnull().sum().sum())

        nulls_pre = df_before.isnull().sum()
        nulls_post = df_after.isnull().sum()

        delta = nulls_pre - nulls_post
        improved = delta[delta > 0]

        # Per-symbol breakdown
        per_symbol = {}
        if 'symbol' in df_before.columns:
            for sym in df_before['symbol'].unique():
                rows_pre = df_before[df_before['symbol'] == sym]
                rows_post = df_after[df_after['symbol'] == sym]
                missing_pre = _missing_cells(rows_pre)
                missing_post = _missing_cells(rows_post)
                per_symbol[sym] = {
                    'nulls_before': missing_pre,
                    'nulls_after': missing_post,
                    'nulls_filled': missing_pre - missing_post,
                    'records': int(len(rows_pre))
                }

        # Per-day breakdown, derived from the millisecond timestamps
        per_day = {}
        if 'interval_timestamp' in df_before.columns:
            dated_pre = df_before.assign(
                date=pd.to_datetime(df_before['interval_timestamp'], unit='ms').dt.date
            )
            dated_post = df_after.assign(
                date=pd.to_datetime(df_after['interval_timestamp'], unit='ms').dt.date
            )

            for day in dated_pre['date'].unique():
                missing_pre = _missing_cells(dated_pre[dated_pre['date'] == day])
                missing_post = _missing_cells(dated_post[dated_post['date'] == day])
                per_day[str(day)] = {
                    'nulls_before': missing_pre,
                    'nulls_after': missing_post,
                    'nulls_filled': missing_pre - missing_post
                }

        # Share of the original nulls that were removed (0 when there were none)
        if nulls_pre.sum() > 0:
            reduction_rate = float(delta.sum() / nulls_pre.sum())
        else:
            reduction_rate = float(0)

        if 'symbol' in df_after.columns:
            symbol_count = int(len(df_after['symbol'].unique()))
        else:
            symbol_count = 0

        still_null = nulls_post[nulls_post > 0]

        return {
            'feature_type': feature_type,
            'timestamp': pd.Timestamp.now().isoformat(),
            'strategy': 'symbol-first-temporal-interpolation',
            'total_rows': int(len(df_after)),
            'total_columns': int(len(df_after.columns)),
            'unique_symbols': symbol_count,
            'columns_with_nulls_before': int((nulls_pre > 0).sum()),
            'columns_with_nulls_after': int((nulls_post > 0).sum()),
            'total_nulls_before': int(nulls_pre.sum()),
            'total_nulls_after': int(nulls_post.sum()),
            'total_nulls_filled': int(delta.sum()),
            'columns_fixed': int(len(improved)),
            'null_reduction_rate': reduction_rate,
            'remaining_null_columns': {str(col): int(cnt) for col, cnt in still_null.to_dict().items()},
            'fixed_columns_detail': {str(col): int(cnt) for col, cnt in improved.to_dict().items()},
            'symbol_analysis': per_symbol,
            'temporal_analysis': per_day,
            'strategy_details': {
                'symbol_stats_analyzed': len(self.symbol_stats),
                'temporal_interpolation_used': True,
                'similar_symbol_fallback': True,
                'intelligent_defaults': True
            }
        }


def process_crypto_features_file(input_path, output_path=None):
    """Fill nulls in a crypto features parquet file and write a JSON report.

    The file is loaded with ``pd.read_parquet``, passed through
    ``FinalNullValueHandler.process_crypto_features``, written back as
    parquet, and a report produced by ``generate_report`` is saved beside
    the output as ``<output stem>_null_handling_report.json``.

    Args:
        input_path: Path (``str`` or ``Path``) of the parquet file to read.
        output_path: Destination parquet path. When ``None`` the input file
            is overwritten in place.

    Returns:
        tuple: ``(df_processed, report)`` - the processed DataFrame and the
        report dictionary.
    """
    if output_path is None:
        output_path = input_path

    print(f"Loading crypto features from {input_path}...")
    df = pd.read_parquet(input_path)

    print(f"Loaded {len(df)} rows with {len(df.columns)} columns")
    print(f"Null values before processing: {df.isnull().sum().sum()}")

    handler = FinalNullValueHandler()
    df_processed = handler.process_crypto_features(df)

    print(f"Null values after processing: {df_processed.isnull().sum().sum()}")

    # Generate report
    report = handler.generate_report(df, df_processed, 'crypto')

    # Save processed data
    df_processed.to_parquet(output_path, index=False)
    print(f"Saved processed crypto features to {output_path}")

    # Save report. The name is built from the file stem rather than by
    # substring replacement: with str.replace, an output name lacking a
    # literal '.parquet' (e.g. '.pq') left the path unchanged, so the JSON
    # report overwrote the parquet data that had just been written.
    output_file = Path(output_path)
    report_path = output_file.with_name(f"{output_file.stem}_null_handling_report.json")
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    print(f"Saved processing report to {report_path}")

    return df_processed, report


def process_stock_features_file(input_path, output_path=None):
    """Fill nulls in a stock features parquet file and write a JSON report.

    The file is loaded with ``pd.read_parquet``, passed through
    ``FinalNullValueHandler.process_stock_features``, written back as
    parquet, and a report produced by ``generate_report`` is saved beside
    the output as ``<output stem>_null_handling_report.json``.

    Args:
        input_path: Path (``str`` or ``Path``) of the parquet file to read.
        output_path: Destination parquet path. When ``None`` the input file
            is overwritten in place.

    Returns:
        tuple: ``(df_processed, report)`` - the processed DataFrame and the
        report dictionary.
    """
    if output_path is None:
        output_path = input_path

    print(f"Loading stock features from {input_path}...")
    df = pd.read_parquet(input_path)

    print(f"Loaded {len(df)} rows with {len(df.columns)} columns")
    print(f"Null values before processing: {df.isnull().sum().sum()}")

    handler = FinalNullValueHandler()
    df_processed = handler.process_stock_features(df)

    print(f"Null values after processing: {df_processed.isnull().sum().sum()}")

    # Generate report
    report = handler.generate_report(df, df_processed, 'stock')

    # Save processed data
    df_processed.to_parquet(output_path, index=False)
    print(f"Saved processed stock features to {output_path}")

    # Save report. The name is built from the file stem rather than by
    # substring replacement: with str.replace, an output name lacking a
    # literal '.parquet' (e.g. '.pq') left the path unchanged, so the JSON
    # report overwrote the parquet data that had just been written.
    output_file = Path(output_path)
    report_path = output_file.with_name(f"{output_file.stem}_null_handling_report.json")
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)
    print(f"Saved processing report to {report_path}")

    return df_processed, report


def main():
    """Run null-value handling on the crypto and stock feature files.

    Each file is processed only if it exists; a missing file produces a
    warning and a processing failure is printed without stopping the other
    file. A combined summary is printed for the files that succeeded.
    """
    crypto_path = Path("data/merged/features/crypto_features.parquet")
    stocks_path = Path("data/merged/features/stocks_features.parquet")

    completed = []

    # Crypto features
    if not crypto_path.exists():
        print(f"Warning: {crypto_path} not found")
    else:
        try:
            _, crypto_report = process_crypto_features_file(crypto_path)
            completed.append(('crypto', crypto_path, crypto_report))
            print(f"✓ Crypto features processed: {crypto_report['total_nulls_filled']} nulls filled")
        except Exception as e:
            print(f"✗ Error processing crypto features: {e}")

    # Stock features
    if not stocks_path.exists():
        print(f"Warning: {stocks_path} not found")
    else:
        try:
            _, stocks_report = process_stock_features_file(stocks_path)
            completed.append(('stocks', stocks_path, stocks_report))
            print(f"✓ Stock features processed: {stocks_report['total_nulls_filled']} nulls filled")
        except Exception as e:
            print(f"✗ Error processing stock features: {e}")

    # Nothing succeeded: say so and stop before the summary
    if not completed:
        print("No files were processed successfully.")
        return

    banner = "="*60
    print("\n" + banner)
    print("FINAL NULL VALUE HANDLING SUMMARY")
    print(banner)

    grand_total = 0
    for label, path, summary in completed:
        grand_total += summary['total_nulls_filled']
        leftover = summary['remaining_null_columns']

        print(f"\n{label.upper()} FEATURES:")
        print(f"  File: {path}")
        print(f"  Rows: {summary['total_rows']:,}")
        print(f"  Columns: {summary['total_columns']}")
        print(f"  Nulls filled: {summary['total_nulls_filled']:,}")
        print(f"  Columns fixed: {summary['columns_fixed']}")
        print(f"  Remaining null columns: {len(leftover)}")

        if leftover:
            print(f"  Still have nulls: {list(leftover.keys())}")

    print(f"\nTOTAL NULLS FILLED ACROSS ALL FILES: {grand_total:,}")
    print(banner)


# Run the batch job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()