import warnings
import zlib

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')
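
# NOTE: Python's built-in hash() is salted per process (PYTHONHASHSEED), so any
# "symbol-specific noise" derived from it changes between runs. _stable_hash is
# a small helper added here (the name is ours, not part of the original
# pipeline) that substitutes a deterministic CRC32 hash so repeated runs
# impute identical values.
def _stable_hash(key):
    """Return a deterministic, process-stable hash of a string."""
    return zlib.crc32(key.encode('utf-8'))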

class ImprovedStockDataImputer:
    """
    Enhanced imputation that prevents data homogenization by using
    symbol-specific patterns and relationships.
    """
    
    def __init__(self, preserve_symbol_diversity=True):
        self.preserve_symbol_diversity = preserve_symbol_diversity
        self.symbol_profiles = {}
        self.scalers = {}
        
    def _create_symbol_profiles(self, df):
        """Create profiles for each symbol to guide imputation."""
        profiles = {}
        
        for symbol in df['symbol'].unique():
            symbol_data = df[df['symbol'] == symbol]
            
            # Calculate symbol-specific statistics with proper null handling
            price_col = None
            for col in ['price', 'close', 'close_alpaca', 'open', 'high', 'low']:
                if col in symbol_data.columns and not symbol_data[col].isnull().all():
                    price_col = col
                    break
            
            volume_col = None
            for col in ['volume', 'volume_alpaca']:
                if col in symbol_data.columns and not symbol_data[col].isnull().all():
                    volume_col = col
                    break
            
            profile = {
                'symbol': symbol,
                'price_level': symbol_data[price_col].median() if price_col else 100.0,  # Default to 100
                'price_volatility': symbol_data[price_col].std() if price_col else 2.0,  # Default volatility
                'volume_level': symbol_data[volume_col].median() if volume_col else 1000.0,  # Default volume
                'is_crypto': symbol_data['is_crypto'].mode().iloc[0] if 'is_crypto' in symbol_data.columns and not symbol_data['is_crypto'].isnull().all() else 0,
                'typical_rsi': symbol_data['rsi'].median() if 'rsi' in symbol_data.columns and not symbol_data['rsi'].isnull().all() else 50.0,
                'data_availability': len(symbol_data) / len(df) if len(df) > 0 else 0
            }
            
            # Ensure no None values in profile
            for key, value in profile.items():
                if value is None or (isinstance(value, float) and np.isnan(value)):
                    if key == 'price_level':
                        profile[key] = 100.0
                    elif key == 'price_volatility':
                        profile[key] = 2.0
                    elif key == 'volume_level':
                        profile[key] = 1000.0
                    elif key == 'typical_rsi':
                        profile[key] = 50.0
                    elif key == 'is_crypto':
                        profile[key] = 0
                    else:
                        profile[key] = 0.0
            
            profiles[symbol] = profile
            
        return profiles
    
    def _impute_with_symbol_context(self, df, column, symbol_profiles):
        """Impute values using symbol-specific context to prevent homogenization."""
        
        df_result = df.copy()
        
        for symbol in df['symbol'].unique():
            symbol_mask = df['symbol'] == symbol
            symbol_data = df.loc[symbol_mask, column]
            
            if symbol_data.isnull().sum() == 0:
                continue  # No missing values for this symbol
            
            profile = symbol_profiles.get(symbol, {})
            
            # Strategy depends on column type and symbol characteristics
            if column in ['price', 'open', 'high', 'low', 'close']:
                # Price data - use interpolation with symbol-specific bounds
                interpolated = symbol_data.interpolate(method='linear', limit_direction='both')
                
                # If still missing, use symbol's typical price level with noise
                if interpolated.isnull().any():
                    base_price = profile.get('price_level', 100.0)
                    volatility = profile.get('price_volatility', base_price * 0.02)
                    
                    # Add symbol-specific noise to prevent identical values
                    symbol_hash = _stable_hash(symbol) % 1000 / 1000  # 0-1 range
                    noise_factor = (symbol_hash - 0.5) * 0.1  # -5% to +5%
                    adjusted_price = base_price * (1 + noise_factor)
                    
                    interpolated = interpolated.fillna(adjusted_price)
                
                df_result.loc[symbol_mask, column] = interpolated
                
            elif column in ['volume', 'volume_alpaca']:
                # Volume data - use forward fill then symbol-specific median
                filled = symbol_data.ffill().bfill()
                
                if filled.isnull().any():
                    # Use symbol's typical volume with variation
                    base_volume = profile.get('volume_level', 1000.0)
                    symbol_hash = _stable_hash(symbol + column) % 1000 / 1000
                    volume_multiplier = 0.5 + symbol_hash  # 0.5x to 1.5x variation
                    adjusted_volume = base_volume * volume_multiplier
                    filled = filled.fillna(adjusted_volume)
                
                df_result.loc[symbol_mask, column] = filled
                
            elif column in ['rsi', 'stoch_k', 'stoch_d']:
                # Oscillator indicators - use symbol-specific typical values
                symbol_median = symbol_data.median()
                
                if pd.isna(symbol_median):
                    # Use symbol-specific baseline with variation
                    symbol_hash = _stable_hash(symbol + column) % 1000 / 1000
                    if column == 'rsi':
                        # RSI: 30-70 range with symbol variation
                        baseline = 30 + (symbol_hash * 40)  # 30-70 range
                    else:  # stochastic
                        baseline = 20 + (symbol_hash * 60)  # 20-80 range
                else:
                    baseline = symbol_median
                
                df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline)
                
            elif column in ['macd', 'macd_signal', 'macd_histogram']:
                # MACD - can be positive/negative, use symbol-specific pattern
                symbol_median = symbol_data.median()
                
                if pd.isna(symbol_median):
                    # Use price-level dependent MACD estimation with null safety
                    price_level = profile.get('price_level', 100.0)  # Default to 100 if None
                    if price_level is None or np.isnan(price_level):
                        price_level = 100.0
                    
                    symbol_hash = _stable_hash(symbol + column) % 2000 / 1000 - 1  # -1 to +1
                    # Scale MACD relative to price level
                    baseline = (price_level * 0.001) * symbol_hash
                else:
                    baseline = symbol_median
                
                df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline)
                
            else:
                # Generic numeric imputation with symbol variation
                symbol_median = symbol_data.median()
                
                if pd.isna(symbol_median):
                    # Use overall median but add symbol-specific variation
                    overall_median = df[column].median()
                    if pd.isna(overall_median):
                        overall_median = 0
                    
                    # Add symbol-specific variation (±10%)
                    symbol_hash = _stable_hash(symbol + column) % 2000 / 1000 - 1  # -1 to +1
                    variation = overall_median * 0.1 * symbol_hash
                    baseline = overall_median + variation
                else:
                    baseline = symbol_median
                
                df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline)
        
        return df_result[column]
    
    def fit_transform(self, df):
        """Apply improved imputation with anti-homogenization measures."""
        
        df_imputed = df.copy()
        df_imputed = df_imputed.sort_values(['symbol', 'interval_timestamp'])
        
        # Create symbol profiles
        self.symbol_profiles = self._create_symbol_profiles(df_imputed)
        
        print(f"Created profiles for {len(self.symbol_profiles)} unique symbols")
        
        # 1. Handle categorical/flag columns (same as before)
        categorical_cols = [
            'symbol', 'stock_market', 'is_crypto', 'is_stock', 'is_other',
            'alpaca_data_available', 'is_trading_hours', 'is_weekend'
        ]
        
        for col in categorical_cols:
            if col in df_imputed.columns:
                # Per-symbol forward then backward fill (fillna(method=...) is deprecated)
                df_imputed[col] = df_imputed.groupby('symbol')[col].transform(lambda s: s.ffill().bfill())
        
        # 2. Price and volume data - symbol-specific imputation
        price_volume_cols = [
            'price', 'open', 'high', 'low', 'close', 'volume',
            'open_alpaca', 'high_alpaca', 'low_alpaca', 'close_alpaca', 'volume_alpaca',
            'bid_price', 'ask_price', 'bid_price_alpaca', 'ask_price_alpaca', 'price_alpaca'
        ]
        
        for col in price_volume_cols:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                print(f"Imputing {col} with symbol-specific context...")
                df_imputed[col] = self._impute_with_symbol_context(
                    df_imputed, col, self.symbol_profiles
                )
        
        # 3. Technical indicators - symbol-specific imputation
        tech_indicators = [
            'rsi', 'macd', 'macd_signal', 'macd_histogram', 'atr', 'bb_position',
            'stoch_k', 'stoch_d', 'cci', 'roc_5', 'roc_10', 'mfi', 'rsi_macd_signal',
            'ema_convergence', 'true_range_pct'
        ]
        
        for col in tech_indicators:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                print(f"Imputing {col} with symbol-specific context...")
                df_imputed[col] = self._impute_with_symbol_context(
                    df_imputed, col, self.symbol_profiles
                )
        
        # 4. Volume/price change features - symbol-specific
        change_features = [
            'price_change_1', 'price_change_7', 'price_change_14', 'volume_ratio',
            'volatility_7', 'price_volume_trend', 'volatility_consistency'
        ]
        
        for col in change_features:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                df_imputed[col] = self._impute_with_symbol_context(
                    df_imputed, col, self.symbol_profiles
                )
        
        # 5. On-chain features (crypto only)
        onchain_features = [
            'total_fees', 'total_gas_used', 'avg_gas_price', 'tx_count_7d_change',
            'tx_count_sma_7', 'tx_volume_7d_change', 'tx_volume_sma_7',
            'gas_used_7d_change', 'gas_used_sma_7', 'gas_price_7d_change',
            'gas_price_sma_7', 'fees_7d_change', 'avg_tx_size'
        ]
        
        for col in onchain_features:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                # Only impute for crypto assets
                crypto_mask = df_imputed['is_crypto'] == 1
                non_crypto_mask = df_imputed['is_crypto'] != 1
                
                if crypto_mask.any():
                    crypto_data = df_imputed.loc[crypto_mask]
                    crypto_imputed = self._impute_with_symbol_context(
                        crypto_data, col, self.symbol_profiles
                    )
                    df_imputed.loc[crypto_mask, col] = crypto_imputed
                
                # Fill non-crypto with 0
                df_imputed.loc[non_crypto_mask, col] = df_imputed.loc[non_crypto_mask, col].fillna(0)
        
        # 6. Handle remaining columns with simple strategies
        remaining_strategies = {
            'quality_metrics': [
                'data_quality_score', 'core_features_completeness', 'technical_indicators_completeness',
                'onchain_features_completeness', 'price_data_completeness', 
                'overall_feature_completeness', 'data_completeness_score'
            ],
            'news_sentiment': [
                'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min',
                'news_sentiment_max', 'news_sentiment_range', 'news_match_score_mean',
                'news_match_score_max', 'news_mentions_count', 'news_articles_count',
                'news_highlights_count', 'news_activity_score', 'sentiment_score'
            ],
            'zero_fill': [
                'trade_count', 'trade_count_alpaca', 'bid_size', 'ask_size',
                'bid_size_alpaca', 'ask_size_alpaca', 'size', 'size_alpaca'
            ]
        }
        
        # Quality metrics - use median but add small variation
        for col in remaining_strategies['quality_metrics']:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                median_val = df_imputed[col].median()
                if pd.isna(median_val):
                    median_val = 0.5  # Default for quality metrics
                median_val = np.clip(median_val, 0, 1)
                
                # Add tiny symbol-specific variation
                for symbol in df_imputed['symbol'].unique():
                    mask = df_imputed['symbol'] == symbol
                    symbol_hash = _stable_hash(symbol + col) % 100 / 10000  # Very small variation
                    fill_val = np.clip(median_val + symbol_hash, 0, 1)
                    df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(fill_val)
        
        # News sentiment - neutral with symbol variation
        for col in remaining_strategies['news_sentiment']:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                if 'sentiment' in col.lower():
                    # Slight variation around neutral
                    for symbol in df_imputed['symbol'].unique():
                        mask = df_imputed['symbol'] == symbol
                        symbol_hash = (_stable_hash(symbol + col) % 200 / 1000) - 0.1  # -0.1 to +0.1
                        df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(symbol_hash)
                elif 'count' in col.lower():
                    df_imputed[col] = df_imputed[col].fillna(0)
                else:
                    median_val = df_imputed[col].median()
                    if pd.isna(median_val):
                        median_val = 0
                    df_imputed[col] = df_imputed[col].fillna(median_val)
        
        # Zero fill
        for col in remaining_strategies['zero_fill']:
            if col in df_imputed.columns:
                df_imputed[col] = df_imputed[col].fillna(0)
        
        # Handle any remaining columns
        remaining_numeric = df_imputed.select_dtypes(include=[np.number]).columns
        remaining_with_nulls = [col for col in remaining_numeric if df_imputed[col].isnull().any()]
        
        for col in remaining_with_nulls:
            if col not in ['id', 'id_alpaca', 'backup_id']:
                print(f"Imputing remaining column: {col}")
                df_imputed[col] = self._impute_with_symbol_context(
                    df_imputed, col, self.symbol_profiles
                )
        
        print("[INFO] Imputation complete with anti-homogenization measures")
        print(f"[INFO] Final null counts: {df_imputed.isnull().sum().sum()}")
        return df_imputed
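
# A tiny smoke test (illustrative only; the synthetic frame is ours, not project
# data): two symbols with missing closes should receive different fill values,
# demonstrating the anti-homogenization behavior claimed above.
def _demo_diversity():
    demo = pd.DataFrame({
        'symbol': ['AAA'] * 3 + ['BBB'] * 3,
        'interval_timestamp': [1, 2, 3, 1, 2, 3],
        'close': [10.0, np.nan, np.nan, 200.0, np.nan, np.nan],
    })
    out = ImprovedStockDataImputer().fit_transform(demo)
    print(out.groupby('symbol')['close'].mean())  # expect ~10 for AAA, ~200 for BBB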

# Usage function with validation
def impute_with_validation(file_path, output_path=None):
    """Impute data and validate no homogenization occurred."""
    
    try:
        print(f"[INFO] Loading data from: {file_path}")
        df = pd.read_parquet(file_path)
        print(f"[INFO] Loaded data shape: {df.shape}")
        print(f"[INFO] Initial null counts: {df.isnull().sum().sum()}")
    except Exception as e:
        print(f"[ERROR] Failed to load data: {e}")
        return None
    
    # Sample a few symbols for the homogenization check after imputation
    symbols_sample = df['symbol'].unique()[:5]
    print(f"[INFO] Processing {df['symbol'].nunique()} unique symbols")
    
    # Initialize and run imputer
    imputer = ImprovedStockDataImputer()
    df_imputed = imputer.fit_transform(df)
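
    # A minimal homogenization check (a sketch; the choice of 'close' and the
    # check itself are our assumptions, not from the original pipeline): verify
    # that imputed closes still differ across the sampled symbols instead of
    # collapsing to one shared fill value.
    if 'close' in df_imputed.columns and len(symbols_sample) > 1:
        sample_means = (
            df_imputed[df_imputed['symbol'].isin(symbols_sample)]
            .groupby('symbol')['close']
            .mean()
        )
        if sample_means.nunique() <= 1:
            print("[WARN] Imputed 'close' values look homogenized across sampled symbols")
        else:
            print(f"[INFO] Per-symbol 'close' means (sample): {sample_means.round(2).to_dict()}")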
    
    # Combine alpaca data with main data where available
    alpaca_combinations = [
        ('high', 'high_alpaca'),
        ('low', 'low_alpaca'),
        ('close', 'close_alpaca'),
        ('open', 'open_alpaca'),
        ('volume', 'volume_alpaca')
    ]
    
    for main_col, alpaca_col in alpaca_combinations:
        if main_col in df_imputed.columns and alpaca_col in df_imputed.columns:
            df_imputed[main_col] = df_imputed[main_col].combine_first(df_imputed[alpaca_col])
            print(f"[INFO] Combined {main_col} with {alpaca_col}")
    
    # Drop unwanted columns before saving
    drop_cols = [
        '_filename', '_original_format', 'alpaca_data_available',
        'ask_exchange', 'ask_exchange_alpaca',
        'bid_exchange', 'bid_exchange_alpaca',
        'conditions', 'conditions_alpaca', 'conditions_trade', 'conditions_trade_alpaca',
        'symbol_quote', 'symbol_quote_alpaca', 'symbol_trade', 'symbol_trade_alpaca',
        'tape', 'tape_alpaca', 'tape_trade', 'tape_trade_alpaca',
        'id', 'id_alpaca',
        'is_new_symbol', 'price', 'timestamp_dt',
        'alpaca_merge_timestamp', 'timestamp', 'timestamp_alpaca',
        'estimateCurrency', 'exchange', 'exchange_alpaca', 'exchange_company',
        'finnhubIndustry', 'headline',
        'sentiment_timestamp', 'logo',
        'ticker', 'stock_market',
        'weburl', 'latest_news_timestamp', 'day_of_week', 'feature_timestamp', 
        'interval_timestamp_dt', 'is_crypto', 'is_other', 'is_stock',
        'country', 'currency', 'datetime', 'ipo', 'name', 'period', 'phone', 
        'year', 'month', 'latest_news_timestamp_x', 'latest_news_timestamp_y'
    ]
    
    original_cols = len(df_imputed.columns)
    for col in drop_cols:
        if col in df_imputed.columns:
            df_imputed = df_imputed.drop(columns=col)
    
    print(f"[INFO] Dropped {original_cols - len(df_imputed.columns)} unwanted columns")
    
    # Reorder columns: 'symbol' first, 'interval_timestamp' second, rest follow
    cols = list(df_imputed.columns)
    if 'symbol' in cols and 'interval_timestamp' in cols:
        rest = [c for c in cols if c not in ['symbol', 'interval_timestamp']]
        df_imputed = df_imputed[['symbol', 'interval_timestamp'] + rest]
        print("[INFO] Reordered columns with symbol and interval_timestamp first")

    # Save results
    if output_path:
        # Clean up data types
        if 'backup_id' in df_imputed.columns:
            df_imputed['backup_id'] = df_imputed['backup_id'].astype(str)
        
        try:
            df_imputed.to_parquet(output_path, compression='snappy')
            print(f"[INFO] Successfully saved imputed data to: {output_path}")
        except Exception as e:
            print(f"[ERROR] Failed to save data: {e}")
            return None
    
    print(f"[INFO] Final dataset shape: {df_imputed.shape}")
    return df_imputed

# Example usage
def main():
    input_file = "data/merged/features/stocks_features.parquet"
    output_file = input_file

    print("[INFO] Starting stock data imputation process...")
    df_clean = impute_with_validation(input_file, output_file)
    
    if df_clean is not None:
        print(f"[INFO] Data imputation completed successfully!")
        print(f"[INFO] Final shape: {df_clean.shape}")
        print(f"[INFO] Remaining nulls: {df_clean.isnull().sum().sum()}")
        
        # Quick validation
        print("\n=== VALIDATION SUMMARY ===")
        print(f"Unique symbols: {df_clean['symbol'].nunique()}")
        if 'close' in df_clean.columns:
            print(f"Price range: ${df_clean['close'].min():.2f} - ${df_clean['close'].max():.2f}")
        if 'volume' in df_clean.columns:
            print(f"Volume range: {df_clean['volume'].min():.0f} - {df_clean['volume'].max():.0f}")
    else:
        print("[ERROR] Failed to load or impute data.")

if __name__ == "__main__":
    main()