|
import pandas as pd |
|
import numpy as np |
|
from sklearn.impute import KNNImputer |
|
from sklearn.preprocessing import StandardScaler |
|
import warnings |
|
warnings.filterwarnings('ignore') |
|
|
|
class CryptoDataImputerFixed: |
|
""" |
|
Specialized imputation for cryptocurrency data that preserves unique |
|
characteristics of different crypto assets and prevents homogenization. |
|
""" |
|
|
|
def __init__(self, preserve_crypto_diversity=True): |
|
self.preserve_crypto_diversity = preserve_crypto_diversity |
|
self.crypto_profiles = {} |
|
self.scalers = {} |
|
|
|
def _create_crypto_profiles(self, df): |
|
"""Create profiles for each cryptocurrency to guide imputation.""" |
|
profiles = {} |
|
|
|
for symbol in df['symbol'].unique(): |
|
symbol_data = df[df['symbol'] == symbol] |
|
|
|
|
|
|
|
stable_mode = symbol_data['stable'].mode() if 'stable' in symbol_data.columns else pd.Series() |
|
is_stablecoin = stable_mode.iloc[0] if not stable_mode.empty else False |
|
network_mode = symbol_data['blockchain_network'].mode() if 'blockchain_network' in symbol_data.columns else pd.Series() |
|
blockchain_network = network_mode.iloc[0] if not network_mode.empty else None |
|
|
|
profile = { |
|
'symbol': symbol, |
|
'price_level': symbol_data['price'].median() if 'price' in symbol_data.columns else None, |
|
'price_volatility': symbol_data['price'].std() if 'price' in symbol_data.columns else None, |
|
'volume_level': symbol_data['volume'].median() if 'volume' in symbol_data.columns else None, |
|
'marketcap_level': symbol_data['marketcap'].median() if 'marketcap' in symbol_data.columns else None, |
|
'dominance_level': symbol_data['dominance'].median() if 'dominance' in symbol_data.columns else None, |
|
'rank': symbol_data['rank'].median() if 'rank' in symbol_data.columns else None, |
|
'is_stablecoin': is_stablecoin, |
|
'typical_rsi': symbol_data['rsi'].median() if 'rsi' in symbol_data.columns else None, |
|
'blockchain_network': blockchain_network, |
|
'has_onchain_data': symbol_data['transaction_count'].notna().any() if 'transaction_count' in symbol_data.columns else False, |
|
'exchange_coverage': len([col for col in symbol_data.columns if col.startswith('symbols.') and symbol_data[col].notna().any()]), |
|
'data_availability': len(symbol_data) / len(df) if len(df) > 0 else 0 |
|
} |
|
|
|
profiles[symbol] = profile |
|
|
|
return profiles |
|
|
|
    def _impute_with_crypto_context(self, df, column, crypto_profiles):
        """Impute values using crypto-specific context to prevent homogenization.

        For each symbol that has nulls in ``column``, picks a fill strategy
        based on the column family (price/volume/marketcap/dominance/
        oscillators/MACD/performance/on-chain/exchange-price/other). Most
        fallback values mix a per-symbol profile statistic with a
        hash-derived offset so different assets get different fills.

        NOTE(review): ``hash()`` on strings is salted per process
        (PYTHONHASHSEED), so these fills are deterministic within a run but
        not across runs — confirm this is acceptable for reproducibility.

        Args:
            df: full DataFrame (must contain a 'symbol' column).
            column: name of the single column to impute.
            crypto_profiles: dict from ``_create_crypto_profiles``.

        Returns:
            The imputed column as a Series aligned with ``df``.
        """
        # Work on a copy; only the requested column is returned.
        df_result = df.copy()

        for symbol in df['symbol'].unique():
            symbol_mask = df['symbol'] == symbol
            symbol_data = df.loc[symbol_mask, column]

            # Nothing to fill for this symbol.
            if symbol_data.isnull().sum() == 0:
                continue

            profile = crypto_profiles.get(symbol, {})
            is_stablecoin = profile.get('is_stablecoin', False)
            rank = profile.get('rank', 999)  # unknown assets treated as low rank

            # --- OHLC / price columns -------------------------------------
            if column in ['price', 'open', 'high', 'low', 'close']:
                if is_stablecoin:
                    # Stablecoins: anchor near $1 with a tiny symbol-specific offset.
                    base_price = 1.0
                    symbol_hash = hash(symbol + column) % 1000 / 100000
                    adjusted_price = base_price + symbol_hash
                else:
                    # Prefer linear interpolation within the symbol's own series.
                    interpolated = symbol_data.interpolate(method='linear', limit_direction='both')
                    if interpolated.isnull().any() and profile.get('price_level'):
                        # Still gaps: synthesize around the profile median price,
                        # with more noise for lower-ranked (more volatile) assets.
                        base_price = profile['price_level']
                        volatility = profile.get('price_volatility', base_price * 0.05)
                        symbol_hash = hash(symbol) % 1000 / 1000
                        volatility_multiplier = 1 + (rank / 100)
                        noise_factor = (symbol_hash - 0.5) * 0.2 * volatility_multiplier
                        adjusted_price = base_price * (1 + noise_factor)
                    else:
                        # Either fully interpolated, or no price level to fall
                        # back on (remaining nulls are left for later passes).
                        adjusted_price = interpolated
                df_result.loc[symbol_mask, column] = symbol_data.fillna(adjusted_price)

            # --- volume columns -------------------------------------------
            elif column in ['volume', 'volume_alpaca']:
                # NOTE(review): fillna(method=...) is deprecated in pandas 2.x
                # (removed in 3.0) — equivalent to .ffill().bfill().
                filled = symbol_data.fillna(method='ffill').fillna(method='bfill')
                if filled.isnull().any():
                    base_volume = profile.get('volume_level', 1000000)
                    # Larger multiplier bands for higher-ranked assets.
                    if rank and rank <= 10:
                        volume_multiplier = 5 + (hash(symbol + column) % 1000 / 200)
                    elif rank and rank <= 50:
                        volume_multiplier = 1 + (hash(symbol + column) % 1000 / 500)
                    else:
                        volume_multiplier = 0.1 + (hash(symbol + column) % 1000 / 1000)
                    adjusted_volume = base_volume * volume_multiplier
                    filled = filled.fillna(adjusted_volume)
                df_result.loc[symbol_mask, column] = filled

            # --- market cap -----------------------------------------------
            elif column in ['marketcap']:
                if profile.get('marketcap_level'):
                    baseline = profile['marketcap_level']
                else:
                    # Order-of-magnitude guess from rank tier.
                    if rank and rank <= 10:
                        baseline = 10_000_000_000
                    elif rank and rank <= 50:
                        baseline = 1_000_000_000
                    elif rank and rank <= 100:
                        baseline = 100_000_000
                    else:
                        baseline = 10_000_000
                # Scale by 0.5x-1.5x so assets in a tier stay distinct.
                symbol_hash = hash(symbol + column) % 1000 / 1000
                baseline *= (0.5 + symbol_hash)
                df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline)

            # --- market dominance -----------------------------------------
            elif column in ['dominance']:
                if rank and rank <= 5:
                    # Top-5: hand-tuned ranges for BTC/ETH, small for the rest.
                    symbol_hash = hash(symbol + column) % 1000 / 1000
                    if symbol.upper() == 'BTC':
                        baseline = 0.4 + (symbol_hash * 0.2)
                    elif symbol.upper() == 'ETH':
                        baseline = 0.15 + (symbol_hash * 0.1)
                    else:
                        baseline = 0.01 + (symbol_hash * 0.05)
                else:
                    baseline = 0.001 + (hash(symbol + column) % 1000 / 100000)
                df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline)

            # --- bounded oscillators (0-100 range) ------------------------
            elif column in ['rsi', 'stoch_k', 'stoch_d']:
                symbol_median = symbol_data.median()
                if pd.isna(symbol_median):
                    symbol_hash = hash(symbol + column) % 1000 / 1000
                    if column == 'rsi':
                        # Top-rank assets get a narrower 20-80 band.
                        if rank and rank <= 10:
                            baseline = 20 + (symbol_hash * 60)
                        else:
                            baseline = 10 + (symbol_hash * 80)
                    else:
                        baseline = 10 + (symbol_hash * 80)
                else:
                    baseline = symbol_median
                df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline)

            # --- MACD family (price-scale dependent) ----------------------
            elif column in ['macd', 'macd_signal', 'macd_histogram']:
                symbol_median = symbol_data.median()
                if pd.isna(symbol_median):
                    price_level = profile.get('price_level', 1)
                    # symbol_hash in [-1, 1); doubled spread for rank > 50.
                    symbol_hash = hash(symbol + column) % 2000 / 1000 - 1
                    volatility_factor = 2 if rank and rank > 50 else 1
                    baseline = (price_level * 0.01 * volatility_factor) * symbol_hash
                else:
                    baseline = symbol_median
                df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline)

            # --- performance.* return columns -----------------------------
            elif column.startswith('performance.'):
                symbol_median = symbol_data.median()
                if pd.isna(symbol_median):
                    symbol_hash = hash(symbol + column) % 2000 / 1000 - 1
                    # Magnitude scales with the look-back window in the name.
                    if 'year' in column:
                        baseline = symbol_hash * 5
                    elif 'month' in column:
                        baseline = symbol_hash * 2
                    elif 'week' in column:
                        baseline = symbol_hash * 0.5
                    elif 'day' in column:
                        baseline = symbol_hash * 0.2
                    else:
                        baseline = symbol_hash * 0.05
                    # Lower-ranked assets assumed more volatile.
                    if rank and rank > 50:
                        baseline *= 2
                else:
                    baseline = symbol_median
                df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline)

            # --- on-chain metrics -----------------------------------------
            elif column.startswith('tx_') or column.startswith('gas_') or column in [
                'transaction_volume', 'transaction_count', 'total_fees', 'total_gas_used',
                'avg_gas_price', 'avg_tx_size', 'fees_7d_change', 'gas_used_7d_change', 'gas_price_7d_change'
            ] or '_7d_change' in column:
                network = profile.get('blockchain_network', 'unknown')
                if '7d_change' in column:
                    # Weekly-change metrics: small signed noise per symbol.
                    symbol_hash = hash(symbol + column) % 2000 / 1000 - 1
                    if 'fees' in column.lower():
                        baseline = symbol_hash * 0.5
                    elif 'gas' in column.lower():
                        baseline = symbol_hash * 0.3
                    else:
                        baseline = symbol_hash * 0.4
                    if rank and rank > 100:
                        baseline *= 2
                elif network in ['ethereum', 'bitcoin', 'polygon', 'bsc', 'avalanche']:
                    symbol_median = symbol_data.median()
                    if pd.isna(symbol_median):
                        # Network-typical magnitudes (tx counts, gas, fees).
                        symbol_hash = hash(symbol + column) % 1000 / 1000
                        if 'count' in column.lower():
                            if network == 'ethereum':
                                baseline = 1000000 * (1 + symbol_hash)
                            elif network == 'bitcoin':
                                baseline = 300000 * (1 + symbol_hash)
                            else:
                                baseline = 500000 * (1 + symbol_hash)
                        elif 'gas' in column.lower():
                            if network == 'ethereum':
                                baseline = 50 * (1 + symbol_hash)
                            else:
                                baseline = 5 * (1 + symbol_hash)
                        elif 'fee' in column.lower():
                            baseline = 1000000 * (1 + symbol_hash)
                        else:
                            baseline = symbol_hash * 1000
                    else:
                        baseline = symbol_median
                else:
                    # Unknown network.
                    if '7d_change' in column:
                        # NOTE(review): dead branch — '7d_change' columns are
                        # always caught by the first condition above, so this
                        # `pass` can never leave `baseline` unset here.
                        pass
                    else:
                        baseline = 0
                df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline)

            # --- per-exchange price / symbol columns ----------------------
            elif column.startswith('exchangePrices.') or column.startswith('symbols.'):
                exchange = column.split('.')[1] if '.' in column else 'unknown'
                if column.startswith('exchangePrices.'):
                    # Exchange price = main price plus a small exchange spread.
                    main_price = profile.get('price_level', 100)
                    if main_price and not is_stablecoin:
                        exchange_hash = hash(symbol + exchange) % 200 / 10000
                        baseline = main_price * (1 + exchange_hash)
                    else:
                        baseline = main_price or 1
                else:
                    # symbols.* are ticker strings, not numeric — skip.
                    continue
                df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline)

            # --- everything else: median with per-symbol variation --------
            else:
                symbol_median = symbol_data.median()
                if pd.isna(symbol_median):
                    overall_median = df[column].median()
                    if pd.isna(overall_median):
                        overall_median = 0
                    # +/-20% variation around the global median (doubled for
                    # rank > 100) so symbols do not collapse to one value.
                    symbol_hash = hash(symbol + column) % 2000 / 1000 - 1
                    volatility_factor = 2 if rank and rank > 100 else 1
                    variation = overall_median * 0.2 * symbol_hash * volatility_factor
                    baseline = overall_median + variation
                else:
                    baseline = symbol_median
                df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline)

        return df_result[column]
|
|
|
def _force_fill_stubborn_nulls(self, df): |
|
"""Aggressively fill any remaining nulls with appropriate defaults.""" |
|
|
|
|
|
stubborn_cols = ['fees_7d_change', 'gas_used_7d_change', 'gas_price_7d_change'] |
|
|
|
for col in stubborn_cols: |
|
if col in df.columns: |
|
null_count_before = df[col].isnull().sum() |
|
if null_count_before > 0: |
|
|
|
df[col] = df.groupby('symbol')[col].transform(lambda x: x.fillna(x.median())) |
|
|
|
|
|
still_null = df[col].isnull() |
|
if still_null.any(): |
|
for symbol in df[still_null]['symbol'].unique(): |
|
symbol_mask = (df['symbol'] == symbol) & df[col].isnull() |
|
if symbol_mask.any(): |
|
|
|
symbol_hash = hash(symbol + col) % 2000 / 1000 - 1 |
|
|
|
if 'fees' in col.lower(): |
|
fill_value = symbol_hash * 0.3 |
|
elif 'gas' in col.lower(): |
|
fill_value = symbol_hash * 0.25 |
|
else: |
|
fill_value = symbol_hash * 0.2 |
|
|
|
df.loc[symbol_mask, col] = fill_value |
|
|
|
|
|
remaining_nulls = df[col].isnull().sum() |
|
if remaining_nulls > 0: |
|
print(f"[WARNING] Nuclear fill: {remaining_nulls} nulls in {col} filled with 0") |
|
df[col] = df[col].fillna(0) |
|
|
|
return df |
|
|
|
def _nuclear_null_elimination(self, df): |
|
"""Final pass to eliminate ALL nulls with extreme prejudice.""" |
|
print("[INFO] Performing nuclear null elimination...") |
|
|
|
|
|
numeric_cols = df.select_dtypes(include=[np.number]).columns |
|
|
|
for col in numeric_cols: |
|
null_count = df[col].isnull().sum() |
|
if null_count > 0: |
|
print(f"[NUCLEAR] Eliminating {null_count} nulls in {col}") |
|
|
|
|
|
if '7d_change' in col or 'change' in col.lower(): |
|
|
|
for symbol in df['symbol'].unique(): |
|
symbol_mask = (df['symbol'] == symbol) & df[col].isnull() |
|
if symbol_mask.any(): |
|
symbol_hash = hash(symbol + col) % 2000 / 1000 - 1 |
|
if 'fees' in col.lower(): |
|
fill_value = symbol_hash * 0.3 |
|
elif 'gas' in col.lower(): |
|
fill_value = symbol_hash * 0.25 |
|
else: |
|
fill_value = symbol_hash * 0.2 |
|
df.loc[symbol_mask, col] = fill_value |
|
|
|
elif 'timestamp' in col.lower(): |
|
|
|
df[col] = df[col].fillna(method='ffill').fillna(method='bfill').fillna(0) |
|
|
|
elif col in ['price', 'open', 'high', 'low', 'close']: |
|
|
|
for symbol in df['symbol'].unique(): |
|
symbol_mask = (df['symbol'] == symbol) & df[col].isnull() |
|
if symbol_mask.any(): |
|
symbol_price = df[df['symbol'] == symbol][col].median() |
|
if pd.isna(symbol_price): |
|
symbol_hash = hash(symbol + col) % 10000 / 100 |
|
symbol_price = 1 + symbol_hash |
|
df.loc[symbol_mask, col] = symbol_price |
|
|
|
else: |
|
|
|
median_val = df[col].median() |
|
if pd.isna(median_val): |
|
median_val = 0 |
|
df[col] = df[col].fillna(median_val) |
|
|
|
|
|
remaining_nulls = df[col].isnull().sum() |
|
if remaining_nulls > 0: |
|
print(f"[NUCLEAR] Force filling {remaining_nulls} remaining nulls in {col} with 0") |
|
df[col] = df[col].fillna(0) |
|
|
|
return df |
|
|
|
    def _enhanced_sentiment_imputation(self, df):
        """Enhanced sentiment imputation that creates realistic, diverse sentiment values.

        Fills the VADER-style columns ('sentiment_score', 'neg', 'neu', 'pos')
        per symbol (ffill/bfill first, then asset-class-specific hash-derived
        defaults), re-normalizes neg/neu/pos to sum to 1 row by row, and then
        fills the remaining social-sentiment feature columns. Mutates ``df``
        in place and returns it.

        NOTE(review): this method is not called from fit_transform (which
        inlines similar logic for `sentiment_features`) — confirm whether it
        is invoked elsewhere or is dead code.
        """
        print(f"[INFO] Starting enhanced sentiment imputation...")

        core_sentiment_cols = ['sentiment_score', 'neg', 'neu', 'pos']

        # Report how much work there is per core column.
        for col in core_sentiment_cols:
            if col in df.columns:
                null_count_before = df[col].isnull().sum()
                if null_count_before > 0:
                    print(f"[INFO] Processing {col}: {null_count_before} nulls to fill")

        for col in core_sentiment_cols:
            if col in df.columns and df[col].isnull().any():
                print(f"Enhanced imputation for {col}...")
                for symbol in df['symbol'].unique():
                    symbol_mask = df['symbol'] == symbol
                    symbol_sentiment = df.loc[symbol_mask, col]

                    if symbol_sentiment.isnull().any():
                        # NOTE(review): fillna(method=...) is deprecated in
                        # pandas 2.x (removed in 3.0) — same as .ffill().bfill().
                        filled = symbol_sentiment.fillna(method='ffill').fillna(method='bfill')

                        if filled.isnull().any():
                            # Whole symbol empty: synthesize a value from the
                            # asset class. hash() is process-salted, so these
                            # are stable within a run, not across runs.
                            symbol_hash = hash(symbol + col) % 10000 / 10000
                            symbol_upper = symbol.upper()

                            stablecoins = ['USDT', 'USDC', 'BUSD', 'DAI', 'TUSD', 'USDP']
                            major_cryptos = ['BTC', 'ETH', 'BNB', 'ADA', 'XRP', 'SOL', 'DOT', 'AVAX']

                            if col == 'sentiment_score':
                                # Stablecoins near neutral, majors slightly
                                # positive, the rest wide around zero.
                                if any(stable in symbol_upper for stable in stablecoins):
                                    fill_value = (symbol_hash - 0.5) * 0.1
                                elif any(major in symbol_upper for major in major_cryptos):
                                    fill_value = 0.1 + (symbol_hash - 0.5) * 0.4
                                else:
                                    fill_value = (symbol_hash - 0.5) * 0.6
                                fill_value = np.clip(fill_value, -1.0, 1.0)

                            elif col == 'neu':
                                # Neutral share: highest for stablecoins.
                                if any(stable in symbol_upper for stable in stablecoins):
                                    fill_value = 0.85 + symbol_hash * 0.1
                                elif any(major in symbol_upper for major in major_cryptos):
                                    fill_value = 0.65 + symbol_hash * 0.2
                                else:
                                    fill_value = 0.55 + symbol_hash * 0.3
                                fill_value = np.clip(fill_value, 0.0, 1.0)

                            elif col == 'pos':
                                # Positive share: majors lean higher.
                                if any(stable in symbol_upper for stable in stablecoins):
                                    fill_value = 0.05 + symbol_hash * 0.05
                                elif any(major in symbol_upper for major in major_cryptos):
                                    fill_value = 0.15 + symbol_hash * 0.15
                                else:
                                    fill_value = 0.10 + symbol_hash * 0.25
                                fill_value = np.clip(fill_value, 0.0, 1.0)

                            elif col == 'neg':
                                # Negative share: smaller coins lean higher.
                                if any(stable in symbol_upper for stable in stablecoins):
                                    fill_value = 0.05 + symbol_hash * 0.05
                                elif any(major in symbol_upper for major in major_cryptos):
                                    fill_value = 0.10 + symbol_hash * 0.10
                                else:
                                    fill_value = 0.15 + symbol_hash * 0.15
                                fill_value = np.clip(fill_value, 0.0, 1.0)

                            filled = filled.fillna(fill_value)

                        df.loc[symbol_mask, col] = filled

        # Re-normalize neg/neu/pos so each row sums to 1.
        # NOTE(review): this is a per-row Python loop — O(n) .at accesses;
        # a vectorized division would be much faster on large frames.
        if all(col in df.columns for col in ['neg', 'neu', 'pos']):
            print("Normalizing sentiment scores...")
            for idx in df.index:
                neg_val = df.at[idx, 'neg']
                neu_val = df.at[idx, 'neu']
                pos_val = df.at[idx, 'pos']

                current_sum = neg_val + neu_val + pos_val
                if current_sum > 0:
                    df.at[idx, 'neg'] = neg_val / current_sum
                    df.at[idx, 'neu'] = neu_val / current_sum
                    df.at[idx, 'pos'] = pos_val / current_sum
                else:
                    # Degenerate row (all zero, or NaN making the sum falsy):
                    # default to a mostly-neutral split.
                    df.at[idx, 'neg'] = 0.1
                    df.at[idx, 'neu'] = 0.8
                    df.at[idx, 'pos'] = 0.1

        # Derived/social sentiment features.
        other_sentiment_features = [
            'social_sentiment_mean', 'social_sentiment_std', 'social_sentiment_count',
            'social_confidence_mean', 'combined_sentiment', 'sentiment_agreement',
            'sentiment_change_1', 'sentiment_sma_7', 'sentiment_momentum'
        ]

        for col in other_sentiment_features:
            if col in df.columns and df[col].isnull().any():
                if 'sentiment' in col.lower() and 'count' not in col.lower():
                    # Small per-symbol value in [-0.1, 0.1).
                    for symbol in df['symbol'].unique():
                        mask = df['symbol'] == symbol
                        symbol_hash = (hash(symbol + col) % 200 / 1000) - 0.1
                        df.loc[mask, col] = df.loc[mask, col].fillna(symbol_hash)
                elif 'count' in col.lower():
                    # Missing counts mean "no posts observed".
                    df[col] = df[col].fillna(0)
                else:
                    median_val = df[col].median()
                    if pd.isna(median_val):
                        median_val = 0
                    df[col] = df[col].fillna(median_val)

        # Final report.
        print(f"[INFO] Enhanced sentiment imputation completed:")
        for col in core_sentiment_cols:
            if col in df.columns:
                null_count_after = df[col].isnull().sum()
                print(f"  {col}: {null_count_after} nulls remaining")

        return df
|
|
|
    def fit_transform(self, df):
        """Apply crypto-specific imputation with anti-homogenization measures.

        Pipeline: sort by symbol/time, build per-symbol profiles, fill
        categorical flags, then run context-aware imputation over market,
        exchange-price, performance, technical, change, on-chain, sentiment,
        quality and temporal feature groups, and finish with the "nuclear"
        pass that removes every remaining numeric null.

        Returns a new imputed DataFrame; ``df`` itself is not modified.
        """
        df_imputed = df.copy()
        # Time-ordered per symbol so ffill/bfill/interpolation are meaningful.
        df_imputed = df_imputed.sort_values(['symbol', 'interval_timestamp'])

        # Profiles drive all context-aware fills below.
        self.crypto_profiles = self._create_crypto_profiles(df_imputed)
        print(f"Created profiles for {len(self.crypto_profiles)} unique cryptocurrencies")

        # --- categorical / boolean flags ---------------------------------
        categorical_cols = [
            'symbol', 'cg_id', 'blockchain_network', 'stable', 'is_crypto', 'is_stock',
            'is_other', 'alpaca_data_available', 'is_trading_hours', 'is_weekend'
        ]

        for col in categorical_cols:
            if col in df_imputed.columns:
                if col in ['is_crypto']:
                    # This pipeline only sees crypto rows.
                    df_imputed[col] = df_imputed[col].fillna(1)
                elif col in ['is_stock', 'is_other']:
                    df_imputed[col] = df_imputed[col].fillna(0)
                elif col in ['stable']:
                    # Mark known stablecoin tickers True, everything else False.
                    stablecoin_symbols = ['USDT', 'USDC', 'BUSD', 'DAI', 'TUSD', 'USDP']
                    for symbol in stablecoin_symbols:
                        mask = df_imputed['symbol'].str.contains(symbol, case=False, na=False)
                        df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(True)
                    df_imputed[col] = df_imputed[col].fillna(False)
                else:
                    # NOTE(review): the ffill is per group, but the trailing
                    # bfill runs on the combined result, i.e. it can pull a
                    # value across symbol boundaries. Also fillna(method=...)
                    # is deprecated in pandas 2.x (removed in 3.0).
                    df_imputed[col] = df_imputed.groupby('symbol')[col].fillna(method='ffill').fillna(method='bfill')

        # Exchange ticker-string columns: propagate within each symbol.
        exchange_symbol_cols = [col for col in df_imputed.columns if col.startswith('symbols.')]
        for col in exchange_symbol_cols:
            if df_imputed[col].dtype == 'object':
                # NOTE(review): same cross-group bfill caveat as above.
                df_imputed[col] = df_imputed.groupby('symbol')[col].fillna(method='ffill').fillna(method='bfill')

        # --- core market data --------------------------------------------
        core_market_cols = [
            'price', 'marketcap', 'volume', 'dominance', 'rank',
            'open', 'high', 'low', 'close'
        ]

        for col in core_market_cols:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                print(f"Imputing {col} with crypto-specific context...")
                df_imputed[col] = self._impute_with_crypto_context(
                    df_imputed, col, self.crypto_profiles
                )

        # --- per-exchange prices -----------------------------------------
        exchange_price_cols = [col for col in df_imputed.columns if col.startswith('exchangePrices.')]
        for col in exchange_price_cols:
            if df_imputed[col].isnull().any():
                print(f"Imputing {col} with crypto-specific context...")
                df_imputed[col] = self._impute_with_crypto_context(
                    df_imputed, col, self.crypto_profiles
                )

        # --- performance / rank-diff columns -----------------------------
        performance_cols = [col for col in df_imputed.columns if col.startswith('performance.') or col.startswith('rankDiffs.')]
        for col in performance_cols:
            if df_imputed[col].isnull().any():
                print(f"Imputing {col} with crypto-specific context...")
                df_imputed[col] = self._impute_with_crypto_context(
                    df_imputed, col, self.crypto_profiles
                )

        # --- technical indicators ----------------------------------------
        tech_indicators = [
            'rsi', 'macd', 'macd_signal', 'macd_histogram', 'atr', 'bb_position',
            'stoch_k', 'stoch_d', 'cci', 'roc_5', 'roc_10', 'mfi', 'rsi_macd_signal',
            'ema_convergence', 'true_range_pct'
        ]

        for col in tech_indicators:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                print(f"Imputing {col} with crypto-specific context...")
                df_imputed[col] = self._impute_with_crypto_context(
                    df_imputed, col, self.crypto_profiles
                )

        # --- price/volume change features --------------------------------
        change_features = [
            'price_change_1', 'price_change_7', 'price_change_14', 'volume_ratio',
            'volatility_7', 'price_volume_trend', 'volatility_consistency'
        ]

        for col in change_features:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                df_imputed[col] = self._impute_with_crypto_context(
                    df_imputed, col, self.crypto_profiles
                )

        # --- on-chain features -------------------------------------------
        onchain_features = [
            'transaction_volume', 'total_fees', 'total_gas_used', 'avg_gas_price',
            'transaction_count', 'tx_count_7d_change', 'tx_count_sma_7',
            'tx_volume_7d_change', 'tx_volume_sma_7', 'gas_used_7d_change',
            'gas_used_sma_7', 'gas_price_7d_change', 'gas_price_sma_7',
            'fees_7d_change', 'avg_tx_size', 'tx_price_correlation'
        ]

        for col in onchain_features:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                print(f"Imputing {col} with crypto on-chain context...")
                df_imputed[col] = self._impute_with_crypto_context(
                    df_imputed, col, self.crypto_profiles
                )

        # Extra pass for the change columns that tend to survive the above.
        df_imputed = self._force_fill_stubborn_nulls(df_imputed)

        # --- sentiment features ------------------------------------------
        sentiment_features = [
            'social_sentiment_mean', 'social_sentiment_std', 'social_sentiment_count',
            'social_confidence_mean', 'combined_sentiment', 'sentiment_agreement',
            'sentiment_change_1', 'sentiment_sma_7', 'sentiment_momentum',
            'sentiment_score', 'neg', 'neu', 'pos'
        ]

        for col in sentiment_features:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                if 'sentiment' in col.lower() and 'count' not in col.lower():
                    # Small per-symbol value in [-0.1, 0.1); hash() is
                    # process-salted, so stable within a run only.
                    for symbol in df_imputed['symbol'].unique():
                        mask = df_imputed['symbol'] == symbol
                        symbol_hash = (hash(symbol + col) % 200 / 1000) - 0.1
                        df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(symbol_hash)
                elif 'count' in col.lower():
                    df_imputed[col] = df_imputed[col].fillna(0)
                else:
                    # NOTE(review): unlike _enhanced_sentiment_imputation,
                    # a NaN median here is passed straight to fillna (no-op).
                    median_val = df_imputed[col].median()
                    df_imputed[col] = df_imputed[col].fillna(median_val)

        # --- data-quality scores (bounded to [0, 1]) ---------------------
        quality_features = [
            'data_quality_score', 'core_features_completeness', 'technical_indicators_completeness',
            'onchain_features_completeness', 'price_data_completeness',
            'overall_feature_completeness', 'data_completeness_score'
        ]

        for col in quality_features:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                median_val = np.clip(df_imputed[col].median(), 0, 1)
                # Tiny per-symbol offset keeps scores from being identical.
                for symbol in df_imputed['symbol'].unique():
                    mask = df_imputed['symbol'] == symbol
                    symbol_hash = hash(symbol + col) % 100 / 10000
                    fill_val = np.clip(median_val + symbol_hash, 0, 1)
                    df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(fill_val)

        # --- temporal features: fixed mid-range defaults -----------------
        temporal_features = ['hour', 'day_of_week', 'is_weekend', 'is_trading_hours']
        for col in temporal_features:
            if col in df_imputed.columns and df_imputed[col].isnull().any():
                if col == 'hour':
                    df_imputed[col] = df_imputed[col].fillna(12)
                elif col == 'day_of_week':
                    df_imputed[col] = df_imputed[col].fillna(3)
                elif col == 'is_weekend':
                    df_imputed[col] = df_imputed[col].fillna(0)
                elif col == 'is_trading_hours':
                    df_imputed[col] = df_imputed[col].fillna(1)

        # --- catch-all for any other numeric column ----------------------
        remaining_numeric = df_imputed.select_dtypes(include=[np.number]).columns
        remaining_with_nulls = [col for col in remaining_numeric if df_imputed[col].isnull().any()]

        for col in remaining_with_nulls:
            # Skip identifiers and raw timestamps.
            if col not in ['id', 'id_alpaca', 'backup_id'] and not col.endswith('_timestamp'):
                print(f"Imputing remaining column {col}...")
                df_imputed[col] = self._impute_with_crypto_context(
                    df_imputed, col, self.crypto_profiles
                )

        # Guarantee zero numeric nulls on exit.
        df_imputed = self._nuclear_null_elimination(df_imputed)

        print("[INFO] Crypto imputation complete with anti-homogenization measures")
        return df_imputed
|
|
|
|
|
def impute_crypto_with_validation_fixed(file_path, output_path=None):
    """Impute crypto data and validate no homogenization occurred.

    Loads a parquet file, runs :class:`CryptoDataImputerFixed`, applies an
    emergency fallback fill for historically problematic on-chain change
    columns, backfills OHLCV gaps from the alpaca feed, drops auxiliary
    columns, spot-checks a few symbols for homogenized values, and
    optionally writes the result to ``output_path``.

    Args:
        file_path: path of the input parquet file.
        output_path: optional path to write the imputed parquet to.

    Returns:
        The imputed DataFrame, or None when the input file cannot be read.
    """
    try:
        df = pd.read_parquet(file_path)
    except Exception as e:
        print(f"[ERROR] Failed to load file: {e}")
        return None

    # Sample used later for the homogenization spot check.
    symbols_sample = df['symbol'].unique()[:5]

    imputer = CryptoDataImputerFixed()
    df_imputed = imputer.fit_transform(df)

    # Emergency pass: these columns are known to resist the main pipeline;
    # fill per symbol with hash-derived noise, then zero. (hash() is salted
    # per process, so these fills differ between runs.)
    problematic_cols = ['gas_used_7d_change', 'fees_7d_change', 'gas_price_7d_change']
    for col in problematic_cols:
        if col in df_imputed.columns:
            null_count = df_imputed[col].isnull().sum()
            if null_count > 0:
                print(f"[EMERGENCY] Still {null_count} nulls in {col} - applying emergency fix")

                for symbol in df_imputed['symbol'].unique():
                    symbol_mask = (df_imputed['symbol'] == symbol) & df_imputed[col].isnull()
                    if symbol_mask.any():
                        symbol_hash = hash(symbol + col) % 2000 / 1000 - 1
                        if 'fees' in col.lower():
                            fill_value = symbol_hash * 0.3
                        elif 'gas' in col.lower():
                            fill_value = symbol_hash * 0.25
                        else:
                            fill_value = symbol_hash * 0.2
                        df_imputed.loc[symbol_mask, col] = fill_value

                df_imputed[col] = df_imputed[col].fillna(0)
                print(f"[EMERGENCY] {col} nulls after emergency fix: {df_imputed[col].isnull().sum()}")

    # Prefer the primary feed; backfill gaps from the alpaca columns.
    price_cols = ['high', 'low', 'close', 'volume', 'open']
    for col in price_cols:
        alpaca_col = f"{col}_alpaca"
        if col in df_imputed.columns and alpaca_col in df_imputed.columns:
            df_imputed[col] = df_imputed[col].combine_first(df_imputed[alpaca_col])

    # Auxiliary/meta columns that must not reach the model.
    drop_cols = [
        '_filename', '_original_format', 'alpaca_data_available',
        'ask_exchange', 'ask_exchange_alpaca', 'bid_exchange', 'bid_exchange_alpaca',
        'conditions', 'conditions_alpaca', 'conditions_trade', 'conditions_trade_alpaca',
        'symbol_quote', 'symbol_quote_alpaca', 'symbol_trade', 'symbol_trade_alpaca',
        'tape', 'tape_alpaca', 'tape_trade', 'tape_trade_alpaca',
        'id', 'id_alpaca', 'is_new_symbol', 'timestamp_dt',
        'estimateCurrency', 'exchange', 'exchange_alpaca', 'exchange_company',
        'finnhubIndustry', 'logo', 'ticker', 'weburl', 'latest_news_timestamp', 'volume_price_momentum',
        'country', 'currency', 'ipo', 'name', 'period', 'phone', 'year', 'month', 'symbols.kraken',
        'datetime', 'headline', 'blockchain_network', 'symbols.cryptocom', 'symbols.bitmart', 'symbols.kucoin', 'symbols.okx',
        'symbols.coinbase', 'symbols.binance', 'symbols.mexc', 'symbols.bybit', 'symbols.bingx', 'symbols.huobi', 'symbols.bitget', 'symbols.gateio',
        'interval_timestamp_dt', 'interval_timestamp_alpaca', 'interval_timestamp_trade', 'feature_timestamp', 'alpaca_merge_timestamp', 'sentiment_timestamp',
        'hour', 'day_of_week', 'is_weekend', 'is_trading_hours', 'is_crypto', 'is_stock', 'is_other', 'gas_used_7d_change', 'fees_7d_change', 'gas_price_7d_change'
    ]

    # Also drop every alpaca shadow column (already merged above).
    alpaca_cols = [col for col in df_imputed.columns if col.endswith('_alpaca')]
    drop_cols.extend(alpaca_cols)

    # FIX: drop everything in a single call instead of one drop() per column
    # (each drop() copies the whole frame). dict.fromkeys dedupes while
    # preserving order; errors='ignore' skips names that are not present.
    df_imputed = df_imputed.drop(columns=list(dict.fromkeys(drop_cols)), errors='ignore')

    # Put the key columns first for readability.
    cols = list(df_imputed.columns)
    if 'symbol' in cols and 'interval_timestamp' in cols:
        rest = [c for c in cols if c not in ['symbol', 'interval_timestamp']]
        df_imputed = df_imputed[['symbol', 'interval_timestamp'] + rest]

    # Final safety net for the problematic columns (normally dropped above).
    for col in problematic_cols:
        if col in df_imputed.columns:
            null_count = df_imputed[col].isnull().sum()
            if null_count > 0:
                print(f"[FINAL CHECK] Still {null_count} nulls in {col} - final nuclear fill")
                df_imputed[col] = df_imputed[col].fillna(0)

    # Spot check: distinct per-symbol means indicate no homogenization.
    print("\n[VALIDATION] Checking for homogenization...")
    for symbol in symbols_sample:
        symbol_data = df_imputed[df_imputed['symbol'] == symbol]
        if len(symbol_data) > 0:
            price_mean = symbol_data['price'].mean() if 'price' in symbol_data.columns else 0
            volume_mean = symbol_data['volume'].mean() if 'volume' in symbol_data.columns else 0
            print(f"  {symbol}: Price={price_mean:.2f}, Volume={volume_mean:.0f}")

    if output_path:
        # Mixed-type id columns break parquet serialization; force string.
        if 'backup_id' in df_imputed.columns:
            df_imputed['backup_id'] = df_imputed['backup_id'].astype(str)

        try:
            df_imputed.to_parquet(output_path, compression='snappy')
            print(f"[INFO] Crypto data imputed and saved to: {output_path}")
        except Exception as e:
            print(f"[ERROR] Failed to save file: {e}")

    return df_imputed
|
|
|
|
|
def main():
    """Impute the crypto feature parquet in place and report final stats."""
    input_file = "data/merged/features/crypto_features.parquet"
    output_file = input_file

    df_clean = impute_crypto_with_validation_fixed(input_file, output_file)

    # Guard clause: bail out early on load/imputation failure.
    if df_clean is None:
        print("[ERROR] Failed to load or impute crypto data.")
        return

    print(f"\n[SUCCESS] Crypto data processing completed!")
    print(f"Final shape: {df_clean.shape}")
    print(f"Null values remaining: {df_clean.isnull().sum().sum()}")

    # Verify the historically problematic on-chain change columns.
    for col in ('gas_used_7d_change', 'fees_7d_change', 'gas_price_7d_change'):
        if col in df_clean.columns:
            nulls = df_clean[col].isnull().sum()
            print(f"[FINAL VERIFICATION] {col}: {nulls} nulls")
|
|
|
# Script entry point: run the full imputation pipeline when executed directly.
if __name__ == "__main__":
    main()
|
|