Spaces:

maaroufabousaleh
/

advisorai-data-enhanced

Sleeping

advisorai-data-enhanced / src /merge /test_enhanced_null_handling.py

Maaroufabousaleh

c49b21b about 1 month ago

9.78 kB

	#!/usr/bin/env python3
	"""
	Test script for the enhanced symbol-first null handling strategy
	"""

	import pandas as pd
	import numpy as np
	import sys
	from pathlib import Path
	import json

	# Add this file's grandparent directory (Path(__file__).parent.parent) to the import path
	sys.path.append(str(Path(__file__).parent.parent))

	from final_null_handler import FinalNullValueHandler

	def _crypto_record(rng, symbol, i, ts):
	    """Build one synthetic crypto row; nullable fields are randomly replaced by NaN."""
	    base_price = 50000 if symbol == 'bitcoin' else 3000
	    price_trend = i * 10  # Upward trend
	    price = base_price + price_trend + rng.normal(0, 500)

	    # In every "value if rng.random() > p else np.nan" entry the uniform draw
	    # happens first and decides whether the value expression is evaluated at all.
	    return {
	        'symbol': symbol,
	        'interval_timestamp': ts,
	        'price': price if rng.random() > 0.2 else np.nan,  # 20% nulls
	        'volume': price * 1000 + rng.normal(0, 100000) if rng.random() > 0.15 else np.nan,
	        'marketcap': price * 19000000 if rng.random() > 0.3 else np.nan,
	        'dominance': (0.4 if symbol == 'bitcoin' else 0.15) + rng.normal(0, 0.02) if rng.random() > 0.25 else np.nan,
	        'rank': 1 if symbol == 'bitcoin' else 2,
	        'performance.day': rng.normal(0, 2) if rng.random() > 0.2 else np.nan,
	        'performance.week': rng.normal(0, 5) if rng.random() > 0.3 else np.nan,
	        'exchangePrices.binance': price * 1.001 if rng.random() > 0.4 else np.nan,
	        'exchangePrices.coinbase': price * 0.999 if rng.random() > 0.4 else np.nan,
	        'rsi': 50 + rng.normal(0, 10) if rng.random() > 0.2 else np.nan,
	        'macd': rng.normal(0, 1) if rng.random() > 0.25 else np.nan,
	        'transaction_count': 1000 + i * 5 + rng.normal(0, 100) if rng.random() > 0.3 else np.nan,
	        'stable': False,
	    }


	def _stock_record(rng, symbol, i, ts):
	    """Build one synthetic stock row; nullable fields are randomly replaced by NaN."""
	    base_price = 150 if symbol == 'AAPL' else 2800
	    price_trend = i * 0.5  # Modest upward trend
	    price = base_price + price_trend + rng.normal(0, 5)

	    return {
	        'symbol': symbol,
	        'interval_timestamp': ts,
	        'close': price if rng.random() > 0.2 else np.nan,
	        'open': price * 0.995 if rng.random() > 0.2 else np.nan,
	        'high': price * 1.02 if rng.random() > 0.15 else np.nan,
	        'low': price * 0.98 if rng.random() > 0.15 else np.nan,
	        'volume': 1000000 + rng.normal(0, 100000) if rng.random() > 0.2 else np.nan,
	        'prev_close': price * 0.99 if rng.random() > 0.25 else np.nan,
	        'marketCapitalization': price * 15000000000 if rng.random() > 0.3 else np.nan,
	        'shareOutstanding': 15000000000 if rng.random() > 0.1 else np.nan,
	        'rsi': 50 + rng.normal(0, 15) if rng.random() > 0.2 else np.nan,
	        'macd': rng.normal(0, 0.5) if rng.random() > 0.25 else np.nan,
	        'news_sentiment_mean_x': 0.5 + rng.normal(0, 0.2) if rng.random() > 0.4 else np.nan,
	        'buy': rng.randint(3, 8) if rng.random() > 0.3 else np.nan,
	        'hold': rng.randint(8, 15) if rng.random() > 0.3 else np.nan,
	        'sell': rng.randint(1, 4) if rng.random() > 0.3 else np.nan,
	    }


	def create_realistic_test_data(records_per_symbol: int = 100, seed=None) -> pd.DataFrame:
	    """Create synthetic crypto and stock rows with a price trend and random nulls.

	    Rows are generated for 'bitcoin', 'ethereum', 'AAPL' and 'GOOGL' (in that
	    order) at hourly timestamps starting 2025-07-01. Crypto and stock rows
	    carry different field sets, so the combined frame holds the union of both
	    and each row is NaN in the columns that belong to the other asset type.

	    Args:
	        records_per_symbol: Number of hourly rows generated per symbol.
	        seed: Optional integer seed. When given, a private
	            ``np.random.RandomState(seed)`` is used so the output is
	            reproducible; when ``None`` the global ``np.random`` state is
	            used, exactly as before.

	    Returns:
	        A DataFrame with ``4 * records_per_symbol`` rows.

	    Raises:
	        ValueError: If ``records_per_symbol`` is negative.
	    """
	    if records_per_symbol < 0:
	        raise ValueError("records_per_symbol must be non-negative")

	    # The np.random module and RandomState expose the same normal/random/randint
	    # calls, so either can serve as the generator for the record builders.
	    rng = np.random if seed is None else np.random.RandomState(seed)

	    # A Timedelta frequency avoids the deprecated uppercase 'H' alias.
	    timestamps = pd.date_range(
	        start='2025-07-01', periods=records_per_symbol, freq=pd.Timedelta(hours=1)
	    )
	    # Epoch milliseconds via Timedelta division: the result does not depend on
	    # the resolution the index happens to store its values in.
	    timestamp_ms = (
	        (timestamps - pd.Timestamp('1970-01-01')) // pd.Timedelta(milliseconds=1)
	    ).tolist()

	    symbols = ['bitcoin', 'ethereum', 'AAPL', 'GOOGL']
	    crypto_symbols = ('bitcoin', 'ethereum')
	    data = []

	    for symbol in symbols:
	        build_record = _crypto_record if symbol in crypto_symbols else _stock_record
	        for i, ts in enumerate(timestamp_ms):
	            data.append(build_record(rng, symbol, i, ts))

	    return pd.DataFrame(data)

	def test_symbol_first_strategy():
	    """Run the null handler over generated data and report how many nulls it fills.

	    Builds the test frame with ``create_realistic_test_data()``, passes the
	    crypto rows to ``FinalNullValueHandler.process_crypto_features`` and the
	    stock rows to ``process_stock_features``, writes the results back into the
	    frame, and prints null counts before and after (overall, per asset type
	    and per symbol) plus a few quality checks.

	    Returns:
	        True when no nulls and no infinite values remain and every processed
	        asset type ended with fewer nulls than it started with; False otherwise.
	    """
	    banner = "=" * 70
	    print(banner)
	    print("TESTING ENHANCED SYMBOL-FIRST NULL HANDLING STRATEGY")
	    print(banner)

	    # Create realistic test data
	    print("Creating realistic test data with temporal patterns...")
	    df = create_realistic_test_data()
	    # Snapshot of the column dtypes, compared against after processing.
	    dtypes_before = df.dtypes.to_dict()

	    print(f"Created dataset with {len(df)} rows and {len(df.columns)} columns")
	    print(f"Symbols: {df['symbol'].unique()}")
	    interval_times = pd.to_datetime(df['interval_timestamp'], unit='ms')
	    print(f"Date range: {interval_times.min()} to {interval_times.max()}")

	    # Analyze null patterns before processing
	    print("\nNULL ANALYSIS BEFORE PROCESSING:")
	    total_nulls_before = df.isnull().sum().sum()
	    print(f"Total nulls: {total_nulls_before}")

	    symbol_nulls_before = {}
	    for symbol in df['symbol'].unique():
	        symbol_data = df[df['symbol'] == symbol]
	        symbol_nulls = symbol_data.isnull().sum().sum()
	        symbol_nulls_before[symbol] = symbol_nulls
	        print(f" {symbol}: {symbol_nulls} nulls ({symbol_nulls/len(symbol_data)/len(df.columns)*100:.1f}% of symbol data)")

	    # Test the enhanced handler
	    print("\nTESTING ENHANCED NULL HANDLER...")
	    handler = FinalNullValueHandler()

	    # Separate crypto and stock data for targeted processing
	    asset_groups = [
	        ('crypto', ['bitcoin', 'ethereum'], handler.process_crypto_features),
	        ('stock', ['AAPL', 'GOOGL'], handler.process_stock_features),
	    ]
	    # Both row masks are taken from the untouched frame, before any write-back.
	    masks = {
	        asset_type: df['symbol'].isin(asset_symbols)
	        for asset_type, asset_symbols, _ in asset_groups
	    }

	    results = {}
	    for asset_type, asset_symbols, process in asset_groups:
	        mask = masks[asset_type]
	        if not mask.any():
	            continue

	        print(f"\nProcessing {asset_type} data ({mask.sum()} rows)...")
	        subset = df[mask].copy()
	        # Counted before the handler runs, so the figure stays correct even if
	        # the handler modifies the frame it is given in place.
	        nulls_before = subset.isnull().sum().sum()
	        processed = process(subset)
	        df.loc[mask] = processed

	        results[asset_type] = {
	            'nulls_before': nulls_before,
	            'nulls_after': processed.isnull().sum().sum(),
	            'symbols': asset_symbols,
	        }

	    # Analyze results
	    print("\nRESULTS ANALYSIS:")
	    total_nulls_after = df.isnull().sum().sum()
	    print(f"Total nulls after: {total_nulls_after} (reduced by {total_nulls_before - total_nulls_after})")

	    for asset_type, result in results.items():
	        nulls_filled = result['nulls_before'] - result['nulls_after']
	        fill_rate = (nulls_filled / result['nulls_before'] * 100) if result['nulls_before'] > 0 else 0
	        print(f" {asset_type.upper()}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

	    # Symbol-level analysis
	    print("\nSYMBOL-LEVEL ANALYSIS:")
	    for symbol in df['symbol'].unique():
	        symbol_data = df[df['symbol'] == symbol]
	        nulls_after = symbol_data.isnull().sum().sum()
	        nulls_filled = symbol_nulls_before[symbol] - nulls_after
	        fill_rate = (nulls_filled / symbol_nulls_before[symbol] * 100) if symbol_nulls_before[symbol] > 0 else 0
	        print(f" {symbol}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)")

	    # Quality checks
	    print("\nQUALITY CHECKS:")
	    infinite_values = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
	    print(f" Infinite values: {infinite_values}")
	    # Compare per-column dtypes with the pre-processing snapshot; comparing
	    # column counts against a regenerated dataset says nothing about dtypes.
	    dtypes_preserved = df.dtypes.to_dict() == dtypes_before
	    print(f" Data types preserved: {dtypes_preserved}")

	    # Test temporal interpolation effectiveness
	    print("\nTEMPORAL INTERPOLATION TEST:")
	    for symbol in df['symbol'].unique():
	        symbol_data = df[df['symbol'] == symbol].sort_values('interval_timestamp')
	        if 'price' in symbol_data.columns:
	            known_prices = symbol_data['price'].dropna()
	            if len(known_prices) >= 2:
	                # Mean absolute step between consecutive non-null prices
	                price_diff = known_prices.diff().abs().mean()
	                print(f" {symbol}: Average price change = {price_diff:.2f} (reasonable interpolation)")

	    # Overall success assessment
	    success = (total_nulls_after == 0 and
	               infinite_values == 0 and
	               all(result['nulls_after'] < result['nulls_before'] for result in results.values()))

	    if success:
	        print("\n✅ ENHANCED SYMBOL-FIRST STRATEGY TEST PASSED!")
	        print(" - All nulls handled successfully")
	        print(" - No infinite values introduced")
	        print(" - Symbol-specific patterns preserved")
	        print(" - Temporal interpolation working")
	        return True
	    else:
	        print("\n❌ Test failed - review results above")
	        return False

	def main():
	    """Run the strategy test and map its outcome to an exit code.

	    Returns:
	        0 when ``test_symbol_first_strategy()`` reports success, 1 when it
	        reports failure or raises; in the latter case the error message and
	        traceback are printed.
	    """
	    try:
	        if test_symbol_first_strategy():
	            return 0
	        return 1
	    except Exception as exc:
	        print(f"❌ Test failed with error: {exc}")
	        import traceback
	        traceback.print_exc()
	        return 1

	# Script entry point: the value returned by main() becomes the process exit status.
	if __name__ == "__main__":
	    sys.exit(main())