Spaces:

maaroufabousaleh
/

advisorai-data-enhanced

Sleeping

advisorai-data-enhanced / src /merge /test_null_filling_merge.py

Maaroufabousaleh

c49b21b about 1 month ago

6.96 kB

	#!/usr/bin/env python3
	"""
	Test script for null filling during merge operations
	"""

	import pandas as pd
	import numpy as np
	import os
	import sys
	from pathlib import Path

	# Add the merge directory to path
	sys.path.append(str(Path(__file__).parent))

	from merge_temp import fill_nulls_from_temp

	def create_test_data():
	"""Create test data with strategic null values"""

	# Create merged data with some null values
	merged_data = {
	'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH'],
	'interval_timestamp': [1640995200000, 1640995260000, 1640995200000, 1640995260000, 1640995200000],
	'price': [150.0, np.nan, 50000.0, np.nan, 4000.0], # AAPL and BTC have nulls
	'volume': [1000000, 1200000, np.nan, 800000, np.nan], # BTC and ETH have nulls
	'rsi': [65.0, np.nan, 70.0, 45.0, np.nan], # AAPL and ETH have nulls
	'macd': [1.5, 1.8, np.nan, -0.5, 2.1] # BTC has null
	}
	df_merged = pd.DataFrame(merged_data)

	# Create temp data that can fill some of the nulls
	temp_data = {
	'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH', 'GOOGL'],
	'interval_timestamp': [1640995200000, 1640995260000, 1640995200000, 1640995260000, 1640995200000, 1640995200000],
	'price': [149.5, 152.3, 49950.0, 51200.0, 3980.0, 2850.0], # Can fill AAPL and BTC nulls
	'volume': [950000, 1150000, 2000000, 780000, 500000, 400000], # Can fill BTC and ETH nulls
	'rsi': [64.0, 67.0, 69.5, 44.0, 55.0, 60.0], # Can fill AAPL and ETH nulls
	'macd': [1.4, 1.9, 15.2, -0.6, 2.0, 0.8], # Can fill BTC null
	'new_feature': [100, 200, 300, 400, 500, 600] # New feature not in merged
	}
	df_temp = pd.DataFrame(temp_data)

	return df_merged, df_temp

	def test_null_filling():
	"""Test the null filling functionality"""
	print("="*60)
	print("TESTING NULL FILLING DURING MERGE")
	print("="*60)

	# Create test data
	df_merged, df_temp = create_test_data()

	print("BEFORE NULL FILLING:")
	print(f"Merged data shape: {df_merged.shape}")
	print(f"Temp data shape: {df_temp.shape}")
	print(f"Nulls in merged data: {df_merged.isnull().sum().sum()}")
	print("\nNull values by column in merged data:")
	for col in df_merged.columns:
	null_count = df_merged[col].isnull().sum()
	if null_count > 0:
	print(f" {col}: {null_count} nulls")

	print(f"\nMerged data preview:")
	print(df_merged.to_string())
	print(f"\nTemp data preview:")
	print(df_temp.to_string())

	# Test the null filling function
	df_merged_copy = df_merged.copy()
	nulls_filled = fill_nulls_from_temp(df_merged_copy, df_temp)

	print(f"\nAFTER NULL FILLING:")
	print(f"Nulls filled: {nulls_filled}")
	print(f"Remaining nulls: {df_merged_copy.isnull().sum().sum()}")
	print("\nRemaining null values by column:")
	for col in df_merged_copy.columns:
	null_count = df_merged_copy[col].isnull().sum()
	if null_count > 0:
	print(f" {col}: {null_count} nulls")

	print(f"\nFilled data preview:")
	print(df_merged_copy.to_string())

	# Verify specific cases
	print(f"\nVERIFICATION:")

	# Check AAPL price at timestamp 1640995260000 (should be filled)
	aapl_price = df_merged_copy[(df_merged_copy['symbol'] == 'AAPL') &
	(df_merged_copy['interval_timestamp'] == 1640995260000)]['price'].iloc[0]
	print(f"AAPL price at 1640995260000: {aapl_price} (should be 152.3)")

	# Check BTC volume at timestamp 1640995200000 (should be filled)
	btc_volume = df_merged_copy[(df_merged_copy['symbol'] == 'BTC') &
	(df_merged_copy['interval_timestamp'] == 1640995200000)]['volume'].iloc[0]
	print(f"BTC volume at 1640995200000: {btc_volume} (should be 2000000)")

	# Check if new features are NOT added (function should only fill existing columns)
	has_new_feature = 'new_feature' in df_merged_copy.columns
	print(f"New feature added: {has_new_feature} (should be False)")

	# Calculate success rate
	original_nulls = df_merged.isnull().sum().sum()
	remaining_nulls = df_merged_copy.isnull().sum().sum()
	filled_nulls = original_nulls - remaining_nulls

	if filled_nulls == nulls_filled:
	print(f"✅ Null counting is consistent: {filled_nulls} nulls filled")
	else:
	print(f"❌ Null counting mismatch: reported {nulls_filled}, actual {filled_nulls}")

	if nulls_filled > 0:
	fill_rate = (nulls_filled / original_nulls) * 100
	print(f"✅ Fill rate: {fill_rate:.1f}% ({nulls_filled}/{original_nulls})")
	return True
	else:
	print("❌ No nulls were filled")
	return False

	def test_edge_cases():
	"""Test edge cases for null filling"""
	print(f"\n" + "="*60)
	print("TESTING EDGE CASES")
	print("="*60)

	# Test with empty dataframes
	df_empty = pd.DataFrame()
	df_test = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [1]})

	print("Test 1: Empty merged dataframe")
	nulls_filled = fill_nulls_from_temp(df_empty, df_test)
	print(f"Nulls filled: {nulls_filled} (should be 0)")

	print("Test 2: Empty temp dataframe")
	df_with_nulls = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [np.nan]})
	nulls_filled = fill_nulls_from_temp(df_with_nulls, df_empty)
	print(f"Nulls filled: {nulls_filled} (should be 0)")

	# Test with no matching keys
	print("Test 3: No matching symbol+timestamp combinations")
	df_merged_nomatch = pd.DataFrame({
	'symbol': ['A'],
	'interval_timestamp': [111],
	'value': [np.nan]
	})
	df_temp_nomatch = pd.DataFrame({
	'symbol': ['B'],
	'interval_timestamp': [222],
	'value': [100]
	})
	nulls_filled = fill_nulls_from_temp(df_merged_nomatch, df_temp_nomatch)
	print(f"Nulls filled: {nulls_filled} (should be 0)")

	# Test with no common columns
	print("Test 4: No common columns")
	df_merged_nocols = pd.DataFrame({
	'symbol': ['A'],
	'interval_timestamp': [123],
	'col1': [np.nan]
	})
	df_temp_nocols = pd.DataFrame({
	'symbol': ['A'],
	'interval_timestamp': [123],
	'col2': [100]
	})
	nulls_filled = fill_nulls_from_temp(df_merged_nocols, df_temp_nocols)
	print(f"Nulls filled: {nulls_filled} (should be 0)")

	print("✅ All edge case tests completed")

	def main():
	"""Run all tests"""
	success = test_null_filling()
	test_edge_cases()

	print(f"\n" + "="*60)
	print("TEST SUMMARY")
	print("="*60)

	if success:
	print("🎉 Null filling functionality is working correctly!")
	return 0
	else:
	print("❌ Null filling functionality has issues")
	return 1

	if __name__ == "__main__":
	exit_code = main()
	sys.exit(exit_code)