advisorai-data-enhanced / src /merge /test_null_filling_merge.py
Maaroufabousaleh
f
c49b21b
raw
history blame
6.96 kB
#!/usr/bin/env python3
"""
Test script for null filling during merge operations
"""
import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path
# Add the merge directory to path
sys.path.append(str(Path(__file__).parent))
from merge_temp import fill_nulls_from_temp
def create_test_data():
"""Create test data with strategic null values"""
# Create merged data with some null values
merged_data = {
'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH'],
'interval_timestamp': [1640995200000, 1640995260000, 1640995200000, 1640995260000, 1640995200000],
'price': [150.0, np.nan, 50000.0, np.nan, 4000.0], # AAPL and BTC have nulls
'volume': [1000000, 1200000, np.nan, 800000, np.nan], # BTC and ETH have nulls
'rsi': [65.0, np.nan, 70.0, 45.0, np.nan], # AAPL and ETH have nulls
'macd': [1.5, 1.8, np.nan, -0.5, 2.1] # BTC has null
}
df_merged = pd.DataFrame(merged_data)
# Create temp data that can fill some of the nulls
temp_data = {
'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH', 'GOOGL'],
'interval_timestamp': [1640995200000, 1640995260000, 1640995200000, 1640995260000, 1640995200000, 1640995200000],
'price': [149.5, 152.3, 49950.0, 51200.0, 3980.0, 2850.0], # Can fill AAPL and BTC nulls
'volume': [950000, 1150000, 2000000, 780000, 500000, 400000], # Can fill BTC and ETH nulls
'rsi': [64.0, 67.0, 69.5, 44.0, 55.0, 60.0], # Can fill AAPL and ETH nulls
'macd': [1.4, 1.9, 15.2, -0.6, 2.0, 0.8], # Can fill BTC null
'new_feature': [100, 200, 300, 400, 500, 600] # New feature not in merged
}
df_temp = pd.DataFrame(temp_data)
return df_merged, df_temp
def test_null_filling():
"""Test the null filling functionality"""
print("="*60)
print("TESTING NULL FILLING DURING MERGE")
print("="*60)
# Create test data
df_merged, df_temp = create_test_data()
print("BEFORE NULL FILLING:")
print(f"Merged data shape: {df_merged.shape}")
print(f"Temp data shape: {df_temp.shape}")
print(f"Nulls in merged data: {df_merged.isnull().sum().sum()}")
print("\nNull values by column in merged data:")
for col in df_merged.columns:
null_count = df_merged[col].isnull().sum()
if null_count > 0:
print(f" {col}: {null_count} nulls")
print(f"\nMerged data preview:")
print(df_merged.to_string())
print(f"\nTemp data preview:")
print(df_temp.to_string())
# Test the null filling function
df_merged_copy = df_merged.copy()
nulls_filled = fill_nulls_from_temp(df_merged_copy, df_temp)
print(f"\nAFTER NULL FILLING:")
print(f"Nulls filled: {nulls_filled}")
print(f"Remaining nulls: {df_merged_copy.isnull().sum().sum()}")
print("\nRemaining null values by column:")
for col in df_merged_copy.columns:
null_count = df_merged_copy[col].isnull().sum()
if null_count > 0:
print(f" {col}: {null_count} nulls")
print(f"\nFilled data preview:")
print(df_merged_copy.to_string())
# Verify specific cases
print(f"\nVERIFICATION:")
# Check AAPL price at timestamp 1640995260000 (should be filled)
aapl_price = df_merged_copy[(df_merged_copy['symbol'] == 'AAPL') &
(df_merged_copy['interval_timestamp'] == 1640995260000)]['price'].iloc[0]
print(f"AAPL price at 1640995260000: {aapl_price} (should be 152.3)")
# Check BTC volume at timestamp 1640995200000 (should be filled)
btc_volume = df_merged_copy[(df_merged_copy['symbol'] == 'BTC') &
(df_merged_copy['interval_timestamp'] == 1640995200000)]['volume'].iloc[0]
print(f"BTC volume at 1640995200000: {btc_volume} (should be 2000000)")
# Check if new features are NOT added (function should only fill existing columns)
has_new_feature = 'new_feature' in df_merged_copy.columns
print(f"New feature added: {has_new_feature} (should be False)")
# Calculate success rate
original_nulls = df_merged.isnull().sum().sum()
remaining_nulls = df_merged_copy.isnull().sum().sum()
filled_nulls = original_nulls - remaining_nulls
if filled_nulls == nulls_filled:
print(f"βœ… Null counting is consistent: {filled_nulls} nulls filled")
else:
print(f"❌ Null counting mismatch: reported {nulls_filled}, actual {filled_nulls}")
if nulls_filled > 0:
fill_rate = (nulls_filled / original_nulls) * 100
print(f"βœ… Fill rate: {fill_rate:.1f}% ({nulls_filled}/{original_nulls})")
return True
else:
print("❌ No nulls were filled")
return False
def test_edge_cases():
"""Test edge cases for null filling"""
print(f"\n" + "="*60)
print("TESTING EDGE CASES")
print("="*60)
# Test with empty dataframes
df_empty = pd.DataFrame()
df_test = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [1]})
print("Test 1: Empty merged dataframe")
nulls_filled = fill_nulls_from_temp(df_empty, df_test)
print(f"Nulls filled: {nulls_filled} (should be 0)")
print("Test 2: Empty temp dataframe")
df_with_nulls = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [np.nan]})
nulls_filled = fill_nulls_from_temp(df_with_nulls, df_empty)
print(f"Nulls filled: {nulls_filled} (should be 0)")
# Test with no matching keys
print("Test 3: No matching symbol+timestamp combinations")
df_merged_nomatch = pd.DataFrame({
'symbol': ['A'],
'interval_timestamp': [111],
'value': [np.nan]
})
df_temp_nomatch = pd.DataFrame({
'symbol': ['B'],
'interval_timestamp': [222],
'value': [100]
})
nulls_filled = fill_nulls_from_temp(df_merged_nomatch, df_temp_nomatch)
print(f"Nulls filled: {nulls_filled} (should be 0)")
# Test with no common columns
print("Test 4: No common columns")
df_merged_nocols = pd.DataFrame({
'symbol': ['A'],
'interval_timestamp': [123],
'col1': [np.nan]
})
df_temp_nocols = pd.DataFrame({
'symbol': ['A'],
'interval_timestamp': [123],
'col2': [100]
})
nulls_filled = fill_nulls_from_temp(df_merged_nocols, df_temp_nocols)
print(f"Nulls filled: {nulls_filled} (should be 0)")
print("βœ… All edge case tests completed")
def main():
"""Run all tests"""
success = test_null_filling()
test_edge_cases()
print(f"\n" + "="*60)
print("TEST SUMMARY")
print("="*60)
if success:
print("πŸŽ‰ Null filling functionality is working correctly!")
return 0
else:
print("❌ Null filling functionality has issues")
return 1
if __name__ == "__main__":
exit_code = main()
sys.exit(exit_code)