# File size: 6,962 Bytes
# c49b21b
#!/usr/bin/env python3
"""
Test script for null filling during merge operations
"""
import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path
# Add the merge directory to path
sys.path.append(str(Path(__file__).parent))
from merge_temp import fill_nulls_from_temp
def create_test_data():
    """Build the paired fixture frames used by the null-filling test.

    Returns:
        A ``(df_merged, df_temp)`` tuple of DataFrames.

        ``df_merged`` has five rows identified by ``symbol`` and
        ``interval_timestamp`` and contains seven NaN cells spread over
        ``price``, ``volume``, ``rsi`` and ``macd``.

        ``df_temp`` has a fully populated row for every symbol/timestamp
        pair in ``df_merged``, one additional ``GOOGL`` row, and a
        ``new_feature`` column that ``df_merged`` does not have.
    """
    # Merged data with some null values
    merged_data = {
        'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH'],
        'interval_timestamp': [
            1640995200000, 1640995260000,
            1640995200000, 1640995260000,
            1640995200000,
        ],
        'price': [150.0, np.nan, 50000.0, np.nan, 4000.0],     # AAPL and BTC have nulls
        'volume': [1000000, 1200000, np.nan, 800000, np.nan],  # BTC and ETH have nulls
        'rsi': [65.0, np.nan, 70.0, 45.0, np.nan],             # AAPL and ETH have nulls
        'macd': [1.5, 1.8, np.nan, -0.5, 2.1],                 # BTC has null
    }
    df_merged = pd.DataFrame(merged_data)

    # Temp data that can fill the nulls above
    temp_data = {
        'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH', 'GOOGL'],
        'interval_timestamp': [
            1640995200000, 1640995260000,
            1640995200000, 1640995260000,
            1640995200000, 1640995200000,
        ],
        'price': [149.5, 152.3, 49950.0, 51200.0, 3980.0, 2850.0],      # Can fill AAPL and BTC nulls
        'volume': [950000, 1150000, 2000000, 780000, 500000, 400000],   # Can fill BTC and ETH nulls
        'rsi': [64.0, 67.0, 69.5, 44.0, 55.0, 60.0],                    # Can fill AAPL and ETH nulls
        'macd': [1.4, 1.9, 15.2, -0.6, 2.0, 0.8],                       # Can fill BTC null
        'new_feature': [100, 200, 300, 400, 500, 600],                  # New feature not in merged
    }
    df_temp = pd.DataFrame(temp_data)

    return df_merged, df_temp
def _print_null_counts(df):
    """Print one line for every column of *df* that contains nulls."""
    for col in df.columns:
        null_count = df[col].isnull().sum()
        if null_count > 0:
            print(f" {col}: {null_count} nulls")


def _value_at(df, symbol, timestamp, column):
    """Return *column* from the first row of *df* matching symbol and timestamp.

    Raises:
        IndexError: if no row matches.
    """
    row_mask = (df['symbol'] == symbol) & (df['interval_timestamp'] == timestamp)
    return df.loc[row_mask, column].iloc[0]


def test_null_filling():
    """Run fill_nulls_from_temp on the fixture data and report the outcome.

    The fixture from ``create_test_data`` is printed, a copy of the merged
    frame is passed to ``fill_nulls_from_temp`` together with the temp frame,
    and the result is printed and verified.

    Returns:
        True only when at least one null was filled, the spot-checked cells
        hold the expected values, no ``new_feature`` column was added, and
        the reported fill count equals the observed drop in null cells.
        False otherwise.
    """
    print("=" * 60)
    print("TESTING NULL FILLING DURING MERGE")
    print("=" * 60)

    # Create test data
    df_merged, df_temp = create_test_data()

    print("BEFORE NULL FILLING:")
    print(f"Merged data shape: {df_merged.shape}")
    print(f"Temp data shape: {df_temp.shape}")
    print(f"Nulls in merged data: {df_merged.isnull().sum().sum()}")

    print("\nNull values by column in merged data:")
    _print_null_counts(df_merged)

    print("\nMerged data preview:")
    print(df_merged.to_string())
    print("\nTemp data preview:")
    print(df_temp.to_string())

    # Fill a copy so the untouched frame remains available for the
    # before/after null comparison below. The test relies on the function
    # updating its first argument and returning the number of cells filled.
    df_filled = df_merged.copy()
    nulls_filled = fill_nulls_from_temp(df_filled, df_temp)

    print("\nAFTER NULL FILLING:")
    print(f"Nulls filled: {nulls_filled}")
    print(f"Remaining nulls: {df_filled.isnull().sum().sum()}")

    print("\nRemaining null values by column:")
    _print_null_counts(df_filled)

    print("\nFilled data preview:")
    print(df_filled.to_string())

    # Verify specific cases
    print("\nVERIFICATION:")

    # AAPL price at timestamp 1640995260000 should be filled from temp
    aapl_price = _value_at(df_filled, 'AAPL', 1640995260000, 'price')
    print(f"AAPL price at 1640995260000: {aapl_price} (should be 152.3)")

    # BTC volume at timestamp 1640995200000 should be filled from temp
    btc_volume = _value_at(df_filled, 'BTC', 1640995200000, 'volume')
    print(f"BTC volume at 1640995200000: {btc_volume} (should be 2000000)")

    # New features must NOT be added (only existing columns are filled)
    has_new_feature = 'new_feature' in df_filled.columns
    print(f"New feature added: {has_new_feature} (should be False)")

    # np.isclose is False for NaN, so a cell left unfilled fails the check.
    values_ok = (
        bool(np.isclose(aapl_price, 152.3))
        and bool(np.isclose(btc_volume, 2000000))
        and not has_new_feature
    )

    # Cross-check the reported count against the observed drop in nulls
    original_nulls = df_merged.isnull().sum().sum()
    remaining_nulls = df_filled.isnull().sum().sum()
    filled_nulls = original_nulls - remaining_nulls

    counts_ok = bool(filled_nulls == nulls_filled)
    if counts_ok:
        print(f"✅ Null counting is consistent: {filled_nulls} nulls filled")
    else:
        print(f"❌ Null counting mismatch: reported {nulls_filled}, actual {filled_nulls}")

    if not nulls_filled > 0:
        print("❌ No nulls were filled")
        return False

    fill_rate = (nulls_filled / original_nulls) * 100
    print(f"✅ Fill rate: {fill_rate:.1f}% ({nulls_filled}/{original_nulls})")

    if not values_ok:
        print("❌ Verification values do not match the expected results")
    return values_ok and counts_ok
def test_edge_cases():
    """Run fill_nulls_from_temp on inputs that offer nothing to fill.

    Four scenarios are exercised in order: an empty merged frame, an empty
    temp frame, frames with no matching symbol+timestamp pair, and frames
    that share no value column. Each is expected to report 0 nulls filled.

    Returns:
        True when every scenario reported 0, otherwise False.
    """
    print("\n" + "=" * 60)
    print("TESTING EDGE CASES")
    print("=" * 60)

    # The same empty frame object serves as the merged side of Test 1 and
    # the temp side of Test 2.
    df_empty = pd.DataFrame()
    df_test = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [1]})
    df_with_nulls = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [np.nan]})

    # No matching keys
    df_merged_nomatch = pd.DataFrame({
        'symbol': ['A'],
        'interval_timestamp': [111],
        'value': [np.nan],
    })
    df_temp_nomatch = pd.DataFrame({
        'symbol': ['B'],
        'interval_timestamp': [222],
        'value': [100],
    })

    # No common columns besides the keys
    df_merged_nocols = pd.DataFrame({
        'symbol': ['A'],
        'interval_timestamp': [123],
        'col1': [np.nan],
    })
    df_temp_nocols = pd.DataFrame({
        'symbol': ['A'],
        'interval_timestamp': [123],
        'col2': [100],
    })

    scenarios = [
        ("Test 1: Empty merged dataframe", df_empty, df_test),
        ("Test 2: Empty temp dataframe", df_with_nulls, df_empty),
        ("Test 3: No matching symbol+timestamp combinations", df_merged_nomatch, df_temp_nomatch),
        ("Test 4: No common columns", df_merged_nocols, df_temp_nocols),
    ]

    all_zero = True
    for label, merged_frame, temp_frame in scenarios:
        print(label)
        nulls_filled = fill_nulls_from_temp(merged_frame, temp_frame)
        print(f"Nulls filled: {nulls_filled} (should be 0)")
        if nulls_filled != 0:
            # Surface the mismatch instead of leaving it to the reader
            print(f"❌ Expected 0 nulls filled, got {nulls_filled}")
            all_zero = False

    print("✅ All edge case tests completed")
    return all_zero
def main():
    """Run all tests and return a process exit code.

    Returns:
        0 when the null-filling test succeeds and the edge-case run does not
        report failure, otherwise 1.
    """
    success = test_null_filling()
    # Only an explicit False from the edge-case run counts as a failure; a
    # None result (no verdict returned) leaves the outcome to `success`.
    edge_cases_ok = test_edge_cases() is not False

    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)

    if success and edge_cases_ok:
        print("🎉 Null filling functionality is working correctly!")
        return 0

    print("❌ Null filling functionality has issues")
    return 1
if __name__ == "__main__":
    # Hand main()'s return value to the shell as the process exit status.
    sys.exit(main())
# End of file