#!/usr/bin/env python3 """ Test script for null filling during merge operations """ import pandas as pd import numpy as np import os import sys from pathlib import Path # Add the merge directory to path sys.path.append(str(Path(__file__).parent)) from merge_temp import fill_nulls_from_temp def create_test_data(): """Create test data with strategic null values""" # Create merged data with some null values merged_data = { 'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH'], 'interval_timestamp': [1640995200000, 1640995260000, 1640995200000, 1640995260000, 1640995200000], 'price': [150.0, np.nan, 50000.0, np.nan, 4000.0], # AAPL and BTC have nulls 'volume': [1000000, 1200000, np.nan, 800000, np.nan], # BTC and ETH have nulls 'rsi': [65.0, np.nan, 70.0, 45.0, np.nan], # AAPL and ETH have nulls 'macd': [1.5, 1.8, np.nan, -0.5, 2.1] # BTC has null } df_merged = pd.DataFrame(merged_data) # Create temp data that can fill some of the nulls temp_data = { 'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH', 'GOOGL'], 'interval_timestamp': [1640995200000, 1640995260000, 1640995200000, 1640995260000, 1640995200000, 1640995200000], 'price': [149.5, 152.3, 49950.0, 51200.0, 3980.0, 2850.0], # Can fill AAPL and BTC nulls 'volume': [950000, 1150000, 2000000, 780000, 500000, 400000], # Can fill BTC and ETH nulls 'rsi': [64.0, 67.0, 69.5, 44.0, 55.0, 60.0], # Can fill AAPL and ETH nulls 'macd': [1.4, 1.9, 15.2, -0.6, 2.0, 0.8], # Can fill BTC null 'new_feature': [100, 200, 300, 400, 500, 600] # New feature not in merged } df_temp = pd.DataFrame(temp_data) return df_merged, df_temp def test_null_filling(): """Test the null filling functionality""" print("="*60) print("TESTING NULL FILLING DURING MERGE") print("="*60) # Create test data df_merged, df_temp = create_test_data() print("BEFORE NULL FILLING:") print(f"Merged data shape: {df_merged.shape}") print(f"Temp data shape: {df_temp.shape}") print(f"Nulls in merged data: {df_merged.isnull().sum().sum()}") print("\nNull values by column in merged data:") for col in df_merged.columns: null_count = df_merged[col].isnull().sum() if null_count > 0: print(f" {col}: {null_count} nulls") print(f"\nMerged data preview:") print(df_merged.to_string()) print(f"\nTemp data preview:") print(df_temp.to_string()) # Test the null filling function df_merged_copy = df_merged.copy() nulls_filled = fill_nulls_from_temp(df_merged_copy, df_temp) print(f"\nAFTER NULL FILLING:") print(f"Nulls filled: {nulls_filled}") print(f"Remaining nulls: {df_merged_copy.isnull().sum().sum()}") print("\nRemaining null values by column:") for col in df_merged_copy.columns: null_count = df_merged_copy[col].isnull().sum() if null_count > 0: print(f" {col}: {null_count} nulls") print(f"\nFilled data preview:") print(df_merged_copy.to_string()) # Verify specific cases print(f"\nVERIFICATION:") # Check AAPL price at timestamp 1640995260000 (should be filled) aapl_price = df_merged_copy[(df_merged_copy['symbol'] == 'AAPL') & (df_merged_copy['interval_timestamp'] == 1640995260000)]['price'].iloc[0] print(f"AAPL price at 1640995260000: {aapl_price} (should be 152.3)") # Check BTC volume at timestamp 1640995200000 (should be filled) btc_volume = df_merged_copy[(df_merged_copy['symbol'] == 'BTC') & (df_merged_copy['interval_timestamp'] == 1640995200000)]['volume'].iloc[0] print(f"BTC volume at 1640995200000: {btc_volume} (should be 2000000)") # Check if new features are NOT added (function should only fill existing columns) has_new_feature = 'new_feature' in df_merged_copy.columns print(f"New feature added: {has_new_feature} (should be False)") # Calculate success rate original_nulls = df_merged.isnull().sum().sum() remaining_nulls = df_merged_copy.isnull().sum().sum() filled_nulls = original_nulls - remaining_nulls if filled_nulls == nulls_filled: print(f"✅ Null counting is consistent: {filled_nulls} nulls filled") else: print(f"❌ Null counting mismatch: reported {nulls_filled}, actual {filled_nulls}") if nulls_filled > 0: fill_rate = (nulls_filled / original_nulls) * 100 print(f"✅ Fill rate: {fill_rate:.1f}% ({nulls_filled}/{original_nulls})") return True else: print("❌ No nulls were filled") return False def test_edge_cases(): """Test edge cases for null filling""" print(f"\n" + "="*60) print("TESTING EDGE CASES") print("="*60) # Test with empty dataframes df_empty = pd.DataFrame() df_test = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [1]}) print("Test 1: Empty merged dataframe") nulls_filled = fill_nulls_from_temp(df_empty, df_test) print(f"Nulls filled: {nulls_filled} (should be 0)") print("Test 2: Empty temp dataframe") df_with_nulls = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [np.nan]}) nulls_filled = fill_nulls_from_temp(df_with_nulls, df_empty) print(f"Nulls filled: {nulls_filled} (should be 0)") # Test with no matching keys print("Test 3: No matching symbol+timestamp combinations") df_merged_nomatch = pd.DataFrame({ 'symbol': ['A'], 'interval_timestamp': [111], 'value': [np.nan] }) df_temp_nomatch = pd.DataFrame({ 'symbol': ['B'], 'interval_timestamp': [222], 'value': [100] }) nulls_filled = fill_nulls_from_temp(df_merged_nomatch, df_temp_nomatch) print(f"Nulls filled: {nulls_filled} (should be 0)") # Test with no common columns print("Test 4: No common columns") df_merged_nocols = pd.DataFrame({ 'symbol': ['A'], 'interval_timestamp': [123], 'col1': [np.nan] }) df_temp_nocols = pd.DataFrame({ 'symbol': ['A'], 'interval_timestamp': [123], 'col2': [100] }) nulls_filled = fill_nulls_from_temp(df_merged_nocols, df_temp_nocols) print(f"Nulls filled: {nulls_filled} (should be 0)") print("✅ All edge case tests completed") def main(): """Run all tests""" success = test_null_filling() test_edge_cases() print(f"\n" + "="*60) print("TEST SUMMARY") print("="*60) if success: print("🎉 Null filling functionality is working correctly!") return 0 else: print("❌ Null filling functionality has issues") return 1 if __name__ == "__main__": exit_code = main() sys.exit(exit_code)