Spaces:

maaroufabousaleh
/

advisorai-data-enhanced

Sleeping

File size: 6,962 Bytes

c49b21b

#!/usr/bin/env python3
"""
Test script for null filling during merge operations
"""

import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path

# Add the merge directory to path
sys.path.append(str(Path(__file__).parent))

from merge_temp import fill_nulls_from_temp

def create_test_data():
    """Create test data with strategic null values"""
    
    # Create merged data with some null values
    merged_data = {
        'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH'],
        'interval_timestamp': [1640995200000, 1640995260000, 1640995200000, 1640995260000, 1640995200000],
        'price': [150.0, np.nan, 50000.0, np.nan, 4000.0],  # AAPL and BTC have nulls
        'volume': [1000000, 1200000, np.nan, 800000, np.nan],  # BTC and ETH have nulls
        'rsi': [65.0, np.nan, 70.0, 45.0, np.nan],  # AAPL and ETH have nulls
        'macd': [1.5, 1.8, np.nan, -0.5, 2.1]  # BTC has null
    }
    df_merged = pd.DataFrame(merged_data)
    
    # Create temp data that can fill some of the nulls
    temp_data = {
        'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH', 'GOOGL'],
        'interval_timestamp': [1640995200000, 1640995260000, 1640995200000, 1640995260000, 1640995200000, 1640995200000],
        'price': [149.5, 152.3, 49950.0, 51200.0, 3980.0, 2850.0],  # Can fill AAPL and BTC nulls
        'volume': [950000, 1150000, 2000000, 780000, 500000, 400000],  # Can fill BTC and ETH nulls
        'rsi': [64.0, 67.0, 69.5, 44.0, 55.0, 60.0],  # Can fill AAPL and ETH nulls
        'macd': [1.4, 1.9, 15.2, -0.6, 2.0, 0.8],  # Can fill BTC null
        'new_feature': [100, 200, 300, 400, 500, 600]  # New feature not in merged
    }
    df_temp = pd.DataFrame(temp_data)
    
    return df_merged, df_temp

def test_null_filling():
    """Test the null filling functionality"""
    print("="*60)
    print("TESTING NULL FILLING DURING MERGE")
    print("="*60)
    
    # Create test data
    df_merged, df_temp = create_test_data()
    
    print("BEFORE NULL FILLING:")
    print(f"Merged data shape: {df_merged.shape}")
    print(f"Temp data shape: {df_temp.shape}")
    print(f"Nulls in merged data: {df_merged.isnull().sum().sum()}")
    print("\nNull values by column in merged data:")
    for col in df_merged.columns:
        null_count = df_merged[col].isnull().sum()
        if null_count > 0:
            print(f"  {col}: {null_count} nulls")
    
    print(f"\nMerged data preview:")
    print(df_merged.to_string())
    print(f"\nTemp data preview:")
    print(df_temp.to_string())
    
    # Test the null filling function
    df_merged_copy = df_merged.copy()
    nulls_filled = fill_nulls_from_temp(df_merged_copy, df_temp)
    
    print(f"\nAFTER NULL FILLING:")
    print(f"Nulls filled: {nulls_filled}")
    print(f"Remaining nulls: {df_merged_copy.isnull().sum().sum()}")
    print("\nRemaining null values by column:")
    for col in df_merged_copy.columns:
        null_count = df_merged_copy[col].isnull().sum()
        if null_count > 0:
            print(f"  {col}: {null_count} nulls")
    
    print(f"\nFilled data preview:")
    print(df_merged_copy.to_string())
    
    # Verify specific cases
    print(f"\nVERIFICATION:")
    
    # Check AAPL price at timestamp 1640995260000 (should be filled)
    aapl_price = df_merged_copy[(df_merged_copy['symbol'] == 'AAPL') & 
                                (df_merged_copy['interval_timestamp'] == 1640995260000)]['price'].iloc[0]
    print(f"AAPL price at 1640995260000: {aapl_price} (should be 152.3)")
    
    # Check BTC volume at timestamp 1640995200000 (should be filled)
    btc_volume = df_merged_copy[(df_merged_copy['symbol'] == 'BTC') & 
                                (df_merged_copy['interval_timestamp'] == 1640995200000)]['volume'].iloc[0]
    print(f"BTC volume at 1640995200000: {btc_volume} (should be 2000000)")
    
    # Check if new features are NOT added (function should only fill existing columns)
    has_new_feature = 'new_feature' in df_merged_copy.columns
    print(f"New feature added: {has_new_feature} (should be False)")
    
    # Calculate success rate
    original_nulls = df_merged.isnull().sum().sum()
    remaining_nulls = df_merged_copy.isnull().sum().sum()
    filled_nulls = original_nulls - remaining_nulls
    
    if filled_nulls == nulls_filled:
        print(f"✅ Null counting is consistent: {filled_nulls} nulls filled")
    else:
        print(f"❌ Null counting mismatch: reported {nulls_filled}, actual {filled_nulls}")
    
    if nulls_filled > 0:
        fill_rate = (nulls_filled / original_nulls) * 100
        print(f"✅ Fill rate: {fill_rate:.1f}% ({nulls_filled}/{original_nulls})")
        return True
    else:
        print("❌ No nulls were filled")
        return False

def test_edge_cases():
    """Test edge cases for null filling"""
    print(f"\n" + "="*60)
    print("TESTING EDGE CASES")
    print("="*60)
    
    # Test with empty dataframes
    df_empty = pd.DataFrame()
    df_test = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [1]})
    
    print("Test 1: Empty merged dataframe")
    nulls_filled = fill_nulls_from_temp(df_empty, df_test)
    print(f"Nulls filled: {nulls_filled} (should be 0)")
    
    print("Test 2: Empty temp dataframe") 
    df_with_nulls = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [np.nan]})
    nulls_filled = fill_nulls_from_temp(df_with_nulls, df_empty)
    print(f"Nulls filled: {nulls_filled} (should be 0)")
    
    # Test with no matching keys
    print("Test 3: No matching symbol+timestamp combinations")
    df_merged_nomatch = pd.DataFrame({
        'symbol': ['A'], 
        'interval_timestamp': [111], 
        'value': [np.nan]
    })
    df_temp_nomatch = pd.DataFrame({
        'symbol': ['B'], 
        'interval_timestamp': [222], 
        'value': [100]
    })
    nulls_filled = fill_nulls_from_temp(df_merged_nomatch, df_temp_nomatch)
    print(f"Nulls filled: {nulls_filled} (should be 0)")
    
    # Test with no common columns
    print("Test 4: No common columns")
    df_merged_nocols = pd.DataFrame({
        'symbol': ['A'], 
        'interval_timestamp': [123], 
        'col1': [np.nan]
    })
    df_temp_nocols = pd.DataFrame({
        'symbol': ['A'], 
        'interval_timestamp': [123], 
        'col2': [100]
    })
    nulls_filled = fill_nulls_from_temp(df_merged_nocols, df_temp_nocols)
    print(f"Nulls filled: {nulls_filled} (should be 0)")
    
    print("✅ All edge case tests completed")

def main():
    """Run all tests"""
    success = test_null_filling()
    test_edge_cases()
    
    print(f"\n" + "="*60)
    print("TEST SUMMARY")
    print("="*60)
    
    if success:
        print("🎉 Null filling functionality is working correctly!")
        return 0
    else:
        print("❌ Null filling functionality has issues")
        return 1

if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)