File size: 6,962 Bytes
c49b21b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/usr/bin/env python3
"""
Test script for null filling during merge operations
"""

import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path

# Append the directory containing this script to the module search path so
# the `merge_temp` import below can be resolved from there
# (presumably merge_temp.py sits next to this file -- confirm).
sys.path.append(str(Path(__file__).parent))

from merge_temp import fill_nulls_from_temp

def create_test_data():
    """Build the fixture pair used by the null-filling tests.

    Returns:
        tuple: ``(df_merged, df_temp)`` where

        * ``df_merged`` has 5 rows keyed by ``symbol`` + ``interval_timestamp``
          with NaN holes scattered across ``price``, ``volume``, ``rsi`` and
          ``macd``;
        * ``df_temp`` has 6 rows (the same 5 keys plus a GOOGL row), no NaN
          values, and one extra column ``new_feature`` that ``df_merged``
          does not have.
    """
    # The two interval timestamps shared by both frames.
    t0, t1 = 1640995200000, 1640995260000
    missing = np.nan

    # Frame with deliberate gaps.
    merged_frame = pd.DataFrame(dict(
        symbol=['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH'],
        interval_timestamp=[t0, t1, t0, t1, t0],
        price=[150.0, missing, 50000.0, missing, 4000.0],      # gaps: AAPL, BTC
        volume=[1000000, 1200000, missing, 800000, missing],   # gaps: BTC, ETH
        rsi=[65.0, missing, 70.0, 45.0, missing],              # gaps: AAPL, ETH
        macd=[1.5, 1.8, missing, -0.5, 2.1],                   # gap: BTC
    ))

    # Fully populated frame holding a value for every gap above.
    temp_frame = pd.DataFrame(dict(
        symbol=['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH', 'GOOGL'],
        interval_timestamp=[t0, t1, t0, t1, t0, t0],
        price=[149.5, 152.3, 49950.0, 51200.0, 3980.0, 2850.0],
        volume=[950000, 1150000, 2000000, 780000, 500000, 400000],
        rsi=[64.0, 67.0, 69.5, 44.0, 55.0, 60.0],
        macd=[1.4, 1.9, 15.2, -0.6, 2.0, 0.8],
        new_feature=[100, 200, 300, 400, 500, 600],            # absent from merged
    ))

    return merged_frame, temp_frame

def _print_null_counts(df):
    """Print one ``  <column>: <n> nulls`` line for each column of *df* with nulls."""
    null_counts = df.isnull().sum()
    for col, null_count in null_counts[null_counts > 0].items():
        print(f"  {col}: {null_count} nulls")


def _cell_value(df, symbol, timestamp, column):
    """Return *column* from the first row of *df* matching symbol + timestamp.

    Raises:
        IndexError: if no row matches the key.
    """
    key_mask = (df['symbol'] == symbol) & (df['interval_timestamp'] == timestamp)
    return df[key_mask][column].iloc[0]


def test_null_filling():
    """Exercise ``fill_nulls_from_temp`` on the ``create_test_data`` fixture.

    Prints a before/after report and then verifies that:

    * the AAPL price at 1640995260000 became 152.3,
    * the BTC volume at 1640995200000 became 2000000,
    * no ``new_feature`` column was added to the merged frame,
    * the count returned by ``fill_nulls_from_temp`` equals the observed
      drop in null cells, and
    * at least one null cell was actually filled.

    Returns:
        bool: True only when every check above holds, otherwise False.
    """
    print("=" * 60)
    print("TESTING NULL FILLING DURING MERGE")
    print("=" * 60)

    # Create test data
    df_merged, df_temp = create_test_data()
    original_nulls = int(df_merged.isnull().sum().sum())

    print("BEFORE NULL FILLING:")
    print(f"Merged data shape: {df_merged.shape}")
    print(f"Temp data shape: {df_temp.shape}")
    print(f"Nulls in merged data: {original_nulls}")
    print("\nNull values by column in merged data:")
    _print_null_counts(df_merged)

    print("\nMerged data preview:")
    print(df_merged.to_string())
    print("\nTemp data preview:")
    print(df_temp.to_string())

    # Fill a copy so df_merged stays intact as the "before" baseline.
    # NOTE(review): this relies on fill_nulls_from_temp modifying its first
    # argument in place and returning the number of cells filled -- confirm
    # against merge_temp.
    df_merged_copy = df_merged.copy()
    nulls_filled = fill_nulls_from_temp(df_merged_copy, df_temp)
    remaining_nulls = int(df_merged_copy.isnull().sum().sum())

    print("\nAFTER NULL FILLING:")
    print(f"Nulls filled: {nulls_filled}")
    print(f"Remaining nulls: {remaining_nulls}")
    print("\nRemaining null values by column:")
    _print_null_counts(df_merged_copy)

    print("\nFilled data preview:")
    print(df_merged_copy.to_string())

    # Verify specific cases; every outcome is recorded so that a wrong value
    # fails the test instead of merely being printed.
    print("\nVERIFICATION:")
    checks = []

    # AAPL price at timestamp 1640995260000 was NaN and should be filled.
    aapl_price = _cell_value(df_merged_copy, 'AAPL', 1640995260000, 'price')
    print(f"AAPL price at 1640995260000: {aapl_price} (should be 152.3)")
    checks.append(bool(np.isclose(aapl_price, 152.3)))  # NaN compares False

    # BTC volume at timestamp 1640995200000 was NaN and should be filled.
    btc_volume = _cell_value(df_merged_copy, 'BTC', 1640995200000, 'volume')
    print(f"BTC volume at 1640995200000: {btc_volume} (should be 2000000)")
    checks.append(bool(np.isclose(btc_volume, 2000000)))

    # Columns that exist only in the temp frame must not be added.
    has_new_feature = 'new_feature' in df_merged_copy.columns
    print(f"New feature added: {has_new_feature} (should be False)")
    checks.append(not has_new_feature)

    # The reported count must match what actually changed in the frame.
    filled_nulls = original_nulls - remaining_nulls
    counts_match = filled_nulls == nulls_filled
    if counts_match:
        print(f"✅ Null counting is consistent: {filled_nulls} nulls filled")
    else:
        print(f"❌ Null counting mismatch: reported {nulls_filled}, actual {filled_nulls}")
    checks.append(counts_match)

    # Use the observed count for the rate: filled_nulls > 0 guarantees
    # original_nulls > 0, so the division is safe whatever the helper returned.
    if filled_nulls > 0:
        fill_rate = (filled_nulls / original_nulls) * 100
        print(f"✅ Fill rate: {fill_rate:.1f}% ({filled_nulls}/{original_nulls})")
        checks.append(True)
    else:
        print("❌ No nulls were filled")
        checks.append(False)

    return all(checks)

def test_edge_cases():
    """Run ``fill_nulls_from_temp`` on degenerate inputs that must fill nothing.

    Cases covered:

    1. empty merged frame,
    2. empty temp frame,
    3. no matching ``symbol`` + ``interval_timestamp`` keys,
    4. no value columns in common.

    Each case prints the returned count and is checked against the expected
    value of 0.

    Returns:
        bool: True when every case returned 0, otherwise False.
    """
    print("\n" + "=" * 60)
    print("TESTING EDGE CASES")
    print("=" * 60)

    # Each case gets its own frames (including its own empty frame) so that
    # any in-place modification made during one case cannot leak into another.
    cases = [
        (
            "Test 1: Empty merged dataframe",
            pd.DataFrame(),
            pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [1]}),
        ),
        (
            "Test 2: Empty temp dataframe",
            pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [np.nan]}),
            pd.DataFrame(),
        ),
        (
            "Test 3: No matching symbol+timestamp combinations",
            pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [111], 'value': [np.nan]}),
            pd.DataFrame({'symbol': ['B'], 'interval_timestamp': [222], 'value': [100]}),
        ),
        (
            "Test 4: No common columns",
            pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'col1': [np.nan]}),
            pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'col2': [100]}),
        ),
    ]

    failures = 0
    for title, df_merged_case, df_temp_case in cases:
        print(title)
        nulls_filled = fill_nulls_from_temp(df_merged_case, df_temp_case)
        print(f"Nulls filled: {nulls_filled} (should be 0)")
        # Anything other than exactly 0 (including None) is a failure.
        if nulls_filled != 0:
            failures += 1

    if failures:
        print(f"❌ {failures} edge case test(s) failed")
        return False

    print("✅ All edge case tests completed")
    return True

def main():
    """Run all tests and translate the outcome into a process exit code.

    Returns:
        int: 0 when the tests succeeded, 1 otherwise.
    """
    success = test_null_filling()

    # Only an explicit False from the edge-case run counts as a failure; a
    # run that returns None gives no verdict and leaves the result unchanged.
    if test_edge_cases() is False:
        success = False

    print("\n" + "=" * 60)
    print("TEST SUMMARY")
    print("=" * 60)

    if success:
        print("🎉 Null filling functionality is working correctly!")
        return 0

    print("❌ Null filling functionality has issues")
    return 1

if __name__ == "__main__":
    # Script entry point: hand main()'s return code to the shell as the exit status.
    sys.exit(main())