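"""
Merge feature parquet files.

Keyed on (symbol, interval_timestamp), this script:
  1. folds temp feature files (data/merged/temp) into the per-asset merged
     files (data/merged/features), filling nulls and appending new rows;
  2. combines crypto and stock features into merged_features.parquet with
     cross-dataset null filling;
  3. builds deduplicated training files under data/merged/train from
     archive + features + temp data.

Run with --test-null-filling to execute the null-filling tests instead.
"""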
import os

import pandas as pd

# DAYS_OLD is currently unused because age-based filtering is disabled
# (see filter_new below); it is kept for reference.
DAYS_OLD = 7
MERGED_DIR = "data/merged/features"
TEMP_DIR = "data/merged/temp"

# Helper: safely cast a value to match a target column's dtype (e.g., drop tz on datetimes)
def _cast_value_for_column(target_series: pd.Series, value):
    try:
        # If target is datetime64[ns], ensure assigned value is tz-naive
        if pd.api.types.is_datetime64_any_dtype(target_series.dtype):
            v = pd.to_datetime(value, errors='coerce', utc=True)
            if isinstance(v, pd.Timestamp):
                return v.tz_localize(None)
            return v
        return value
    except Exception:
        return value

def fill_nulls_from_temp(df_merged, df_temp):
    """
    Fill null values in df_merged using non-null values from df_temp 
    for the same symbol + interval_timestamp combination.
    Returns the number of null values filled.
    """
    nulls_filled = 0
    
    if df_merged.empty or df_temp.empty:
        return nulls_filled
    
    # Create lookup key for efficient matching
    key_cols = ["symbol", "interval_timestamp"]
    
    # Check if key columns exist in both dataframes
    if not all(col in df_merged.columns for col in key_cols):
        print("[WARN] Key columns missing in merged data, skipping null filling")
        return nulls_filled
    if not all(col in df_temp.columns for col in key_cols):
        print("[WARN] Key columns missing in temp data, skipping null filling")
        return nulls_filled
    
    # Create a lookup dictionary from temp data
    # Format: {(symbol, timestamp): {column: value, ...}}
    temp_lookup = {}
    for _, row in df_temp.iterrows():
        key = (row['symbol'], row['interval_timestamp'])
        temp_lookup[key] = row.to_dict()
    
    # Find common columns between merged and temp (excluding keys)
    common_cols = [col for col in df_merged.columns 
                   if col in df_temp.columns and col not in key_cols]
    
    if not common_cols:
        print("[WARN] No common columns found for null filling")
        return nulls_filled
    
    # Track columns with null values before processing
    null_cols_before = []
    for col in common_cols:
        if df_merged[col].isnull().any():
            null_cols_before.append(col)
    
    if not null_cols_before:
        print("[INFO] No null values found in common columns")
        return nulls_filled
    
    print(f"[INFO] Attempting to fill nulls in {len(null_cols_before)} columns: {null_cols_before}")
    
    # Fill null values row by row
    for idx, row in df_merged.iterrows():
        key = (row['symbol'], row['interval_timestamp'])
        
        # Check if we have corresponding temp data for this key
        if key in temp_lookup:
            temp_row = temp_lookup[key]
            
            # Fill null values for each column
            for col in null_cols_before:
                try:
                    # Use more robust null checking to handle arrays/scalars
                    row_val = row[col]
                    temp_val = temp_row.get(col)
                    
                    # Check if row value is null (handle both scalar and array cases)
                    row_is_null = pd.isnull(row_val)
                    if hasattr(row_is_null, '__len__') and len(row_is_null) > 1:
                        row_is_null = row_is_null.any()  # For arrays, check if any are null
                    
                    # Check if temp value is not null (negate only after reducing
                    # array results, since `not` on a multi-element array raises)
                    temp_is_null = pd.isnull(temp_val)
                    if hasattr(temp_is_null, '__len__') and len(temp_is_null) > 1:
                        temp_is_null = temp_is_null.any()  # For arrays, null if any element is null
                    temp_is_not_null = not temp_is_null
                    
                    if row_is_null and temp_is_not_null:
                        # Fill the null value with dtype-compatible casting
                        df_merged.at[idx, col] = _cast_value_for_column(df_merged[col], temp_val)
                        nulls_filled += 1
                except Exception as e:
                    # Skip problematic columns with a warning
                    print(f"[WARN] Could not process column '{col}' for null filling: {e}")
                    continue
    
    if nulls_filled > 0:
        print(f"[INFO] Successfully filled {nulls_filled} null values from temp data")
        
        # Report which columns were improved
        for col in null_cols_before:
            nulls_remaining = df_merged[col].isnull().sum()
            print(f"[INFO] Column '{col}': {nulls_remaining} nulls remaining")
    
    return nulls_filled

def filter_new(df):
    """
    Age-based filtering (DAYS_OLD) is intentionally disabled: all rows are
    returned so that merging stays cumulative from day one.
    """
    return df.copy()

def merge_temp_to_merged(temp_name, merged_name):
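    """
    Incrementally merge a temp parquet file into the corresponding merged file.

    Two steps: (1) nulls in the merged data are filled from temp rows with the
    same (symbol, interval_timestamp) key; (2) temp rows whose key is not yet
    present in the merged file are appended.
    """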
    temp_path = os.path.join(TEMP_DIR, temp_name)
    merged_path = os.path.join(MERGED_DIR, merged_name)
    if not os.path.exists(temp_path):
        print(f"[WARN] Temp file missing: {temp_path}")
        return
    if not os.path.exists(merged_path):
        print(f"[WARN] Merged file missing: {merged_path}")
        return
    
    df_temp = pd.read_parquet(temp_path)
    df_merged = pd.read_parquet(merged_path)
    
    # Check if required columns exist
    required_cols = ["symbol", "interval_timestamp"]
    missing_cols_temp = [col for col in required_cols if col not in df_temp.columns]
    missing_cols_merged = [col for col in required_cols if col not in df_merged.columns]
    
    if missing_cols_temp:
        print(f"[ERROR] Missing columns in temp file {temp_name}: {missing_cols_temp}")
        print(f"[INFO] Available columns in temp: {list(df_temp.columns)}")
        return
    
    if missing_cols_merged:
        print(f"[ERROR] Missing columns in merged file {merged_name}: {missing_cols_merged}")
        print(f"[INFO] Available columns in merged: {list(df_merged.columns)}")
        return
    
    new_temp = filter_new(df_temp)
    
    # Step 1: Fill null values in merged data using temp data for same symbol+timestamp
    nulls_filled = fill_nulls_from_temp(df_merged, df_temp)
    
    # Step 2: Only add rows whose (symbol, interval_timestamp) key is not already in merged
    key_cols = ["symbol", "interval_timestamp"]
    merged_keys = set(map(tuple, df_merged[key_cols].values))
    if new_temp.empty:
        new_rows = new_temp
    else:
        new_rows = new_temp[~new_temp[key_cols].apply(tuple, axis=1).isin(merged_keys)]
    
    if new_rows.empty and nulls_filled == 0:
        print(f"[INFO] No new records to add from {temp_name} and no nulls filled")
        return
    
    df_final = pd.concat([df_merged, new_rows], ignore_index=True)
    df_final.to_parquet(merged_path, index=False)
    print(f"[OK] Added {len(new_rows)} new records from {temp_name} to {merged_name}, filled {nulls_filled} null values")

def merge_all_to_train(archive_name, features_name, temp_name, train_name):
    """
    Merge archive, features, and temp files into a deduplicated train file under data/merged/train/.
    Uniqueness is enforced on (symbol, interval_timestamp).
    Also performs null filling between different sources.
    """
    ARCHIVE_DIR = os.path.join(MERGED_DIR, "archive")
    TRAIN_DIR = os.path.join("data", "merged", "train")
    os.makedirs(TRAIN_DIR, exist_ok=True)
    features_path = os.path.join(MERGED_DIR, features_name)
    temp_path = os.path.join(TEMP_DIR, temp_name)
    train_path = os.path.join(TRAIN_DIR, train_name)

    dfs = []
    df_sources = {}  # Track which dataframe came from which source
    
    # 1. Read all relevant archive files (recursively)
    archive_dfs = []
    if os.path.isdir(ARCHIVE_DIR):
        for root, dirs, files in os.walk(ARCHIVE_DIR):
            for fname in files:
                # Only include files matching the asset (e.g., crypto_features_archived_*.parquet)
                if fname.startswith(archive_name.replace('.parquet', '_archived_')) and fname.endswith('.parquet'):
                    fpath = os.path.join(root, fname)
                    try:
                        archive_dfs.append(pd.read_parquet(fpath))
                    except Exception as e:
                        print(f"[WARN] Could not read archive file {fpath}: {e}")
    if archive_dfs:
        df_archive = pd.concat(archive_dfs, ignore_index=True)
        dfs.append(df_archive)
        df_sources['archive'] = df_archive
    else:
        print(f"[WARN] No archive files found for {archive_name}")

    # 2. Read features and temp
    if os.path.exists(features_path):
        df_features = pd.read_parquet(features_path)
        dfs.append(df_features)
        df_sources['features'] = df_features
    else:
        print(f"[WARN] Missing: {features_path}")
        
    if os.path.exists(temp_path):
        df_temp = pd.read_parquet(temp_path)
        dfs.append(df_temp)
        df_sources['temp'] = df_temp
    else:
        print(f"[WARN] Missing: {temp_path}")
    
    if not dfs:
        print("[ERROR] No input files found.")
        return
    
    # 3. Merge all data
    df_all = pd.concat(dfs, ignore_index=True)
    
    # 4. Before deduplication, try to fill nulls using data from different sources
    total_nulls_filled = 0
    if len(df_sources) > 1:
        print(f"[INFO] Attempting cross-source null filling for {train_name}")
        
        # Create a comprehensive lookup from all sources
        all_data_lookup = {}
        for source_name, df_source in df_sources.items():
            for _, row in df_source.iterrows():
                key = (row['symbol'], row['interval_timestamp'])
                if key not in all_data_lookup:
                    all_data_lookup[key] = {}
                
                # Add non-null values from this source
                for col in df_source.columns:
                    try:
                        # Use more robust null checking to handle arrays/scalars
                        col_val = row[col]
                        
                        # Check if value is not null (negate only after reducing
                        # array results, since `not` on a multi-element array raises)
                        is_null = pd.isnull(col_val)
                        if hasattr(is_null, '__len__') and len(is_null) > 1:
                            is_null = is_null.any()  # For arrays, null if any element is null
                        is_not_null = not is_null
                        
                        if is_not_null:
                            all_data_lookup[key][col] = col_val
                    except Exception as e:
                        # Skip problematic columns with a warning
                        print(f"[WARN] Could not process column '{col}' for train lookup: {e}")
                        continue
        
        # Fill nulls in the combined dataframe
        for idx, row in df_all.iterrows():
            key = (row['symbol'], row['interval_timestamp'])
            if key in all_data_lookup:
                lookup_row = all_data_lookup[key]
                for col in df_all.columns:
                    try:
                        # Use more robust null checking
                        row_val = row[col]
                        
                        # Check if row value is null (handle both scalar and array cases)
                        row_is_null = pd.isnull(row_val)
                        if hasattr(row_is_null, '__len__') and len(row_is_null) > 1:
                            row_is_null = row_is_null.any()  # For arrays, check if any are null
                        
                        if row_is_null and col in lookup_row:
                            df_all.at[idx, col] = _cast_value_for_column(df_all[col], lookup_row[col])
                            total_nulls_filled += 1
                    except Exception as e:
                        # Skip problematic columns with a warning
                        print(f"[WARN] Could not process column '{col}' for train null filling: {e}")
                        continue
    
    # 5. Deduplicate by symbol+interval_timestamp, keeping the last occurrence
    df_all = df_all.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last")
    
    # 6. Handle problematic columns that can't be serialized to parquet
    problematic_cols = []
    for col in df_all.columns:
        try:
            # Heuristic: inspect only the first value; non-string array-like
            # cells with more than one element tend to break the parquet write
            sample = df_all[col].iloc[0] if len(df_all) > 0 else None
            if sample is not None and hasattr(sample, '__len__') and not isinstance(sample, str):
                if len(sample) > 1:
                    problematic_cols.append(col)
        except Exception:
            # If we can't even inspect the sample, treat the column as problematic
            problematic_cols.append(col)
    
    if problematic_cols:
        print(f"[WARN] Dropping problematic columns that can't be serialized: {problematic_cols}")
        df_all = df_all.drop(columns=problematic_cols)
    
    # Save to parquet
    df_all.to_parquet(train_path, index=False)
    
    if total_nulls_filled > 0:
        print(f"[OK] Created train file: {train_path} with {len(df_all)} records, filled {total_nulls_filled} nulls")
    else:
        print(f"[OK] Created train file: {train_path} with {len(df_all)} records.")

def create_merged_features():
    """
    Create the main merged_features.parquet file by combining crypto and stock features
    with intelligent null filling between the two datasets.
    """
    crypto_path = os.path.join(MERGED_DIR, "crypto_features.parquet")
    stocks_path = os.path.join(MERGED_DIR, "stocks_features.parquet")
    merged_path = os.path.join(MERGED_DIR, "merged_features.parquet")
    
    dfs_to_merge = []
    
    # Read crypto features
    if os.path.exists(crypto_path):
        df_crypto = pd.read_parquet(crypto_path)
        dfs_to_merge.append(('crypto', df_crypto))
        print(f"[INFO] Loaded crypto features: {len(df_crypto)} rows, {len(df_crypto.columns)} columns")
    else:
        print(f"[WARN] Crypto features not found: {crypto_path}")
    
    # Read stock features  
    if os.path.exists(stocks_path):
        df_stocks = pd.read_parquet(stocks_path)
        dfs_to_merge.append(('stocks', df_stocks))
        print(f"[INFO] Loaded stock features: {len(df_stocks)} rows, {len(df_stocks.columns)} columns")
    else:
        print(f"[WARN] Stock features not found: {stocks_path}")
    
    if not dfs_to_merge:
        print("[ERROR] No feature files found to merge")
        return
    
    if len(dfs_to_merge) == 1:
        # Only one dataset available, just copy it
        df_merged = dfs_to_merge[0][1].copy()
        print(f"[INFO] Only {dfs_to_merge[0][0]} features available")
    else:
        # Multiple datasets - merge with null filling
        print("[INFO] Merging crypto and stock features with cross-dataset null filling")
        
        # Combine all dataframes
        all_dfs = [df for _, df in dfs_to_merge]
        df_merged = pd.concat(all_dfs, ignore_index=True, sort=False)
        
        # Perform cross-dataset null filling
        # Create lookup from all datasets for same symbol+timestamp
        lookup_data = {}
        for dataset_name, df in dfs_to_merge:
            for _, row in df.iterrows():
                key = (row['symbol'], row['interval_timestamp'])
                if key not in lookup_data:
                    lookup_data[key] = {}
                
                # Add non-null values from this dataset
                for col in df.columns:
                    try:
                        # Use more robust null checking to handle arrays/scalars
                        col_val = row[col]
                        
                        # Check if value is not null (negate only after reducing
                        # array results, since `not` on a multi-element array raises)
                        is_null = pd.isnull(col_val)
                        if hasattr(is_null, '__len__') and len(is_null) > 1:
                            is_null = is_null.any()  # For arrays, null if any element is null
                        is_not_null = not is_null
                        
                        if is_not_null:
                            lookup_data[key][col] = col_val
                    except Exception as e:
                        # Skip problematic columns with a warning
                        print(f"[WARN] Could not process column '{col}' for lookup: {e}")
                        continue
        
        # Fill nulls using the comprehensive lookup
        nulls_filled = 0
        for idx, row in df_merged.iterrows():
            key = (row['symbol'], row['interval_timestamp'])
            if key in lookup_data:
                lookup_row = lookup_data[key]
                for col in df_merged.columns:
                    try:
                        # Use more robust null checking
                        row_val = row[col]
                        
                        # Check if row value is null (handle both scalar and array cases)
                        row_is_null = pd.isnull(row_val)
                        if hasattr(row_is_null, '__len__') and len(row_is_null) > 1:
                            row_is_null = row_is_null.any()  # For arrays, check if any are null
                        
                        if row_is_null and col in lookup_row:
                            df_merged.at[idx, col] = _cast_value_for_column(df_merged[col], lookup_row[col])
                            nulls_filled += 1
                    except Exception as e:
                        # Skip problematic columns with a warning
                        print(f"[WARN] Could not process column '{col}' for null filling: {e}")
                        continue
        
        if nulls_filled > 0:
            print(f"[INFO] Cross-dataset null filling: {nulls_filled} values filled")
    
    # Remove duplicates if any (keeping last occurrence)
    initial_len = len(df_merged)
    df_merged = df_merged.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last")
    final_len = len(df_merged)
    
    if initial_len != final_len:
        print(f"[INFO] Removed {initial_len - final_len} duplicate records")
    
    # Save merged features
    df_merged.to_parquet(merged_path, index=False)
    print(f"[OK] Created merged features: {merged_path} with {len(df_merged)} rows, {len(df_merged.columns)} columns")
    
    # Report statistics
    nulls_remaining = df_merged.isnull().sum().sum()
    print(f"[INFO] Merged features null count: {nulls_remaining}")
    
    # Report symbol breakdown
    if 'symbol' in df_merged.columns:
        symbol_counts = df_merged['symbol'].value_counts()
        print(f"[INFO] Top symbols: {dict(symbol_counts.head(10))}")

def main():
    import sys
    
    # Check if this is being run as a test
    if len(sys.argv) > 1 and sys.argv[1] == '--test-null-filling':
        from test_null_filling_merge import main as test_main
        sys.exit(test_main())
    
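    # Fold temp data into the per-asset merged feature files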
    merge_temp_to_merged("crypto_features.parquet", "crypto_features.parquet")
    merge_temp_to_merged("stocks_features.parquet", "stocks_features.parquet")
    
    # Create the main merged features file
    create_merged_features()
    
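    # Build deduplicated training files from archive + features + temp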
    merge_all_to_train("crypto_features.parquet", "crypto_features.parquet", "crypto_features.parquet", "crypto_features_train.parquet")
    merge_all_to_train("stocks_features.parquet", "stocks_features.parquet", "stocks_features.parquet", "stocks_features_train.parquet")

if __name__ == "__main__":
    main()