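"""Merge Finviz sentiment data into stock and crypto feature sets.

Loads per-ticker sentiment parquet files, normalizes heterogeneous
timestamp columns (epoch seconds/ms/us/ns, strings, tz-aware datetimes)
to naive datetime64, and asof-merges the nearest sentiment row onto each
feature row within a tolerance window. Paths resolve from DATA_DIR so
the script runs both locally and inside a container.
"""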
import os
import pandas as pd
from pathlib import Path

# Resolve DATA_DIR from config (container-safe) with fallback
try:
    from src.config import DATA_DIR as CFG_DATA_DIR  # when run as module
except Exception:
    try:
        from config import DATA_DIR as CFG_DATA_DIR  # when run as script from src/
    except Exception:
        CFG_DATA_DIR = "/data"

class FixedTimestampHandler:
    def __init__(self, base_path: str | os.PathLike | None = None):
        # Prefer explicit argument, then DATA_DIR env, then config fallback
        resolved_base = base_path or os.getenv("DATA_DIR") or CFG_DATA_DIR
        self.base_path = Path(resolved_base)
        self.finviz_path = self.base_path / "finviz" / "sentiment"
        self.crypto_features_path = self.base_path / "merged" / "features" / "crypto_features.parquet"
        self.stocks_features_path = self.base_path / "merged" / "features" / "stocks_features.parquet"
        self.output_path = self.base_path / "merged" / "features"
        self.output_path.mkdir(parents=True, exist_ok=True)
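        # Resulting layout under the data root:
        #   <base>/finviz/sentiment/<TICKER>_sentiment.parquet
        #   <base>/merged/features/{stocks,crypto}_features.parquet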
        # Diagnostics go to stdout via print with [LEVEL] prefixes
        # Define tickers and mappings
        self.stock_tickers = ["AAPL", "TSLA", "GOOGL", "NVDA", "MSFT", "COIN"]
        self.crypto_ticker_mapping = {
            "BTC": "bitcoin",
            "ETH": "ethereum",
            "SOL": "solana",
            "XRP": "ripple",
            "ADA": "cardano"
        }
        # Reverse mapping: crypto name to ticker (all lowercase keys)
        self.crypto_name_to_ticker = {v.lower(): k for k, v in self.crypto_ticker_mapping.items()}

    def crypto_name_to_symbol(self, name):
        """Transform crypto name (e.g., 'bitcoin', 'Bitcoin', 'BITCOIN') to ticker symbol (e.g., 'BTC')"""
        if not isinstance(name, str):
            return None
        name_lower = name.strip().lower()
        # Try exact match
        if name_lower in self.crypto_name_to_ticker:
            return self.crypto_name_to_ticker[name_lower]
        # Try to match ignoring spaces and underscores
        for key in self.crypto_name_to_ticker:
            if name_lower.replace(' ', '').replace('_', '') == key.replace(' ', '').replace('_', ''):
                return self.crypto_name_to_ticker[key]
        return None
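
    # e.g. crypto_name_to_symbol("Bitcoin") -> "BTC";
    #      crypto_name_to_symbol("shiba inu") -> None (not in the mapping)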

    def is_timestamp_column(self, df, col_name):
        """Determine if a column is likely a timestamp column"""
        if pd.api.types.is_datetime64_any_dtype(df[col_name]):
            return True
        if pd.api.types.is_numeric_dtype(df[col_name]):
            sample_vals = df[col_name].dropna()
            if len(sample_vals) == 0:
                return False
            sample_val = sample_vals.iloc[0]
            current_time = pd.Timestamp.now().timestamp()
            # Treat the value as a plausible epoch timestamp if, under some
            # unit (s, ms, us, ns), it lands within ~10 years of now
            for divisor in (1, 1_000, 1_000_000, 1_000_000_000):
                try:
                    if abs(sample_val / divisor - current_time) < (10 * 365 * 24 * 3600):
                        return True
                except Exception:
                    continue
        if df[col_name].dtype == 'object':
            # Guard against all-NaN object columns before sampling
            non_null = df[col_name].dropna()
            sample_val = non_null.iloc[0] if not non_null.empty else None
            if isinstance(sample_val, str) and sample_val:
                try:
                    pd.to_datetime(sample_val)
                    return True
                except (ValueError, TypeError):
                    pass
        return False

    def get_timestamp_columns(self, df):
        """Identify all timestamp columns in a dataframe"""
        timestamp_cols = []
        potential_names = ['time', 'date', 'interval', 'timestamp', 'dt']
        for col in df.columns:
            if any(keyword in col.lower() for keyword in potential_names):
                if self.is_timestamp_column(df, col):
                    timestamp_cols.append(col)
        return timestamp_cols

    def convert_timestamp_column(self, df, col_name, unit='auto'):
        """Convert a timestamp column to naive datetime64, auto-detecting the epoch unit when unit='auto'"""
        if pd.api.types.is_datetime64_any_dtype(df[col_name]):
            if df[col_name].dt.tz is not None:
                return df[col_name].dt.tz_localize(None)
            return df[col_name]
        if pd.api.types.is_numeric_dtype(df[col_name]):
            sample_vals = df[col_name].dropna()
            if len(sample_vals) == 0:
                print(f"[ERROR] No valid values in timestamp column {col_name}")
                return None
            # Nullable Int64 trips up pd.to_datetime; cast to float64 (pd.NA -> NaN)
            values_to_convert = df[col_name]
            if str(values_to_convert.dtype).startswith('Int'):
                values_to_convert = values_to_convert.astype('float64')
                sample_vals = sample_vals.astype('float64')
            if unit == 'auto':
                # Probe each epoch unit on one sample value and keep the unit
                # whose interpretation lands closest to the current time
                best_unit = None
                best_distance = float('inf')
                first_val = sample_vals.iloc[0]
                for test_unit in ('s', 'ms', 'us', 'ns'):
                    try:
                        test_ts = pd.to_datetime(first_val, unit=test_unit)
                        distance = abs((pd.Timestamp.now() - test_ts).total_seconds())
                        if distance < best_distance:
                            best_distance = distance
                            best_unit = test_unit
                    except Exception as e:
                        print(f"[DEBUG] Unit {test_unit} failed for {col_name}: {e}")
                        continue
                if best_unit is None:
                    print(f"[ERROR] Could not determine unit for column {col_name}")
                    return None
                unit = best_unit
                print(f"[INFO] Auto-detected unit for {col_name}: {unit}")
            try:
                converted = pd.to_datetime(values_to_convert, unit=unit)
                if converted.dt.tz is not None:
                    converted = converted.dt.tz_localize(None)
                if converted.min().year < 2000:
                    print(f"[WARN] Converted timestamps for {col_name} seem too old; checking alternative units")
                    for alt_unit in ('s', 'ms', 'us', 'ns'):
                        if alt_unit == unit:
                            continue
                        try:
                            alt_converted = pd.to_datetime(values_to_convert, unit=alt_unit)
                            if alt_converted.min().year > 2000:
                                print(f"[INFO] Alternative unit {alt_unit} gives better results for {col_name}")
                                converted = alt_converted
                                break
                        except Exception as e:
                            print(f"[DEBUG] Alternative unit {alt_unit} failed for {col_name}: {e}")
                            continue
                print(f"[INFO] Successfully converted {col_name} using unit '{unit}'")
                print(f"[INFO] Date range: {converted.min()} to {converted.max()}")
                return converted
            except Exception as e:
                print(f"[ERROR] Failed to convert {col_name} using unit '{unit}': {e}")
                return None
        if df[col_name].dtype == 'object':
            try:
                converted = pd.to_datetime(df[col_name])
                if converted.dt.tz is not None:
                    converted = converted.dt.tz_localize(None)
                print(f"[INFO] Successfully converted string timestamps in {col_name}")
                return converted
            except Exception as e:
                print(f"[ERROR] Failed to convert string timestamps in {col_name}: {e}")
                return None
        print(f"[ERROR] Unknown timestamp format in column {col_name}")
        return None
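
    # Example with made-up data: the value 1_700_000_000_000 decodes as 'ms'
    # (late 2023). Read as seconds it overflows pandas' datetime range
    # (Timestamp.max is mid-2262), so that probe is skipped and 'ms' wins.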

    def select_best_timestamp_column(self, df, timestamp_columns):
        """Select the best timestamp column from a list of potential columns"""
        best_col = None
        best_score = -1
        for col in timestamp_columns:
            try:
                if col not in df.columns:
                    print(f"[WARN] Column {col} not found in dataframe")
                    continue
                if df[col].isnull().all():
                    print(f"[WARN] Column {col} contains only null values")
                    continue
                converted = self.convert_timestamp_column(df, col)
                if converted is None:
                    print(f"[WARN] Could not convert column {col} to timestamp")
                    continue
                non_null_count = converted.notna().sum()
                recent_count = converted[converted > pd.Timestamp('2020-01-01')].count()
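                # Post-2020 rows count double, so a column decoding to stale
                # dates loses to one decoding to recent ones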
                score = non_null_count + recent_count * 2
                print(f"[DEBUG] Column {col}: score={score}, non_null={non_null_count}, recent={recent_count}")
                if score > best_score:
                    best_score = score
                    best_col = col
            except Exception as e:
                print(f"[WARN] Error evaluating timestamp column {col}: {e}")
                continue
        print(f"[INFO] Best timestamp column: {best_col} (score: {best_score})")
        return best_col

    def load_sentiment_data(self, symbol):
        """Load sentiment data with proper timestamp handling"""
        sentiment_file = self.finviz_path / f"{symbol.upper()}_sentiment.parquet"
        if not sentiment_file.exists():
            print(f"[WARN] Sentiment file not found: {sentiment_file}")
            return None
        try:
            df = pd.read_parquet(sentiment_file)
            print(f"[INFO] Loaded sentiment data for {symbol}: {len(df)} rows")
            timestamp_cols = self.get_timestamp_columns(df)
            if not timestamp_cols:
                print(f"[ERROR] No timestamp columns found in {symbol} sentiment data")
                return None
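            # Use the first candidate (sentiment files are assumed to carry a single time column)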
            timestamp_col = timestamp_cols[0]
            converted = self.convert_timestamp_column(df, timestamp_col)
            if converted is None:
                print(f"[ERROR] Could not convert timestamp column {timestamp_col} in {symbol}")
                return None
            df['sentiment_timestamp'] = converted
            df['symbol'] = symbol.upper()
            return df
        except Exception as e:
            print(f"[ERROR] Error loading sentiment data for {symbol}: {e}")
            return None

    def load_features_data(self, data_type='stocks'):
        """Load features data with improved timestamp handling"""
        file_path = self.stocks_features_path if data_type == 'stocks' else self.crypto_features_path
        if not file_path.exists():
            print(f"[ERROR] Features file not found: {file_path}")
            return None
        try:
            df = pd.read_parquet(file_path)
            print(f"[INFO] Loaded {data_type} features: {len(df)} rows")
            potential_timestamp_cols = [col for col in df.columns if any(keyword in col.lower() for keyword in ['time', 'date', 'interval', 'timestamp', 'dt'])]
            print(f"[INFO] Potential timestamp columns: {potential_timestamp_cols}")
            
            # Safer timestamp detection
            timestamp_cols = []
            for col in potential_timestamp_cols:
                try:
                    is_ts = self.is_timestamp_column(df, col)
                    if is_ts:
                        timestamp_cols.append(col)
                        print(f"[DEBUG] {col} confirmed as timestamp column")
                    else:
                        print(f"[DEBUG] {col} rejected as timestamp column")
                except Exception as e:
                    print(f"[WARN] Error checking {col}: {e}")
                    continue
            
            print(f"[INFO] Confirmed timestamp columns: {timestamp_cols}")
            if not timestamp_cols:
                print(f"[ERROR] No valid timestamp columns found in {data_type} features")
                return None
            best_col = self.select_best_timestamp_column(df, timestamp_cols)
            if best_col is None:
                print(f"[ERROR] Could not select a valid timestamp column from {timestamp_cols}")
                return None
            converted = self.convert_timestamp_column(df, best_col)
            if converted is None:
                print(f"[ERROR] Failed to convert selected timestamp column {best_col}")
                return None
            df['feature_timestamp'] = converted
            print(f"[INFO] Selected timestamp column: {best_col}")
            print(f"[INFO] Date range: {converted.min()} to {converted.max()}")
            return df
        except Exception as e:
            import traceback
            print(f"[ERROR] Error loading {data_type} features: {e}")
            print(f"[ERROR] Traceback: {traceback.format_exc()}")
            return None

    def merge_sentiment_to_features(self, features_df, sentiment_df, tolerance_minutes=60*12):
        """Merge sentiment data INTO features data based on closest timestamp, with tolerance window"""
        features_sorted = features_df.sort_values(by='feature_timestamp')
        sentiment_sorted = sentiment_df.sort_values(by='sentiment_timestamp')

        # Use a tolerance window for timestamp matching
        tolerance = pd.Timedelta(minutes=tolerance_minutes)
        merged_df = pd.merge_asof(
            features_sorted,
            sentiment_sorted,
            left_on='feature_timestamp',
            right_on='sentiment_timestamp',
            direction='nearest',
            tolerance=tolerance
        )

        # If no sentiment match within tolerance, sentiment_score will be NaN
        if 'sentiment_score' in merged_df.columns:
            unmatched = merged_df['sentiment_score'].isna().sum()
            print(f"[INFO] Rows with no sentiment match (NaN sentiment_score): {unmatched}")

        print(f"[INFO] Merged {len(features_df)} feature rows with {len(sentiment_df)} sentiment rows using tolerance {tolerance_minutes} min")
        print(f"[INFO] Result: {len(merged_df)} rows")
        return merged_df
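
    # Toy illustration of the asof-merge (hypothetical frames):
    #   features:  feature_timestamp   = [2024-01-01 00:00, 2024-01-01 13:00]
    #   sentiment: sentiment_timestamp = [2024-01-01 00:30], sentiment_score = [0.8]
    # With the default 12 h tolerance the 00:00 row picks up 0.8 (nearest match,
    # 30 min away) while the 13:00 row gets NaN (12.5 h away, outside tolerance).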

    def process_stocks_data(self):
        """Process all stocks data by merging finviz sentiment into stock features"""
        print("[INFO] Processing stocks data...")
        
        # Load stocks features first (this is the base dataset)
        stocks_df = self.load_features_data('stocks')
        if stocks_df is None:
            print("[ERROR] Failed to load stocks features data")
            return None
        
        # Check which symbol column is present and what symbols the data holds
        if 'symbol' in stocks_df.columns:
            unique_symbols = stocks_df['symbol'].unique()
        elif 'ticker' in stocks_df.columns:
            unique_symbols = stocks_df['ticker'].unique()
        else:
            unique_symbols = []
        
        print(f"[INFO] Available symbols in stocks features: {unique_symbols}")
        
        # Check if any sentiment files exist
        if not self.finviz_path.exists():
            print(f"[WARN] Finviz sentiment directory does not exist: {self.finviz_path}")
            print(f"[WARN] Proceeding without sentiment data merge for stocks")
            # Save features as-is without sentiment merge
            output_file = self.output_path / "stocks_features.parquet"
            stocks_df.to_parquet(output_file)
            print(f"[INFO] Stocks features saved without sentiment to: {output_file}")
            return stocks_df
            
        # Check if any sentiment files exist for our tickers
        sentiment_files_exist = any(
            (self.finviz_path / f"{ticker.upper()}_sentiment.parquet").exists() 
            for ticker in self.stock_tickers
        )
        
        if not sentiment_files_exist:
            print(f"[WARN] No sentiment files found for any stock tickers: {self.stock_tickers}")
            print(f"[WARN] Proceeding without sentiment data merge for stocks")
            # Save features as-is without sentiment merge
            output_file = self.output_path / "stocks_features.parquet"
            stocks_df.to_parquet(output_file)
            print(f"[INFO] Stocks features saved without sentiment to: {output_file}")
            return stocks_df
        
        merged_stocks_list = []
        
        for ticker in self.stock_tickers:
            print(f"[INFO] Processing stock ticker: {ticker}")
            
            # Load sentiment data for this ticker
            sentiment_df = self.load_sentiment_data(ticker)
            if sentiment_df is None:
                print(f"[WARN] No sentiment data for {ticker}, skipping...")
                continue
            
            # Filter stocks features for this ticker
            ticker_stocks = None
            if 'symbol' in stocks_df.columns:
                ticker_stocks = stocks_df[stocks_df['symbol'] == ticker].copy()
            elif 'ticker' in stocks_df.columns:
                ticker_stocks = stocks_df[stocks_df['ticker'] == ticker].copy()
            
            if ticker_stocks is None or len(ticker_stocks) == 0:
                print(f"[WARN] No feature data found for ticker {ticker} - skipping this ticker")
                continue
            
            print(f"[INFO] Found {len(ticker_stocks)} feature rows for {ticker}")
            
            # Merge sentiment INTO features
            merged_ticker = self.merge_sentiment_to_features(ticker_stocks, sentiment_df)
            
            # Remove symbol_y and replace symbol_x with symbol
            if 'symbol_y' in merged_ticker.columns:
                merged_ticker = merged_ticker.drop(columns=['symbol_y'])
            if 'symbol_x' in merged_ticker.columns:
                merged_ticker = merged_ticker.rename(columns={'symbol_x': 'symbol'})
            
            # Re-order columns: symbol first, interval_timestamp second (if present)
            cols = list(merged_ticker.columns)
            if 'symbol' in cols:
                cols.remove('symbol')
                new_order = ['symbol']
                if 'interval_timestamp' in cols:
                    cols.remove('interval_timestamp')
                    new_order.append('interval_timestamp')
                new_order += cols
                merged_ticker = merged_ticker[new_order]
            merged_stocks_list.append(merged_ticker)
        
        if not merged_stocks_list:
            print("[WARN] No stocks data was successfully merged with sentiment")
            print("[WARN] Saving original stocks features without sentiment")
            output_file = self.output_path / "stocks_features.parquet"
            stocks_df.to_parquet(output_file)
            print(f"[INFO] Stocks features saved without sentiment to: {output_file}")
            return stocks_df
        
        # Combine all merged stock data
        final_stocks_df = pd.concat(merged_stocks_list, ignore_index=True)
        
        # Save the result
        output_file = self.output_path / "stocks_features.parquet"
        final_stocks_df.to_parquet(output_file)
        print(f"[INFO] Stocks data with sentiment saved to: {output_file}")
        
        return final_stocks_df

    def process_crypto_data(self):
        """Process all crypto data by merging finviz sentiment into crypto features"""
        print("[INFO] Processing crypto data...")
        
        # Load crypto features first (this is the base dataset)
        crypto_df = self.load_features_data('crypto')
        if crypto_df is None:
            print("[ERROR] Failed to load crypto features data")
            return None
        
        # Check for various possible symbol/ticker columns
        symbol_columns = [col for col in crypto_df.columns if any(keyword in col.lower() 
                         for keyword in ['symbol', 'ticker', 'name', 'id', 'coin'])]
        
        print(f"[INFO] Available symbol columns in crypto: {symbol_columns}")
        
        # Try to identify unique values in potential symbol columns
        for col in symbol_columns:
            if crypto_df[col].dtype == 'object':
                unique_values = crypto_df[col].unique()[:10]  # Show first 10 unique values
                print(f"[INFO] Sample values in {col}: {unique_values}")
        
        # Check if any sentiment files exist
        if not self.finviz_path.exists():
            print(f"[WARN] Finviz sentiment directory does not exist: {self.finviz_path}")
            print(f"[WARN] Proceeding without sentiment data merge for crypto")
            # Save features as-is without sentiment merge
            output_file = self.output_path / "crypto_features.parquet"
            crypto_df.to_parquet(output_file)
            print(f"[INFO] Crypto features saved without sentiment to: {output_file}")
            return crypto_df
            
        # Check if any sentiment files exist for our crypto tickers
        sentiment_files_exist = any(
            (self.finviz_path / f"{ticker.upper()}_sentiment.parquet").exists() 
            for ticker in self.crypto_ticker_mapping.keys()
        )
        
        if not sentiment_files_exist:
            print(f"[WARN] No sentiment files found for any crypto tickers: {list(self.crypto_ticker_mapping.keys())}")
            print(f"[WARN] Proceeding without sentiment data merge for crypto")
            # Save features as-is without sentiment merge
            output_file = self.output_path / "crypto_features.parquet"
            crypto_df.to_parquet(output_file)
            print(f"[INFO] Crypto features saved without sentiment to: {output_file}")
            return crypto_df
        
        merged_crypto_list = []
        
        for crypto_ticker, crypto_name in self.crypto_ticker_mapping.items():
            print(f"[INFO] Processing crypto ticker: {crypto_ticker} (name: {crypto_name})")
            
            # Load sentiment data for this crypto ticker
            sentiment_df = self.load_sentiment_data(crypto_ticker)
            if sentiment_df is None:
                print(f"[WARN] No sentiment data for {crypto_ticker}, skipping...")
                continue
            
            # Try different approaches to filter crypto features
            ticker_crypto = None
            
            # Approach 1: exact ticker match (string columns only; calling .str
            # on a numeric column such as a plain 'id' would raise AttributeError)
            for col in ['symbol', 'ticker', 'coin_id', 'id']:
                if col in crypto_df.columns and crypto_df[col].dtype == 'object':
                    matches = crypto_df[crypto_df[col].str.upper() == crypto_ticker].copy()
                    if len(matches) > 0:
                        ticker_crypto = matches
                        print(f"[INFO] Found {len(matches)} rows matching {crypto_ticker} in column '{col}'")
                        break
            
            # Approach 2: exact crypto name match
            if ticker_crypto is None or len(ticker_crypto) == 0:
                for col in ['name', 'coin_name']:
                    if col in crypto_df.columns and crypto_df[col].dtype == 'object':
                        matches = crypto_df[crypto_df[col].str.lower() == crypto_name.lower()].copy()
                        if len(matches) > 0:
                            ticker_crypto = matches
                            print(f"[INFO] Found {len(matches)} rows matching {crypto_name} in column '{col}'")
                            break
            
            # Approach 3: Try partial matching (in case of different formats)
            if ticker_crypto is None or len(ticker_crypto) == 0:
                for col in symbol_columns:
                    if crypto_df[col].dtype == 'object':
                        # Try case-insensitive contains match
                        matches = crypto_df[crypto_df[col].str.contains(crypto_ticker, case=False, na=False)].copy()
                        if len(matches) > 0:
                            ticker_crypto = matches
                            print(f"[INFO] Found {len(matches)} rows with partial match for {crypto_ticker} in column '{col}'")
                            break
                        
                        # Try crypto name partial match
                        matches = crypto_df[crypto_df[col].str.contains(crypto_name, case=False, na=False)].copy()
                        if len(matches) > 0:
                            ticker_crypto = matches
                            print(f"[INFO] Found {len(matches)} rows with partial match for {crypto_name} in column '{col}'")
                            break
            
            if ticker_crypto is None or len(ticker_crypto) == 0:
                print(f"[WARN] No feature data found for crypto {crypto_ticker} ({crypto_name}) - skipping this crypto")
                continue
            
            # Merge sentiment INTO features
            merged_ticker = self.merge_sentiment_to_features(ticker_crypto, sentiment_df)
            
            # Remove symbol_x and replace symbol_y with symbol
            if 'symbol_x' in merged_ticker.columns:
                merged_ticker = merged_ticker.drop(columns=['symbol_x'])
            if 'symbol_y' in merged_ticker.columns:
                merged_ticker = merged_ticker.rename(columns={'symbol_y': 'symbol'})
            
            # Remove duplicate 'symbol' columns if any
            symbol_cols = [col for col in merged_ticker.columns if col == 'symbol']
            if len(symbol_cols) > 1:
                # Keep only the first 'symbol' column
                # This will drop all but the first occurrence
                merged_ticker = merged_ticker.loc[:, ~merged_ticker.columns.duplicated()]
            
            # Re-order columns: symbol first, interval_timestamp second (if present)
            cols = list(merged_ticker.columns)
            if 'symbol' in cols:
                cols.remove('symbol')
                new_order = ['symbol']
                if 'interval_timestamp' in cols:
                    cols.remove('interval_timestamp')
                    new_order.append('interval_timestamp')
                new_order += cols
                merged_ticker = merged_ticker[new_order]
            merged_crypto_list.append(merged_ticker)
        
        if not merged_crypto_list:
            print("[WARN] No crypto data was successfully merged with sentiment")
            print("[WARN] Saving original crypto features without sentiment")
            output_file = self.output_path / "crypto_features.parquet"
            crypto_df.to_parquet(output_file)
            print(f"[INFO] Crypto features saved without sentiment to: {output_file}")
            return crypto_df
        
        # Combine all merged crypto data
        final_crypto_df = pd.concat(merged_crypto_list, ignore_index=True)
        
        # Save the result
        output_file = self.output_path / "crypto_features.parquet"
        final_crypto_df.to_parquet(output_file)
        print(f"[INFO] Crypto data with sentiment saved to: {output_file}")
        
        return final_crypto_df

    def process_all_data(self):
        """Process both stocks and crypto data"""
        #logging.info("Starting data processing for all assets...")
        
        stocks_result = self.process_stocks_data()
        crypto_result = self.process_crypto_data()
        
        if stocks_result is not None:
            print(f"[OK] Stocks processing completed: {len(stocks_result)} rows")
        else:
            print("[ERROR] Stocks processing failed")

        if crypto_result is not None:
            print(f"[OK] Crypto processing completed: {len(crypto_result)} rows")
        else:
            print("[ERROR] Crypto processing failed")

        return stocks_result, crypto_result

# Example usage
if __name__ == "__main__":
    handler = FixedTimestampHandler()
    
    # Smoke-test individual loaders before the full run
    print("[INFO] Testing sentiment data loading...")
    handler.load_sentiment_data("AAPL")
    handler.load_features_data('stocks')
    
    # Process all data
    handler.process_all_data()
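
    # To run against a different data root (path and script name illustrative):
    #   DATA_DIR=/tmp/marketdata python fixed_timestamp_handler.py
    # or construct the handler explicitly:
    #   FixedTimestampHandler(base_path="/tmp/marketdata").process_all_data()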