|
|
|
""" |
|
Time-Shifted Santiment-Crypto Merger |
|
=================================== |
|
|
|
This script handles the case where Santiment data and crypto data have different date ranges |
|
due to API limitations. It performs a time-shifted merge using pattern matching. |
|
|
|
Approaches: |
|
1. Offset-based: Map August crypto data to July Santiment data with consistent offset |
|
2. Day-of-week matching: Match same weekdays/times across different months |
|
3. Pattern-based: Use similar market patterns from different time periods (planned; not implemented in this script)
|
""" |
|
|
|
import pandas as pd |
|
import numpy as np |
|
from datetime import datetime, timedelta |
|
import os |
|
import logging |
|
|
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
|
logger = logging.getLogger(__name__) |
|
|
|
def load_data(crypto_file='data/merged/features/crypto_features.parquet',
              santiment_file='data/santiment/merged_features.parquet'):
    """Load the crypto and Santiment feature sets from parquet files.

    Args:
        crypto_file: Path to the crypto features parquet. Must contain an
            'interval_timestamp' column in epoch milliseconds.
        santiment_file: Path to the Santiment features parquet. Expected to
            carry a datetime index (it is logged via .index.min()/.max()).

    Returns:
        Tuple of (crypto_df, santiment_df). crypto_df gains a UTC-aware
        'datetime' column derived from 'interval_timestamp'.
    """
    logger.info("Loading data files...")

    crypto_df = pd.read_parquet(crypto_file)
    # Timestamps arrive as epoch milliseconds; keep them timezone-aware (UTC).
    crypto_df['datetime'] = pd.to_datetime(crypto_df['interval_timestamp'], unit='ms', utc=True)

    santiment_df = pd.read_parquet(santiment_file)

    logger.info(f"Crypto: {len(crypto_df)} records from {crypto_df['datetime'].min()} to {crypto_df['datetime'].max()}")
    logger.info(f"Santiment: {len(santiment_df)} records from {santiment_df.index.min()} to {santiment_df.index.max()}")

    return crypto_df, santiment_df
|
|
|
def calculate_time_offset(crypto_df, santiment_df):
    """Return the gap between the earliest crypto and Santiment timestamps.

    Positive when the crypto data begins after the Santiment data; the
    resulting Timedelta is what merge_with_offset subtracts from crypto
    timestamps to land inside the Santiment date range.
    """
    earliest_crypto = crypto_df['datetime'].min()
    earliest_santiment = santiment_df.index.min()

    gap = earliest_crypto - earliest_santiment
    logger.info(f"Time offset: {gap.days} days")

    return gap
|
|
|
def merge_with_time_shift(crypto_df, santiment_df, method='offset'):
    """
    Merge crypto and Santiment data using time-shift techniques.

    Args:
        crypto_df: Crypto features DataFrame (needs 'symbol' and 'datetime' columns).
        santiment_df: Santiment features DataFrame (datetime index, 'slug' column).
        method: 'offset' or 'day_of_week'.

    Returns:
        Combined DataFrame of all mergeable symbols, or None when nothing
        could be merged.

    Raises:
        ValueError: for an unsupported `method`. (Previously an unknown
            method fell through and silently returned None.)
    """
    logger.info(f"Starting time-shifted merge using method: {method}")

    # Crypto symbol -> Santiment slug. Identity today, but kept explicit so a
    # differing slug only needs a change here.
    symbol_mapping = {'BTC': 'BTC', 'ETH': 'ETH', 'ADA': 'ADA', 'SOL': 'SOL', 'XRP': 'XRP'}

    # Resolve the per-symbol merge strategy once, up front.
    if method == 'offset':
        offset = calculate_time_offset(crypto_df, santiment_df)
        label = 'offset'

        def _merge_pair(crypto_symbol, santiment_slug):
            return merge_with_offset(crypto_symbol, santiment_slug, offset)
    elif method == 'day_of_week':
        label = 'day-of-week'
        _merge_pair = merge_by_day_pattern
    else:
        # Fail loudly instead of silently producing "no data merged".
        raise ValueError(f"Unsupported merge method: {method!r}")

    merged_results = []
    for symbol, slug in symbol_mapping.items():
        # '->' restores a mojibake-damaged arrow in the original log text.
        logger.info(f"Processing {symbol} -> {slug} with {label} method")

        crypto_symbol = crypto_df[crypto_df['symbol'] == symbol].copy()
        santiment_slug = santiment_df[santiment_df['slug'] == slug].copy()

        if crypto_symbol.empty or santiment_slug.empty:
            logger.warning(f"Skipping {symbol} - missing data")
            continue

        merged_results.append(_merge_pair(crypto_symbol, santiment_slug))

    if not merged_results:
        logger.error("No data could be merged!")
        return None

    merged_df = pd.concat(merged_results, ignore_index=True)
    logger.info(f"Merge completed: {len(merged_df)} records")
    return merged_df
|
|
|
def merge_with_offset(crypto_symbol, santiment_slug, offset):
    """Align each crypto row with the nearest Santiment row after time-shifting.

    Every crypto timestamp is shifted back by `offset` into the Santiment
    date range; the closest Santiment record is accepted only when it lies
    within one hour of the shifted timestamp. Matched Santiment columns
    (except 'slug') are copied onto the crypto row under a 'santiment_'
    prefix. Unmatched crypto rows are dropped.
    """
    tolerance = pd.Timedelta(hours=1)
    feature_cols = [c for c in santiment_slug.columns if c != 'slug']
    enriched_rows = []

    for _, crypto_row in crypto_symbol.iterrows():
        target_time = crypto_row['datetime'] - offset

        deltas = np.abs(santiment_slug.index - target_time)
        if deltas.min() > tolerance:
            continue  # nothing close enough to this shifted timestamp

        match = santiment_slug.iloc[deltas.argmin()]

        enriched = crypto_row.copy()
        for col in feature_cols:
            enriched[f'santiment_{col}'] = match[col]
        enriched_rows.append(enriched)

    return pd.DataFrame(enriched_rows)
|
|
|
def merge_by_day_pattern(crypto_symbol, santiment_slug):
    """Merge by matching day-of-week and closest time-of-day.

    For each crypto row, candidate Santiment rows sharing the same weekday
    are ranked by their time-of-day distance to the crypto timestamp, and
    the closest one is merged in (columns other than 'slug', prefixed with
    'santiment_'). Crypto rows with no same-weekday candidate are dropped.

    Fix over the previous version: the time-of-day distance is now circular,
    so 23:30 vs 00:30 scores 60 minutes rather than 1380 — otherwise rows
    near midnight would match a mid-day record instead of the true nearest
    time of day.
    """
    MINUTES_PER_DAY = 24 * 60
    feature_cols = [c for c in santiment_slug.columns if c != 'slug']
    merged_records = []

    for _, crypto_row in crypto_symbol.iterrows():
        crypto_time = crypto_row['datetime']

        # Restrict candidates to the same weekday (Monday=0 ... Sunday=6).
        santiment_same_weekday = santiment_slug[
            santiment_slug.index.dayofweek == crypto_time.dayofweek
        ]
        if santiment_same_weekday.empty:
            continue

        crypto_minutes = crypto_time.hour * 60 + crypto_time.minute

        def _tod_distance(ts):
            # Circular distance around the 24h clock.
            raw = abs((ts.hour * 60 + ts.minute) - crypto_minutes)
            return min(raw, MINUTES_PER_DAY - raw)

        time_diffs = santiment_same_weekday.index.map(_tod_distance)
        santiment_row = santiment_same_weekday.iloc[time_diffs.argmin()]

        combined_row = crypto_row.copy()
        for col in feature_cols:
            combined_row[f'santiment_{col}'] = santiment_row[col]
        merged_records.append(combined_row)

    return pd.DataFrame(merged_records)
|
|
|
def analyze_merge_quality(merged_df, method):
    """Summarize how complete the merge is, overall and per symbol.

    Returns a dict with the method used, record/feature counts, and — per
    symbol — how many rows carry at least one non-null santiment_* value.
    Returns {"error": ...} when there is nothing to analyze.
    """
    if merged_df is None or merged_df.empty:
        return {"error": "No merged data"}

    santiment_cols = [col for col in merged_df.columns if col.startswith('santiment_')]
    symbols = sorted(merged_df['symbol'].unique())

    per_symbol = {}
    for symbol in symbols:
        subset = merged_df[merged_df['symbol'] == symbol]
        # A row "has santiment" when at least one santiment_* cell is non-null.
        hit_count = (subset[santiment_cols].notna().sum(axis=1) > 0).sum()
        per_symbol[symbol] = {
            'total_records': len(subset),
            'records_with_santiment': hit_count,
            'completeness_pct': hit_count / len(subset) * 100,
        }

    return {
        'method_used': method,
        'total_records': len(merged_df),
        'santiment_features_added': len(santiment_cols),
        'symbols_processed': symbols,
        'completeness_by_symbol': per_symbol,
    }
|
|
|
def save_results(merged_df, analysis, method):
    """Persist the merged features (parquet) and the analysis report (JSON).

    Both files are written under data/merged/features with the method name
    and a run timestamp in the filename. Returns (output_file, analysis_file)
    paths, or (None, None) when there is nothing to save.
    """
    import json

    if merged_df is None:
        logger.error("Cannot save - no merged data")
        return None, None

    logger.info("Saving time-shifted merge results...")

    output_dir = 'data/merged/features'
    os.makedirs(output_dir, exist_ok=True)

    # A per-run timestamp keeps repeated runs from overwriting each other.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    output_file = os.path.join(output_dir, f'crypto_with_santiment_{method}_{stamp}.parquet')
    merged_df.to_parquet(output_file, index=False)
    logger.info(f"Merged features saved to: {output_file}")

    analysis_file = os.path.join(output_dir, f'santiment_merge_analysis_{method}_{stamp}.json')
    # default=str stringifies numpy scalars and timestamps in the analysis.
    with open(analysis_file, 'w') as f:
        json.dump(analysis, f, indent=2, default=str)
    logger.info(f"Analysis saved to: {analysis_file}")

    return output_file, analysis_file
|
|
|
def main():
    """Run the time-shifted Santiment-crypto merge end to end.

    Loads both datasets, tries each supported merge strategy, saves every
    successful result, and prints a console summary. Any failure is logged
    and re-raised.
    """
    logger.info("Starting time-shifted Santiment-Crypto merge...")

    try:
        crypto_df, santiment_df = load_data()

        methods = ['offset', 'day_of_week']
        results = {}

        for method in methods:
            logger.info(f"\n{'='*50}")
            logger.info(f"TRYING METHOD: {method.upper()}")
            logger.info(f"{'='*50}")

            merged_df = merge_with_time_shift(crypto_df, santiment_df, method=method)
            analysis = analyze_merge_quality(merged_df, method)

            if merged_df is not None:
                output_file, analysis_file = save_results(merged_df, analysis, method)
                results[method] = {
                    'success': True,
                    'records': len(merged_df),
                    'completeness': analysis.get('completeness_by_symbol', {}),
                    'output_file': output_file,
                }
            else:
                results[method] = {'success': False}

        # Console summary of every attempted method.
        print("\n" + "=" * 60)
        print("TIME-SHIFTED MERGE SUMMARY")
        print("=" * 60)

        for method, result in results.items():
            print(f"\n{method.upper()} METHOD:")
            if result['success']:
                # NOTE(review): the original status markers were
                # mojibake-damaged (and one literal was split mid-string);
                # restored as plain ASCII markers.
                print(f"  [OK] Success: {result['records']} records merged")
                print(f"  File: {result['output_file']}")
                for symbol, stats in result['completeness'].items():
                    print(f"    {symbol}: {stats['completeness_pct']:.1f}% complete")
            else:
                print("  [FAILED]")

        print("=" * 60)

    except Exception as e:
        logger.error(f"Time-shifted merge failed: {e}")
        raise
|
|