#!/usr/bin/env python3
"""
Time-Shifted Santiment-Crypto Merger
===================================
This script handles the case where Santiment data and crypto data have different date ranges
due to API limitations. It performs a time-shifted merge using pattern matching.
Approaches:
1. Offset-based: Map August crypto data to July Santiment data with consistent offset
2. Day-of-week matching: Match same weekdays/times across different months
3. Pattern-based: Use similar market patterns from different time periods
   (not wired into main(); an illustrative sketch follows merge_by_day_pattern)
"""
import json
import logging
import os
from datetime import datetime

import numpy as np
import pandas as pd

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

def load_data():
    """Load crypto and Santiment data"""
    logger.info("Loading data files...")

    # Load crypto features; interval_timestamp is epoch milliseconds
    crypto_file = 'data/merged/features/crypto_features.parquet'
    crypto_df = pd.read_parquet(crypto_file)
    crypto_df['datetime'] = pd.to_datetime(crypto_df['interval_timestamp'], unit='ms', utc=True)

    # Load Santiment features (expected to carry a datetime index)
    santiment_file = 'data/santiment/merged_features.parquet'
    santiment_df = pd.read_parquet(santiment_file)

    logger.info(f"Crypto: {len(crypto_df)} records from {crypto_df['datetime'].min()} to {crypto_df['datetime'].max()}")
    logger.info(f"Santiment: {len(santiment_df)} records from {santiment_df.index.min()} to {santiment_df.index.max()}")
    return crypto_df, santiment_df

def calculate_time_offset(crypto_df, santiment_df):
    """Calculate the time offset between datasets"""
    # Assumes both datetime axes are tz-aware UTC; mixing naive and aware
    # timestamps would raise a TypeError here
    crypto_start = crypto_df['datetime'].min()
    santiment_start = santiment_df.index.min()
    offset = crypto_start - santiment_start
    logger.info(f"Time offset: {offset.days} days")
    return offset
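
# Worked example (illustrative numbers, not taken from the real data): if the
# crypto feed starts 2024-08-01 00:00 UTC and the Santiment feed starts
# 2024-07-01 00:00 UTC, calculate_time_offset returns a 31-day Timedelta, and
# every crypto timestamp is shifted back 31 days before matching.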

def merge_with_time_shift(crypto_df, santiment_df, method='offset'):
    """
    Merge crypto and Santiment data using time-shift techniques

    Args:
        crypto_df: Crypto features DataFrame
        santiment_df: Santiment features DataFrame
        method: 'offset' or 'day_of_week' ('pattern' is described in the
            module docstring but only sketched below, not implemented here)
    """
    logger.info(f"Starting time-shifted merge using method: {method}")
    merged_results = []
    symbol_mapping = {'BTC': 'BTC', 'ETH': 'ETH', 'ADA': 'ADA', 'SOL': 'SOL', 'XRP': 'XRP'}

    if method == 'offset':
        # Calculate a single consistent time offset for all symbols
        offset = calculate_time_offset(crypto_df, santiment_df)
        for symbol, slug in symbol_mapping.items():
            logger.info(f"Processing {symbol} -> {slug} with offset method")
            crypto_symbol = crypto_df[crypto_df['symbol'] == symbol].copy()
            santiment_slug = santiment_df[santiment_df['slug'] == slug].copy()
            if crypto_symbol.empty or santiment_slug.empty:
                logger.warning(f"Skipping {symbol} - missing data")
                continue
            # Apply the offset to align the two timeframes
            merged_symbol = merge_with_offset(crypto_symbol, santiment_slug, offset)
            merged_results.append(merged_symbol)
    elif method == 'day_of_week':
        # Match records that share a day-of-week and time-of-day pattern
        for symbol, slug in symbol_mapping.items():
            logger.info(f"Processing {symbol} -> {slug} with day-of-week method")
            crypto_symbol = crypto_df[crypto_df['symbol'] == symbol].copy()
            santiment_slug = santiment_df[santiment_df['slug'] == slug].copy()
            if crypto_symbol.empty or santiment_slug.empty:
                logger.warning(f"Skipping {symbol} - missing data")
                continue
            merged_symbol = merge_by_day_pattern(crypto_symbol, santiment_slug)
            merged_results.append(merged_symbol)
    else:
        raise ValueError(f"Unknown merge method: {method}")

    # Combine per-symbol results
    if merged_results:
        merged_df = pd.concat(merged_results, ignore_index=True)
        logger.info(f"Merge completed: {len(merged_df)} records")
        return merged_df
    else:
        logger.error("No data could be merged!")
        return None

def merge_with_offset(crypto_symbol, santiment_slug, offset):
    """Merge using a consistent time offset"""
    merged_records = []
    for _, crypto_row in crypto_symbol.iterrows():
        # Shift the crypto timestamp back by the offset to land in the
        # Santiment timeframe (assumes both axes are tz-aware UTC)
        shifted_time = crypto_row['datetime'] - offset
        # Find the closest Santiment record by absolute time difference
        time_diffs = np.abs(santiment_slug.index - shifted_time)
        closest_pos = time_diffs.argmin()
        closest_idx = santiment_slug.index[closest_pos]
        # Only accept the match if it is within one hour
        if time_diffs.min() <= pd.Timedelta(hours=1):
            santiment_row = santiment_slug.loc[closest_idx]
            # Copy every Santiment column (except the slug key) onto the crypto row
            combined_row = crypto_row.copy()
            for col in santiment_slug.columns:
                if col != 'slug':
                    combined_row[f'santiment_{col}'] = santiment_row[col]
            merged_records.append(combined_row)
    return pd.DataFrame(merged_records)
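

# A minimal vectorized alternative (a sketch, not part of the original flow):
# the row-by-row loop above is O(len(crypto) * len(santiment)); pandas'
# merge_asof performs the same nearest-timestamp match with a tolerance in a
# single call. Assumes both frames share tz-aware UTC datetime axes.
def merge_with_offset_asof(crypto_symbol, santiment_slug, offset):
    """Sketch: vectorized equivalent of merge_with_offset via pd.merge_asof."""
    left = crypto_symbol.copy()
    left['shifted_time'] = left['datetime'] - offset
    left = left.sort_values('shifted_time')
    # Drop the slug key and prefix the remaining Santiment columns up front
    right = santiment_slug.drop(columns=['slug'], errors='ignore')
    right = right.add_prefix('santiment_').sort_index()
    merged = pd.merge_asof(
        left, right,
        left_on='shifted_time', right_index=True,
        direction='nearest', tolerance=pd.Timedelta(hours=1),
    )
    # Rows with no Santiment match inside the tolerance come back all-NaN;
    # dropping them mirrors merge_with_offset's behavior
    return merged.dropna(subset=list(right.columns), how='all').drop(columns=['shifted_time'])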

def merge_by_day_pattern(crypto_symbol, santiment_slug):
    """Merge by matching day-of-week and time-of-day patterns"""
    merged_records = []
    for _, crypto_row in crypto_symbol.iterrows():
        crypto_time = crypto_row['datetime']
        # Restrict candidates to Santiment records on the same weekday
        santiment_same_weekday = santiment_slug[
            santiment_slug.index.dayofweek == crypto_time.dayofweek
        ]
        if not santiment_same_weekday.empty:
            # Find the closest time-of-day match, measured in minutes
            # (note: this does not wrap around midnight, so 23:59 and 00:01
            # are treated as far apart)
            crypto_minutes = crypto_time.hour * 60 + crypto_time.minute
            time_diffs = santiment_same_weekday.index.map(
                lambda x: abs((x.hour * 60 + x.minute) - crypto_minutes)
            )
            closest_pos = time_diffs.argmin()
            closest_idx = santiment_same_weekday.index[closest_pos]
            santiment_row = santiment_same_weekday.loc[closest_idx]
            # Copy every Santiment column (except the slug key) onto the crypto row
            combined_row = crypto_row.copy()
            for col in santiment_slug.columns:
                if col != 'slug':
                    combined_row[f'santiment_{col}'] = santiment_row[col]
            merged_records.append(combined_row)
    return pd.DataFrame(merged_records)
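

# Sketch of the pattern-based approach from the module docstring (approach 3),
# which is otherwise unimplemented. The idea: instead of assuming the offset
# equals the gap between start dates, scan candidate day offsets and keep the
# one where the normalized crypto series and a chosen Santiment series
# correlate best, then reuse merge_with_offset. The 'close' crypto column, the
# hourly cadence, and min_overlap=168 (one week of hours) are assumptions
# about the data, not guarantees.
def merge_by_market_pattern(crypto_symbol, santiment_slug, santiment_col, min_overlap=168):
    """Sketch: choose the day offset whose overlapping windows correlate best."""
    crypto_series = crypto_symbol.set_index('datetime')['close'].sort_index()
    santiment_series = santiment_slug[santiment_col].sort_index()
    # Normalize both series so correlation compares shape, not scale
    c = (crypto_series - crypto_series.mean()) / crypto_series.std()
    s = (santiment_series - santiment_series.mean()) / santiment_series.std()
    best_offset, best_corr = None, -np.inf
    # Scan whole-day backward shifts spanning the gap between the two start
    # dates (assumes the crypto feed starts after the Santiment feed, as in
    # the offset method above)
    max_days = abs((c.index.min() - s.index.min()).days) + 1
    for days in range(max_days + 1):
        offset = pd.Timedelta(days=days)
        shifted = c.copy()
        shifted.index = shifted.index - offset
        aligned = pd.concat([shifted, s], axis=1, join='inner').dropna()
        if len(aligned) >= min_overlap:
            corr = aligned.iloc[:, 0].corr(aligned.iloc[:, 1])
            if corr > best_corr:
                best_offset, best_corr = offset, corr
    if best_offset is None:
        return pd.DataFrame()
    logger.info(f"Pattern match: best offset {best_offset.days} days (corr={best_corr:.3f})")
    return merge_with_offset(crypto_symbol, santiment_slug, best_offset)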

def analyze_merge_quality(merged_df, method):
    """Analyze merge quality and provide statistics"""
    if merged_df is None or merged_df.empty:
        return {"error": "No merged data"}

    santiment_cols = [col for col in merged_df.columns if col.startswith('santiment_')]
    analysis = {
        'method_used': method,
        'total_records': len(merged_df),
        'santiment_features_added': len(santiment_cols),
        'symbols_processed': sorted(merged_df['symbol'].unique()),
        'completeness_by_symbol': {}
    }

    # Calculate completeness by symbol: a record "has Santiment data" if at
    # least one santiment_* column is non-null
    for symbol in analysis['symbols_processed']:
        symbol_data = merged_df[merged_df['symbol'] == symbol]
        non_null_counts = symbol_data[santiment_cols].notna().sum(axis=1)
        records_with_santiment = int((non_null_counts > 0).sum())
        analysis['completeness_by_symbol'][symbol] = {
            'total_records': len(symbol_data),
            'records_with_santiment': records_with_santiment,
            'completeness_pct': records_with_santiment / len(symbol_data) * 100
        }
    return analysis
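
# For reference, the analysis dict serializes to JSON shaped like this
# (symbol names and numbers are purely illustrative):
#
# {
#   "method_used": "offset",
#   "total_records": 3500,
#   "santiment_features_added": 12,
#   "symbols_processed": ["ADA", "BTC", "ETH", "SOL", "XRP"],
#   "completeness_by_symbol": {
#     "BTC": {"total_records": 700, "records_with_santiment": 650,
#             "completeness_pct": 92.9}
#   }
# }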

def save_results(merged_df, analysis, method):
    """Save merged results with method identifier"""
    if merged_df is None:
        logger.error("Cannot save - no merged data")
        return None, None

    logger.info("Saving time-shifted merge results...")
    # Create output directory
    output_dir = 'data/merged/features'
    os.makedirs(output_dir, exist_ok=True)

    # Save with method identifier and timestamp so runs never overwrite each other
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = os.path.join(output_dir, f'crypto_with_santiment_{method}_{timestamp}.parquet')
    merged_df.to_parquet(output_file, index=False)
    logger.info(f"Merged features saved to: {output_file}")

    # Save analysis alongside the parquet output
    analysis_file = os.path.join(output_dir, f'santiment_merge_analysis_{method}_{timestamp}.json')
    with open(analysis_file, 'w') as f:
        json.dump(analysis, f, indent=2, default=str)
    logger.info(f"Analysis saved to: {analysis_file}")
    return output_file, analysis_file

def main():
    """Main time-shifted merge process"""
    logger.info("Starting time-shifted Santiment-Crypto merge...")
    try:
        # Load data
        crypto_df, santiment_df = load_data()

        # Try each merge method in turn
        methods = ['offset', 'day_of_week']
        results = {}
        for method in methods:
            logger.info(f"\n{'=' * 50}")
            logger.info(f"TRYING METHOD: {method.upper()}")
            logger.info(f"{'=' * 50}")
            merged_df = merge_with_time_shift(crypto_df, santiment_df, method=method)
            analysis = analyze_merge_quality(merged_df, method)
            if merged_df is not None:
                output_file, analysis_file = save_results(merged_df, analysis, method)
                results[method] = {
                    'success': True,
                    'records': len(merged_df),
                    'completeness': analysis.get('completeness_by_symbol', {}),
                    'output_file': output_file
                }
            else:
                results[method] = {'success': False}

        # Print summary
        print("\n" + "=" * 60)
        print("TIME-SHIFTED MERGE SUMMARY")
        print("=" * 60)
        for method, result in results.items():
            print(f"\n{method.upper()} METHOD:")
            if result['success']:
                print(f"  Success: {result['records']} records merged")
                print(f"  File: {result['output_file']}")
                for symbol, stats in result['completeness'].items():
                    print(f"    {symbol}: {stats['completeness_pct']:.1f}% complete")
            else:
                print("  Failed")
        print("=" * 60)
    except Exception as e:
        logger.error(f"Time-shifted merge failed: {e}")
        raise

if __name__ == "__main__":
    main()
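
# Usage: run directly from the repo root once the two input parquet files
# exist at the paths hard-coded in load_data(), e.g.
#   python time_shifted_merge.py
# (the script's real filename may differ; the name above is illustrative)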