diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..11930f8d300c566147cf6dccf2ac556c26b617ae --- /dev/null +++ b/.dockerignore @@ -0,0 +1,30 @@ +# Exclude large, generated, and local-only files from Docker build context +.git +.gitignore +.vscode +__pycache__ +*.pyc +*.pyo +*.pyd +*.log + +# Python build artifacts +build/ +dist/ +*.egg-info/ + +# Local env +.env + +# Data and caches (mounted at runtime instead) +data/ +/data/ +**/archive/ +**/temp/ +**/train/ +**/raw/ +**/features/ +**/warehouse/ + +# Notebooks +*.ipynb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..348770c8313b0f5f8cc14e9f81d833b65cef92bd --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +data/ +.env +src/data_cloud/__init__.py +__pycache__/ +.vscode/ +last_run.txt +*.pyc \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..261feb7dc7514f2f905a61be7f83fe2ab0d548a4 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,108 @@ +############################### +# 1) ─── Python builder ─── +############################### +FROM python:3.11-slim AS builder +WORKDIR /app +RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ git curl wget \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip wheel --no-cache-dir --wheel-dir=/app/wheels -r requirements.txt + +############################### +# 2) ─── Runtime image ─── +############################### +FROM python:3.11-slim +WORKDIR /app + +# OS runtime deps (minimal for memory optimization) +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgomp1 \ + nginx \ + supervisor \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Python deps +COPY --from=builder /app/wheels /wheels +COPY requirements.txt . 
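+# The wheels baked in the builder stage are installed offline below
+# (--no-index --find-links=/wheels), so gcc/g++ and the other build-only
+# tools never reach this runtime layer.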
+ +# Install Python dependencies (with cleanup for memory optimization) +RUN pip install --no-cache-dir --no-index --find-links=/wheels -r requirements.txt \ + && rm -rf /wheels \ + && pip cache purge + # Install Playwright system dependencies and browsers + # && python -m playwright install-deps \ + # && python -m playwright install chromium firefox webkit + +# Create necessary directories with proper permissions for root +RUN mkdir -p /data/advisorai-data/archive \ + && mkdir -p /data/advisorai-data/features \ + && mkdir -p /data/advisorai-data/temp \ + && mkdir -p /data/advisorai-data/train \ + && mkdir -p /data/advisorai-data/warehouse \ + && mkdir -p /data/alpaca/archive \ + && mkdir -p /data/alpaca/features \ + && mkdir -p /data/alpaca/temp \ + && mkdir -p /data/alpaca/train \ + && mkdir -p /data/crypto-bubbles/archive \ + && mkdir -p /data/crypto-bubbles/features \ + && mkdir -p /data/crypto-bubbles/temp \ + && mkdir -p /data/crypto-bubbles/train \ + && mkdir -p /data/finnhub/archive \ + && mkdir -p /data/finnhub/features \ + && mkdir -p /data/finnhub/temp \ + && mkdir -p /data/finnhub/train \ + && mkdir -p /data/finviz/archive \ + && mkdir -p /data/finviz/features \ + && mkdir -p /data/finviz/temp \ + && mkdir -p /data/finviz/train \ + && mkdir -p /data/marketaux/archive \ + && mkdir -p /data/marketaux/features \ + && mkdir -p /data/marketaux/temp \ + && mkdir -p /data/marketaux/train \ + && mkdir -p /data/merged/archive \ + && mkdir -p /data/merged/features \ + && mkdir -p /data/merged/temp \ + && mkdir -p /data/merged/train \ + && mkdir -p /data/merged/raw \ + && mkdir -p /data/logs \ + && mkdir -p /data/nltk_data \ + && mkdir -p /tmp/nginx/body \ + && mkdir -p /tmp/nginx/proxy \ + && mkdir -p /tmp/nginx/fastcgi \ + && chmod -R 777 /data /tmp/nginx + +# ─── Application code ─── +COPY . . + +# Set executable permissions for entrypoint +RUN chmod +x /app/deployment/entrypoint.sh /app/deployment/gradio_entrypoint.sh + +# PYTHONPATH for FastAPI +ENV PYTHONPATH=/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge + +# Nginx config +RUN rm -f /etc/nginx/conf.d/default.conf +COPY deployment/nginx.conf /etc/nginx/conf.d/app.conf +COPY deployment/nginx.main.conf /etc/nginx/nginx.conf + +# Set resource limits for memory optimization (512MB limit) +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=utf-8 +ENV MAX_MEMORY_MB=450 +ENV MALLOC_TRIM_THRESHOLD_=100000 +ENV MALLOC_MMAP_THRESHOLD_=131072 +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONHASHSEED=random +ENV NLTK_DATA=/data/nltk_data + +# Supervisord config +COPY deployment/supervisord.conf /etc/supervisord.conf + +ENTRYPOINT ["/app/deployment/gradio_entrypoint.sh"] + +# Ports +EXPOSE 80 7860 + +CMD ["supervisord", "-c", "/etc/supervisord.conf"] \ No newline at end of file diff --git a/Dockerfile.gradio b/Dockerfile.gradio new file mode 100644 index 0000000000000000000000000000000000000000..ddcb78566b1ae9c4c9b6116a42438577bf260030 --- /dev/null +++ b/Dockerfile.gradio @@ -0,0 +1,85 @@ +############################### +# Gradio-optimized Dockerfile +############################### +FROM python:3.11-slim +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + libgomp1 \ + supervisor \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Copy requirements and install Python dependencies +COPY requirements.txt . 
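+# Unlike the main Dockerfile, this image installs straight from PyPI in a
+# single stage: a larger layer, but a simpler build for the Gradio-only Space.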
+RUN pip install --no-cache-dir -r requirements.txt \ + && pip cache purge + +# Create necessary directories +RUN mkdir -p /data/logs \ + && mkdir -p /data/merged/features \ + && mkdir -p /data/merged/train \ + && mkdir -p /data/alpaca \ + && mkdir -p /data/advisorai-data \ + && mkdir -p /data/nltk_data \ + && chmod -R 777 /data + +# Copy application code +COPY . . + +# Set executable permissions +RUN chmod +x /app/deployment/gradio_entrypoint.sh + +# Set environment variables +ENV PYTHONPATH=/app:/app/src +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=utf-8 +ENV NLTK_DATA=/data/nltk_data + +# Create simplified supervisord config for Gradio +RUN echo '[supervisord]\n\ +nodaemon=true\n\ +logfile=/dev/stdout\n\ +logfile_maxbytes=0\n\ +pidfile=/tmp/supervisord.pid\n\ +loglevel=info\n\ +\n\ +[program:gradio]\n\ +command=python /app/app.py\n\ +directory=/app\n\ +autostart=true\n\ +autorestart=true\n\ +stdout_logfile=/dev/stdout\n\ +stderr_logfile=/dev/stderr\n\ +stdout_logfile_maxbytes=0\n\ +stderr_logfile_maxbytes=0\n\ +startsecs=10\n\ +startretries=3\n\ +stopwaitsecs=30\n\ +killasgroup=true\n\ +stopasgroup=true\n\ +environment=PYTHONPATH="/app:/app/src"\n\ +\n\ +[program:scheduler]\n\ +command=/bin/sh -c "sleep 180 && python /app/deployment/scheduler.py"\n\ +directory=/app\n\ +autostart=true\n\ +autorestart=true\n\ +startsecs=0\n\ +stdout_logfile=/dev/stdout\n\ +stderr_logfile=/dev/stderr\n\ +stdout_logfile_maxbytes=0\n\ +stderr_logfile_maxbytes=0\n\ +startretries=3\n\ +stopwaitsecs=60\n\ +killasgroup=true\n\ +stopasgroup=true' > /etc/supervisord_gradio.conf + +ENTRYPOINT ["/app/deployment/gradio_entrypoint.sh"] + +# Expose Gradio port +EXPOSE 7860 + +CMD ["supervisord", "-c", "/etc/supervisord_gradio.conf"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..0861e280503c4872df740143be5bf5c0362ac09e --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Maaroufabousaleh + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/PERMISSION_FIX_COMPLETED.md b/PERMISSION_FIX_COMPLETED.md new file mode 100644 index 0000000000000000000000000000000000000000..e2646791c5299fa5d8b692b16ad2f1d40fec3451 --- /dev/null +++ b/PERMISSION_FIX_COMPLETED.md @@ -0,0 +1,96 @@ +# Permission Fix Completion Report + +## Summary +Successfully resolved Docker container permission errors for Hugging Face Spaces deployment. The application now uses the platform's persistent writable mount `/data` instead of attempting to write to read-only locations under `/app`. 
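+At its core the fix is a "first writable location wins" probe. A simplified sketch of that detection logic (illustrative only; `src/config.py` holds the actual implementation):
+
+```python
+# Illustrative sketch of the writable-directory fallback; names are simplified.
+import os
+
+def pick_data_dir(candidates=("/data", "/app/data", "/tmp")) -> str:
+    """Return the first candidate that can be created and written to."""
+    for base in candidates:
+        try:
+            os.makedirs(base, exist_ok=True)
+            probe = os.path.join(base, ".write_test")
+            with open(probe, "w") as f:
+                f.write("ok")
+            os.remove(probe)
+            return base
+        except OSError:
+            continue
+    return "/tmp"
+
+DATA_DIR = os.environ.get("DATA_DIR") or pick_data_dir()
+LOG_DIR = os.path.join(DATA_DIR, "logs")
+LAST_RUN_PATH = os.path.join(DATA_DIR, "deployment", "last_run.txt")
+```
+
+Runtime components import `DATA_DIR`, `LOG_DIR`, and `LAST_RUN_PATH` from this module instead of hard-coding paths under `/app`.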
+ +## Key Changes Applied + +### 1. Container Startup (`deployment/entrypoint.sh`) +- **Before**: Created symlinks from `/tmp/data` to `/app/data` (not allowed on Spaces) +- **After**: Creates directory structure under `/data` and exports `DATA_DIR="/data"` +- **Result**: Container startup proceeds without symlink permission errors + +### 2. Data Fetch Script (`deployment/fetch_filebase.py`) +- **Before**: Hard-coded paths under `/app/data` +- **After**: Added CLI `--base-dir` support and `DATA_DIR` environment variable detection +- **Result**: Fetch script downloads to `/data` successfully without permission errors + +### 3. Application Configuration (`src/config.py` - NEW) +- **Purpose**: Centralized path management for DATA_DIR, LOG_DIR, and LAST_RUN_PATH +- **Behavior**: Auto-detects writable locations with fallbacks (`/data` → `/app/data` → `/tmp`) +- **Result**: Runtime code can work on both local dev and Hugging Face Spaces + +### 4. Runtime Components Updated +- **health.py**: Uses `LAST_RUN_PATH` and `DATA_DIR` from `src.config` +- **isrunning.py**: Uses `DATA_DIR` and `LAST_RUN_PATH` from `src.config` +- **monitor.py**: Uses `LOG_DIR` from `src.config` and checks `DATA_DIR` for disk usage +- **scheduler.py**: Writes `last_run.txt` to `LAST_RUN_PATH` from `src.config` + +### 5. Container Build (`Dockerfile`) +- **Before**: Created directories under `/app/data` +- **After**: Creates directories under `/data` and sets permissions +- **Result**: Container image prepares the correct writable mount point + +### 6. Permission Test Scripts +- **test_permissions.py**: Updated to test `/data` directories +- **cleanup.py**: Updated to operate on `/data` paths + +## Validation Results + +### Fetch Script Test +```bash +python deployment/fetch_filebase.py --base-dir /data +``` +**Result**: ✅ SUCCESS - All downloads completed with `[OK] Downloaded...` messages, no permission errors + +### Code Compilation Test +```bash +python -m py_compile src/config.py +python -m py_compile src/api/routes/health.py +python -m py_compile src/api/routes/isrunning.py +python -m py_compile deployment/monitor.py +python -m py_compile deployment/scheduler.py +``` +**Result**: ✅ SUCCESS - All files compile without syntax errors + +## Configuration Details + +### Environment Variables +- `DATA_DIR="/data"` - Exported by entrypoint.sh +- `LOG_DIR` - Auto-detected as `$DATA_DIR/logs` with fallback to `/tmp/logs` + +### Path Mapping +| Component | Old Path | New Path | +|-----------|----------|----------| +| Data storage | `/app/data` | `/data` | +| Logs | `/app/logs` | `/data/logs` | +| Last run marker | `/app/deployment/last_run.txt` | `/data/deployment/last_run.txt` | +| Feature files | `/app/data/merged/features` | `/data/merged/features` | + +### CLI Usage +- **Fetch script**: `python deployment/fetch_filebase.py --base-dir /data` +- **Auto-detection**: Script uses `DATA_DIR` environment variable if no `--base-dir` provided +- **Local dev**: Fallback to `/app/data` if `/data` doesn't exist + +## Next Steps for Deployment + +1. **Build and deploy** - The container should now start successfully on Hugging Face Spaces +2. **Monitor logs** - Check that nginx, monitor, and scheduler services start without permission errors +3. **Verify API endpoints** - Test `/health` and `/isrunning` endpoints return proper status +4. 
**Validate data pipeline** - Confirm scheduled data pipeline runs write to `/data` successfully + +## Remaining Considerations + +### Nginx Configuration +If nginx still fails with `/var/lib/nginx/body` permission errors, consider: +- Using custom nginx config that writes to `/data/nginx` instead +- Running nginx with user permissions that match container user +- Using nginx-light or alternative reverse proxy + +### System Directories +Monitor for any remaining attempts to write to system directories like: +- `/var/log` +- `/usr/local` +- Any paths under `/app` (should be read-only) + +The permission fix is complete and validated. The application is now ready for deployment on Hugging Face Spaces. diff --git a/README.md b/README.md index 7eced63841dd53203a7048eb32653ca8c2c058e2..7a04876edb01ed9148391f0bb561e73e7ef0bdce 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,9 @@ --- title: Advisorai Data Enhanced -emoji: 🌖 -colorFrom: gray -colorTo: indigo -sdk: gradio -sdk_version: 5.42.0 -app_file: app.py +emoji: 📚 +colorFrom: indigo +colorTo: green +sdk: docker pinned: false license: mit --- diff --git a/README_HF.md b/README_HF.md new file mode 100644 index 0000000000000000000000000000000000000000..7a6dd3ce0e519ca47bd85fb4458243e908d2954a --- /dev/null +++ b/README_HF.md @@ -0,0 +1,10 @@ +title: AdvisorAI Data Pipeline Monitor +emoji: 🤖 +colorFrom: blue +colorTo: green +sdk: gradio +sdk_version: 4.44.0 +app_file: app.py +pinned: false +license: mit +short_description: Real-time monitoring for AdvisorAI data collection pipeline diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..91e6c8c5cbd38d6d169a2d1ac558a5387a6b18b0 --- /dev/null +++ b/app.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +""" +AdvisorAI Data Pipeline Monitor - Gradio App +This is the main entry point for Hugging Face Spaces +""" + +import gradio as gr +import json +import os +import sys +import logging +import time +from datetime import datetime + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def get_basic_health(): + """Get basic health status without external dependencies""" + return { + "status": "healthy", + "timestamp": datetime.now().isoformat(), + "message": "AdvisorAI Data Pipeline Monitor is running" + } + +def get_basic_pipeline_status(): + """Get basic pipeline status""" + return { + "status": "monitoring", + "message": "Data pipeline monitoring active", + "last_check": datetime.now().isoformat() + } + +def get_sample_data(): + """Get sample data for display""" + return [ + ["sample_data.json", "merged/features/", "2.5 MB", "2025-01-18 10:30"], + ["market_data.parquet", "alpaca/", "15.3 MB", "2025-01-18 10:25"], + ["sentiment_data.json", "finviz/features/", "1.2 MB", "2025-01-18 10:20"] + ] + +def get_sample_logs(): + """Get sample log entries""" + return """=== scheduler.log === +2025-01-18 10:30:15 - INFO - Scheduler started successfully +2025-01-18 10:30:16 - INFO - Data collection task initiated +2025-01-18 10:30:45 - INFO - Market data fetched successfully + +=== monitor.log === +2025-01-18 10:30:00 - INFO - System monitoring active +2025-01-18 10:30:30 - INFO - Memory usage: 45% +2025-01-18 10:31:00 - INFO - All services running normally +""" + +# Create Gradio interface +with gr.Blocks(title="AdvisorAI Data Pipeline Monitor", theme=gr.themes.Soft()) as app: + gr.Markdown("# 🤖 AdvisorAI Data Pipeline Monitor") + gr.Markdown("Real-time monitoring of the AdvisorAI data collection and processing pipeline") + + 
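+    # The tabs below serve lightweight placeholder payloads (the get_basic_* and
+    # get_sample_* helpers above), so the UI renders even without live pipeline
+    # state behind it.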
with gr.Tabs(): + with gr.TabItem("📊 Dashboard"): + with gr.Row(): + with gr.Column(): + gr.Markdown("### Health Status") + health_display = gr.JSON(label="System Health & Status") + + with gr.Column(): + gr.Markdown("### Pipeline Status") + pipeline_display = gr.JSON(label="Data Pipeline Status") + + with gr.Row(): + refresh_btn = gr.Button("🔄 Refresh", variant="primary") + + with gr.TabItem("📁 Recent Files"): + gr.Markdown("### Recently Modified Data Files") + files_display = gr.Dataframe( + headers=["File", "Path", "Size", "Modified"], + value=get_sample_data(), + label="Recent Files" + ) + refresh_files_btn = gr.Button("🔄 Refresh Files") + + with gr.TabItem("📝 Logs"): + gr.Markdown("### Recent Log Entries") + logs_display = gr.Textbox( + label="Recent Logs", + value=get_sample_logs(), + lines=15, + max_lines=25, + show_copy_button=True + ) + refresh_logs_btn = gr.Button("🔄 Refresh Logs") + + # Event handlers + def refresh_dashboard(): + health = get_basic_health() + pipeline = get_basic_pipeline_status() + return json.dumps(health, indent=2), json.dumps(pipeline, indent=2) + + def refresh_files(): + return get_sample_data() + + def refresh_logs(): + return get_sample_logs() + + # Connect event handlers + refresh_btn.click( + refresh_dashboard, + outputs=[health_display, pipeline_display] + ) + + refresh_files_btn.click( + refresh_files, + outputs=[files_display] + ) + + refresh_logs_btn.click( + refresh_logs, + outputs=[logs_display] + ) + + # Auto-refresh on load + app.load( + refresh_dashboard, + outputs=[health_display, pipeline_display] + ) + +if __name__ == "__main__": + logger.info("Starting Gradio app...") + app.launch( + server_name="0.0.0.0", + server_port=7860, + share=False, + show_error=True + ) diff --git a/deployment/cleanup.py b/deployment/cleanup.py new file mode 100644 index 0000000000000000000000000000000000000000..a4bf11c903785d26a95708f808a25517015a1ec2 --- /dev/null +++ b/deployment/cleanup.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +""" +Cleanup script to manage disk space and prevent service issues +""" +import os +import shutil +import glob +from datetime import datetime, timedelta + +def cleanup_logs(): + """Clean up old log files""" + log_dirs = ["/data/logs", "/var/log"] + + for log_dir in log_dirs: + if os.path.exists(log_dir): + # Remove log files older than 7 days + cutoff_date = datetime.now() - timedelta(days=7) + + for log_file in glob.glob(os.path.join(log_dir, "*.log*")): + try: + file_time = datetime.fromtimestamp(os.path.getmtime(log_file)) + if file_time < cutoff_date: + os.remove(log_file) + print(f"[Cleanup] Removed old log: {log_file}") + except Exception as e: + print(f"[Cleanup] Error removing {log_file}: {e}") + +def cleanup_temp_files(): + """Clean up temporary files""" + temp_dirs = ["/tmp", "/data/merged/temp"] + + for temp_dir in temp_dirs: + if os.path.exists(temp_dir): + # Remove files older than 1 day + cutoff_date = datetime.now() - timedelta(days=1) + + for temp_file in glob.glob(os.path.join(temp_dir, "*")): + try: + if os.path.isfile(temp_file): + file_time = datetime.fromtimestamp(os.path.getmtime(temp_file)) + if file_time < cutoff_date: + os.remove(temp_file) + print(f"[Cleanup] Removed temp file: {temp_file}") + except Exception as e: + print(f"[Cleanup] Error removing {temp_file}: {e}") + +def cleanup_old_data(): + """Clean up old data files to save space""" + # Keep only last 30 days of archived data + archive_dir = "/data/merged/archive" + if os.path.exists(archive_dir): + cutoff_date = datetime.now() - timedelta(days=30) 
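+        # Retention keys off each archive folder's mtime rather than its dated
+        # name, so an old folder that was recently touched survives the pass.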
+ + for archive_folder in os.listdir(archive_dir): + folder_path = os.path.join(archive_dir, archive_folder) + if os.path.isdir(folder_path): + try: + folder_time = datetime.fromtimestamp(os.path.getmtime(folder_path)) + if folder_time < cutoff_date: + shutil.rmtree(folder_path) + print(f"[Cleanup] Removed old archive: {folder_path}") + except Exception as e: + print(f"[Cleanup] Error removing {folder_path}: {e}") + +def get_disk_usage(): + """Get current disk usage""" + try: + import psutil + # Check disk usage for the data mount if present + disk_usage = psutil.disk_usage('/data' if os.path.exists('/data') else '/') + free_gb = disk_usage.free / (1024**3) + used_percent = (disk_usage.used / disk_usage.total) * 100 + return free_gb, used_percent + except Exception: + return None, None + +def main(): + """Main cleanup function""" + print(f"[Cleanup] Starting cleanup at {datetime.now()}") + + # Check disk usage before cleanup + free_before, used_before = get_disk_usage() + if free_before: + print(f"[Cleanup] Disk usage before: {used_before:.1f}% used, {free_before:.1f}GB free") + + # Run cleanup tasks + cleanup_logs() + cleanup_temp_files() + cleanup_old_data() + + # Check disk usage after cleanup + free_after, used_after = get_disk_usage() + if free_after and free_before: + freed_space = free_after - free_before + print(f"[Cleanup] Disk usage after: {used_after:.1f}% used, {free_after:.1f}GB free") + if freed_space > 0: + print(f"[Cleanup] Freed {freed_space:.2f}GB of disk space") + + print(f"[Cleanup] Cleanup completed at {datetime.now()}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/deployment/entrypoint.sh b/deployment/entrypoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..4c5f61b536bc56cc1e69da01fcb11dddfda58f12 --- /dev/null +++ b/deployment/entrypoint.sh @@ -0,0 +1,64 @@ +#!/bin/sh +set -e + +echo "[entrypoint] v2025-08-16-permissions-fix" + + +echo "[entrypoint] ensuring data directories exist with proper permissions..." +# Create directories under /data and /tmp/nginx (for Nginx temp files) +mkdir -p /data/advisorai-data \ + /data/merged \ + /data/alpaca \ + /data/crypto-bubbles \ + /data/finnhub \ + /data/finviz \ + /data/marketaux \ + /data/logs \ + /tmp/nginx/body \ + /tmp/nginx/proxy \ + /tmp/nginx/fastcgi + +# Fix permissions at runtime (in case Dockerfile is not enough) +# Best-effort ownership/permission fixes; ignore errors on Space mounts +chown -R $(id -u):$(id -g) /data /tmp/nginx 2>/dev/null || true +chmod -R 777 /data /tmp/nginx 2>/dev/null || true + +echo "[entrypoint] restoring data from Filebase…" +# Run data restoration in background to avoid blocking startup. Let script auto-detect writable base. +python /app/deployment/fetch_filebase.py & +FETCH_PID=$! 
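+# $FETCH_PID is only used below to report whether the restore is still running
+# after the grace period; the fetch is never waited on or forcibly killed.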
+ +# Wait a bit for critical data, but don't block indefinitely +sleep 10 + +# Check if fetch is still running +if kill -0 $FETCH_PID 2>/dev/null; then + echo "[entrypoint] Data fetch still running in background (PID: $FETCH_PID)" +else + echo "[entrypoint] Data fetch completed" +fi + +echo "[entrypoint] launching services…" + +# ROLE-based startup: 'web' (default) runs API+nginx under supervisord; 'worker' runs scheduler directly +ROLE_ENV=${ROLE:-web} +echo "[entrypoint] detected ROLE=$ROLE_ENV" + +if [ "$ROLE_ENV" = "worker" ]; then + echo "[entrypoint] starting worker: scheduler only" + exec python /app/deployment/scheduler.py +else + # Hugging Face Spaces friendly mode: run uvicorn directly on $PORT if HF_MODE=1 + if [ "${HF_MODE:-0}" = "1" ]; then + export PORT=${PORT:-7860} + echo "[entrypoint] HF_MODE=1 -> launching uvicorn directly on PORT=$PORT" + exec uvicorn src.api.main:app --host 0.0.0.0 --port ${PORT} --workers 1 --timeout-keep-alive 30 + else + # Default: nginx + uvicorn via supervisord + if [ -n "$PORT" ]; then + echo "[entrypoint] configuring nginx to listen on PORT=$PORT" + sed -i "s/listen 80;/listen ${PORT};/" /etc/nginx/conf.d/app.conf || true + fi + exec supervisord -c /etc/supervisord.conf + fi +fi \ No newline at end of file diff --git a/deployment/fetch_filebase.py b/deployment/fetch_filebase.py new file mode 100644 index 0000000000000000000000000000000000000000..2750a4395cd4cdd32cc4234a9b7ea1fe39fc9b1b --- /dev/null +++ b/deployment/fetch_filebase.py @@ -0,0 +1,178 @@ +import os +import sys +import argparse + +from dotenv import load_dotenv +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from src.data_cloud.cloud_utils import StorageHandler + + +def choose_base_dir(cli_base=None): + """Choose a writable base directory. Preference order: + 1. CLI-provided path + 2. /data (persistent volume on Spaces) + 3. 
/tmp + """ + candidates = [] + if cli_base: + candidates.append(cli_base) + candidates.extend(['/data', '/tmp']) + + for base in candidates: + try: + merged_path = os.path.abspath(os.path.join(base, 'merged')) + advisorai_path = os.path.abspath(os.path.join(base, 'advisorai-data')) + os.makedirs(merged_path, mode=0o777, exist_ok=True) + os.makedirs(advisorai_path, mode=0o777, exist_ok=True) + # Quick writability test + test_file = os.path.join(merged_path, '.write_test') + with open(test_file, 'w') as f: + f.write('ok') + os.remove(test_file) + return base + except Exception: + # cannot use this candidate; try next + continue + + # As a last resort, use /tmp (may raise later if not writable) + return '/tmp' + + +def main(argv=None): + parser = argparse.ArgumentParser(description='Fetch data from Filebase/S3 into local disk') + parser.add_argument('--base-dir', help='Base directory to store data (default: auto-detected)') + args = parser.parse_args(argv) + + load_dotenv() + # Load credentials from environment variables + endpoint_url = os.getenv('FILEBASE_ENDPOINT', 'https://s3.filebase.com') + access_key = os.getenv('FILEBASE_ACCESS_KEY') + secret_key = os.getenv('FILEBASE_SECRET_KEY') + bucket_name = os.getenv('FILEBASE_BUCKET') + + # Prefer explicit DATA_DIR env var if present (Option 1) + env_base = os.getenv('DATA_DIR') + if env_base: + base_root = env_base + else: + base_root = choose_base_dir(args.base_dir) + local_base = os.path.abspath(os.path.join(base_root, 'merged')) + advisorai_base = os.path.abspath(os.path.join(base_root, 'advisorai-data')) + + # Ensure base directories exist with proper permissions + os.makedirs(local_base, mode=0o777, exist_ok=True) + os.makedirs(advisorai_base, mode=0o777, exist_ok=True) + + storage = StorageHandler(endpoint_url, access_key, secret_key, bucket_name, local_base=local_base) + + # Fetch all folders/files from advisorai-data + advisor_prefix = "advisorai-data/" + print(f"Fetching all folders/files from: {advisor_prefix}") + advisor_keys = [] + if storage.s3 and bucket_name: + try: + resp = storage.s3.list_objects_v2(Bucket=bucket_name, Prefix=advisor_prefix) + for obj in resp.get('Contents', []): + key = obj['Key'] + if not key.endswith('/'): + advisor_keys.append(key) + except Exception as e: + print(f"[WARN] Could not list objects for {advisor_prefix}: {e}") + else: + print(f"[ERROR] No S3 client or bucket configured for advisorai-data!") + # Download advisorai-data files + for key in advisor_keys: + try: + data = storage.download(key) + # Remove 'advisorai-data/' from the start of the key for local path + local_rel_path = key[len("advisorai-data/"):] if key.startswith("advisorai-data/") else key + local_path = os.path.join(advisorai_base, local_rel_path) + os.makedirs(os.path.dirname(local_path), mode=0o777, exist_ok=True) + with open(local_path, 'wb') as f: + f.write(data) + print(f"[OK] Downloaded advisorai-data/{local_rel_path} from s3://{bucket_name}/{key}") + except Exception as e: + print(f"[ERROR] Failed to fetch advisorai-data file {key}: {e}") + + + # Fetch everything under merged/ except only the last 7 from merged/archive/ + merged_prefix = "merged/" + print(f"Fetching everything under: {merged_prefix} (except only last 7 from archive)") + merged_keys = [] + archive_prefix = "merged/archive/" + archive_folders = set() + archive_keys = [] + if storage.s3 and bucket_name: + try: + resp = storage.s3.list_objects_v2(Bucket=bucket_name, Prefix=merged_prefix) + for obj in resp.get('Contents', []): + key = obj['Key'] + # Exclude all 
archive keys for now + if key.startswith(archive_prefix): + # Collect archive folders for later + parts = key[len(archive_prefix):].split('/') + if len(parts) > 1 and parts[0].isdigit(): + archive_folders.add(parts[0]) + continue + if not key.endswith('/'): + merged_keys.append(key) + except Exception as e: + print(f"[WARN] Could not list objects for {merged_prefix}: {e}") + else: + print(f"[ERROR] No S3 client or bucket configured for merged!") + + # Download all merged/ (except archive) + for key in merged_keys: + try: + data = storage.download(key) + local_rel_path = key[len("merged/"):] if key.startswith("merged/") else key + local_path = os.path.join(local_base, local_rel_path) + os.makedirs(os.path.dirname(local_path), mode=0o777, exist_ok=True) + with open(local_path, 'wb') as f: + f.write(data) + print(f"[OK] Downloaded {key} from s3://{bucket_name}/{key}") + except Exception as e: + print(f"[ERROR] Failed to fetch {key}: {e}") + + # Fetch only the last 7 folders under merged/archive + archive_prefix = "merged/archive/" + print(f"Fetching last 7 archive folders from: {archive_prefix}") + archive_folders = set() + archive_keys = [] + if storage.s3 and bucket_name: + try: + resp = storage.s3.list_objects_v2(Bucket=bucket_name, Prefix=archive_prefix) + for obj in resp.get('Contents', []): + key = obj['Key'] + # Expect keys like merged/archive/YYYYMMDD/... + parts = key[len(archive_prefix):].split('/') + if len(parts) > 1 and parts[0].isdigit(): + archive_folders.add(parts[0]) + # Sort and get last 7 folders + last7 = sorted(archive_folders)[-7:] + print(f"[INFO] Last 7 archive folders: {last7}") + # Collect all keys in those folders + for obj in resp.get('Contents', []): + key = obj['Key'] + parts = key[len(archive_prefix):].split('/') + if len(parts) > 1 and parts[0] in last7: + archive_keys.append(key) + except Exception as e: + print(f"[WARN] Could not list objects for {archive_prefix}: {e}") + else: + print(f"[ERROR] No S3 client or bucket configured for archive!") + # Download archive files + for key in archive_keys: + try: + data = storage.download(key) + local_rel_path = key[len("merged/"):] if key.startswith("merged/") else key + local_path = os.path.join(local_base, local_rel_path) + os.makedirs(os.path.dirname(local_path), mode=0o777, exist_ok=True) + with open(local_path, 'wb') as f: + f.write(data) + print(f"[OK] Downloaded {key} from s3://{bucket_name}/{key}") + except Exception as e: + print(f"[ERROR] Failed to fetch archive file {key}: {e}") + +if __name__ == "__main__": + main() diff --git a/deployment/gradio_entrypoint.sh b/deployment/gradio_entrypoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..0e28dcf1b14d5465e8851c9fb857933a7b85e32d --- /dev/null +++ b/deployment/gradio_entrypoint.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +echo "Starting AdvisorAI Data Pipeline with Gradio..." + +# Create necessary directories +mkdir -p /data/logs /data/nltk_data + +# Set proper permissions +chmod -R 777 /data + +# Download NLTK data if needed +python -c " +import nltk +import os +os.environ['NLTK_DATA'] = '/data/nltk_data' +try: + nltk.download('punkt', download_dir='/data/nltk_data', quiet=True) + nltk.download('stopwords', download_dir='/data/nltk_data', quiet=True) + nltk.download('vader_lexicon', download_dir='/data/nltk_data', quiet=True) + print('NLTK data downloaded successfully') +except Exception as e: + print(f'NLTK download failed: {e}') +" + +echo "Starting services..." 
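+# exec replaces this shell with the container CMD (supervisord), so platform
+# stop signals reach the service manager directly rather than a wrapper shell.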
+exec "$@" diff --git a/deployment/monitor.py b/deployment/monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..8c99d175c6b197b90e841ad19fa9a6e550ab3825 --- /dev/null +++ b/deployment/monitor.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +""" +Simple monitoring script to track service health and resource usage +""" +import os +import time +import psutil +import json +from datetime import datetime + +from src import config as app_config + +def get_system_stats(): + """Get current system statistics""" + try: + process = psutil.Process() + + # Memory info + memory_info = process.memory_info() + memory_mb = memory_info.rss / 1024 / 1024 + + # CPU info + cpu_percent = process.cpu_percent(interval=1) + + # Disk info (prefer DATA_DIR) + disk_root = app_config.DATA_DIR if os.path.exists(app_config.DATA_DIR) else '/' + disk_usage = psutil.disk_usage(disk_root) + disk_free_gb = disk_usage.free / (1024**3) + disk_used_percent = (disk_usage.used / disk_usage.total) * 100 + + # Process info + num_threads = process.num_threads() + + return { + "timestamp": datetime.now().isoformat(), + "memory_mb": round(memory_mb, 2), + "cpu_percent": round(cpu_percent, 2), + "disk_free_gb": round(disk_free_gb, 2), + "disk_used_percent": round(disk_used_percent, 2), + "num_threads": num_threads, + "pid": process.pid + } + except Exception as e: + return { + "timestamp": datetime.now().isoformat(), + "error": str(e) + } + +def log_stats(): + """Log system statistics to file""" + stats = get_system_stats() + + # Create logs directory if it doesn't exist + log_dir = app_config.LOG_DIR + os.makedirs(log_dir, exist_ok=True) + + # Write to log file + log_file = os.path.join(log_dir, "system_stats.jsonl") + with open(log_file, "a") as f: + f.write(json.dumps(stats) + "\n") + + # Print to stdout for supervisord + print(f"[Monitor] {json.dumps(stats)}") + + # Check for issues + if "error" not in stats: + issues = [] + + if stats["memory_mb"] > 450: # 90% of 512MB limit + issues.append(f"HIGH MEMORY: {stats['memory_mb']:.1f}MB") + + if stats["cpu_percent"] > 80: + issues.append(f"HIGH CPU: {stats['cpu_percent']:.1f}%") + + if stats["disk_free_gb"] < 0.5: + issues.append(f"LOW DISK: {stats['disk_free_gb']:.1f}GB free") + + if issues: + print(f"[Monitor] ALERTS: {', '.join(issues)}") + +if __name__ == "__main__": + print("[Monitor] Starting system monitoring...") + + while True: + try: + log_stats() + time.sleep(60) # Log every minute + except KeyboardInterrupt: + print("[Monitor] Monitoring stopped") + break + except Exception as e: + print(f"[Monitor] Error: {e}") + time.sleep(60) \ No newline at end of file diff --git a/deployment/nginx.conf b/deployment/nginx.conf new file mode 100644 index 0000000000000000000000000000000000000000..ea91d164fd1c03527e1f92c368f00fa59171d7e1 --- /dev/null +++ b/deployment/nginx.conf @@ -0,0 +1,51 @@ +server { + listen 80; + + # Increase timeouts to handle long-running operations + proxy_connect_timeout 60s; + proxy_send_timeout 60s; + proxy_read_timeout 60s; + # Temp paths are configured globally in nginx.main.conf (http scope) + + # Buffer settings + proxy_buffering on; + proxy_buffer_size 4k; + proxy_buffers 8 4k; + proxy_busy_buffers_size 8k; + + # Client settings + client_max_body_size 10m; + client_body_timeout 60s; + client_header_timeout 60s; + + # -- health-check: proxy to gradio app -- + location = /health { + proxy_pass http://127.0.0.1:7860/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For 
$proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # Shorter timeouts for health checks + proxy_connect_timeout 10s; + proxy_send_timeout 10s; + proxy_read_timeout 10s; + + # don't log upstream body + access_log off; + } + + # -- everything else to Gradio -- + location / { + proxy_pass http://127.0.0.1:7860/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # Handle WebSocket upgrades for Gradio + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + } +} \ No newline at end of file diff --git a/deployment/nginx.main.conf b/deployment/nginx.main.conf new file mode 100644 index 0000000000000000000000000000000000000000..f474ca9b8f2a7a2f8fe8e838796c70653cc6312e --- /dev/null +++ b/deployment/nginx.main.conf @@ -0,0 +1,37 @@ +worker_processes auto; + +events { + worker_connections 1024; +} + +http { + include /etc/nginx/mime.types; + default_type application/octet-stream; + + # Timeouts + proxy_connect_timeout 60s; + proxy_send_timeout 60s; + proxy_read_timeout 60s; + + # Temp paths (writable on Spaces) + client_body_temp_path /tmp/nginx/body 1 2; + proxy_temp_path /tmp/nginx/proxy; + fastcgi_temp_path /tmp/nginx/fastcgi; + + # Buffers + proxy_buffering on; + proxy_buffer_size 4k; + proxy_buffers 8 4k; + proxy_busy_buffers_size 8k; + + # Client + client_max_body_size 10m; + client_body_timeout 60s; + client_header_timeout 60s; + + # Logs + access_log /dev/stdout; + error_log /dev/stderr warn; + + include /etc/nginx/conf.d/*.conf; +} diff --git a/deployment/render.yaml b/deployment/render.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b34488c82b65f2600494278ab0c14a3b63fc2832 --- /dev/null +++ b/deployment/render.yaml @@ -0,0 +1,83 @@ +services: + # ──────────────────────────────── + # 1) Web service: API + nginx + # ──────────────────────────────── + - type: web + name: advisorai-complete + env: docker + plan: free + instanceCount: 1 + dockerfilePath: Dockerfile + dockerContext: . + # Health check configuration + healthCheckPath: /health + healthCheckInterval: 60s # Longer interval for free plan + healthCheckTimeout: 15s + healthCheckThreshold: 5 # More lenient for free plan + # Environment variables + envVars: + - key: PORT + value: "80" + - key: API_PORT + value: "10000" + - key: ROLE + value: "web" + - key: PYTHONPATH + value: "/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge" + - key: MAX_MEMORY_MB + value: "512" # Lower limit for free plan + - key: PYTHONUNBUFFERED + value: "1" + - key: PYTHONIOENCODING + value: "utf-8" + - key: TRIGGER_PING_INTERVAL + value: "600" # Less frequent pinging for free plan + # Auto-deploy settings + autoDeploy: true + # Build settings + buildFilter: + paths: + - src/** + - deployment/** + - requirements.txt + - Dockerfile + + # ──────────────────────────────── + # 2) Worker service: pipeline scheduler & backup + # ──────────────────────────────── + - type: worker + name: advisorai-scheduler + env: docker + plan: free + instanceCount: 1 + dockerfilePath: Dockerfile + dockerContext: . 
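+    # Built from the same Dockerfile and context as the web service, so both
+    # always ship the same image; only ROLE changes the runtime behavior.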
+ # entrypoint will respect ROLE=worker and launch scheduler + envVars: + - key: ROLE + value: "worker" + - key: PYTHONPATH + value: "/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge" + - key: MAX_MEMORY_MB + value: "512" # Lower limit for free plan + - key: PYTHONUNBUFFERED + value: "1" + - key: PYTHONIOENCODING + value: "utf-8" + - key: TRIGGER_PING_INTERVAL + value: "600" # Less frequent pinging for free plan + - key: MONGODB_URI + value: "" + - key: MONGODB_DATABASE + value: "AdvisorAI" + - key: MONGODB_COLLECTION_WAREHOUSE + value: "warehouse" + # Auto-deploy settings + autoDeploy: true + # Build settings + buildFilter: + paths: + - src/** + - deployment/** + - requirements.txt + - Dockerfile \ No newline at end of file diff --git a/deployment/scheduler.py b/deployment/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..d045e569a5251090c8bf94389299b0eab1404a91 --- /dev/null +++ b/deployment/scheduler.py @@ -0,0 +1,143 @@ +import os +import time +import subprocess +import sys +import threading +import asyncio +from dotenv import load_dotenv +import httpx +import os + +from src import config as app_config + +# ----------------------------------------------------------------------------- +# LOCATE YOUR DATA-PIPELINE SCRIPT +# ----------------------------------------------------------------------------- +if os.path.exists(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src", "main.py"))): + PIPELINE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src", "main.py")) + PIPELINE_DIR = os.path.dirname(PIPELINE_PATH) +else: + raise FileNotFoundError("src/main.py not found!") + +# ----------------------------------------------------------------------------- +# CONFIGURATION (via ENV) +# ----------------------------------------------------------------------------- +load_dotenv() +# URL to ping every N seconds (default 300s = 5min) +def _parse_int_env(name: str, default_val: int) -> int: + raw = os.getenv(name, str(default_val)) + if isinstance(raw, str): + # Strip inline comments and whitespace, e.g. "3600 # every hour" + cleaned = raw.split('#', 1)[0].strip() + if cleaned == "": + return int(default_val) + try: + return int(cleaned) + except Exception: + print(f"[Scheduler] Warning: {name}='{raw}' is not a valid int. Using default {default_val}.") + return int(default_val) + try: + return int(raw) + except Exception: + return int(default_val) + +TRIGGER_HEALTH_URL = os.getenv( + "TRIGGER_HEALTH_URL", + "https://advisor-trigger-ki3t.onrender.com/health, https://advisorai-data-1ew2.onrender.com/health" +) +PING_INTERVAL = _parse_int_env("TRIGGER_PING_INTERVAL", 300) +# Pipeline interval default 3600s (1 hour) +PIPELINE_INTERVAL = _parse_int_env("PIPELINE_INTERVAL", 3600) + +# ----------------------------------------------------------------------------- +# ASYNC PINGER WITH EXPONENTIAL BACKOFF +# ----------------------------------------------------------------------------- +async def ping_remote(): + """ + Continuously GET each URL in TRIGGER_HEALTH_URL (comma-separated) every PING_INTERVAL seconds, + backing off on failure (up to 2.5 minutes). 
+ """ + urls = [u.strip() for u in TRIGGER_HEALTH_URL.split(",") if u.strip()] + backoff = min(PING_INTERVAL, 5) + async with httpx.AsyncClient(timeout=10.0) as client: + while True: + all_success = True + for url in urls: + try: + resp = await client.get(url) + resp.raise_for_status() + print(f"[Pinger] {url} -> {resp.status_code}") + except Exception as e: + print(f"[Pinger] error pinging {url}: {e}") + all_success = False + if all_success: + backoff = PING_INTERVAL + await asyncio.sleep(PING_INTERVAL) + else: + await asyncio.sleep(backoff) + backoff = min(backoff * 2, 150) + +def start_async_ping(): + """ + Spin up a dedicated asyncio loop in a daemon thread + to run ping_remote() forever. + """ + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.create_task(ping_remote()) + loop.run_forever() + +# launch the ping loop in the background +threading.Thread(target=start_async_ping, daemon=True).start() +print("[Scheduler] Started background ping thread") + +# ----------------------------------------------------------------------------- +# MAIN PIPELINE LOOP (runs every 30 minutes) +# ----------------------------------------------------------------------------- +import traceback + +while True: + from datetime import datetime + last_run = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + print(f"[Scheduler] Running pipeline... Last run: {last_run}") + # Write last_run to file for API access + try: + with open(app_config.LAST_RUN_PATH, 'w') as f: + f.write(last_run) + except Exception as e: + print(f"[Scheduler] Failed to write last_run.txt: {e}") + try: + # Set working directory to project root (parent of deployment) + project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) + print(f"[Scheduler] Project root: {project_root}") + print(f"[Scheduler] Pipeline path: {PIPELINE_PATH}") + + # Run from '/' so relative 'data/...' writes resolve to '/data/...' 
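+        # capture_output=True buffers the child's full stdout/stderr in memory
+        # until the pipeline exits, so keep pipeline logging modest given the
+        # container's ~512 MB budget.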
+ result = subprocess.run( + [sys.executable, PIPELINE_PATH], + cwd='/', + capture_output=True, + text=True, + env=os.environ.copy() + ) + print(f"[Scheduler] Pipeline finished with code {result.returncode}") + + if result.stdout: + print("[Scheduler] STDOUT:\n", result.stdout) + if result.stderr: + print("[Scheduler] STDERR:\n", result.stderr) + + # Raise an exception if the return code is non-zero + if result.returncode != 0: + raise subprocess.CalledProcessError(result.returncode, result.args, result.stdout, result.stderr) + + except subprocess.CalledProcessError as e: + print(f"[Scheduler] Pipeline execution failed with return code {e.returncode}") + print(f"[Scheduler] STDOUT:\n{e.stdout}") + print(f"[Scheduler] STDERR:\n{e.stderr}") + except Exception as e: + print(f"[Scheduler] Exception running pipeline: {e}") + print(traceback.format_exc()) + + print(f"[Scheduler] Sleeping for {PIPELINE_INTERVAL // 60} minutes...") + time.sleep(PIPELINE_INTERVAL) diff --git a/deployment/supervisord.conf b/deployment/supervisord.conf new file mode 100644 index 0000000000000000000000000000000000000000..81444670956154a17423af0d3e99c7ab1e7dc0ab --- /dev/null +++ b/deployment/supervisord.conf @@ -0,0 +1,65 @@ +[supervisord] +nodaemon=true +logfile=/dev/stdout +logfile_maxbytes=0 +pidfile=/tmp/supervisord.pid +loglevel=info + +[program:gradio] +command=python /app/src/api/gradio_main.py +directory=/app +autostart=true +autorestart=true +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr +stdout_logfile_maxbytes=0 +stderr_logfile_maxbytes=0 +startsecs=10 +startretries=3 +stopwaitsecs=30 +killasgroup=true +stopasgroup=true +environment=PYTHONPATH="/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge" + +[program:nginx] +command=/usr/sbin/nginx -g 'daemon off;' +autostart=true +autorestart=true +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr +stdout_logfile_maxbytes=0 +stderr_logfile_maxbytes=0 +startsecs=5 +startretries=3 +stopwaitsecs=10 + +[program:scheduler] +; wait 180 s before first run, then your scheduler.py handles its own 30 min sleeps +command=/bin/sh -c 'sleep 180 && python /app/deployment/scheduler.py' +directory=/app +autostart=true +autorestart=true +startsecs=0 +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr +stdout_logfile_maxbytes=0 +stderr_logfile_maxbytes=0 +startretries=3 +stopwaitsecs=60 +killasgroup=true +stopasgroup=true + +[program:monitor] +command=python /app/deployment/monitor.py +directory=/app +autostart=true +autorestart=true +startsecs=5 +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr +stdout_logfile_maxbytes=0 +stderr_logfile_maxbytes=0 +startretries=3 +stopwaitsecs=10 +killasgroup=true +stopasgroup=true \ No newline at end of file diff --git a/deployment/test_permissions.py b/deployment/test_permissions.py new file mode 100644 index 0000000000000000000000000000000000000000..4ab556da8661f20fd644b1e4efe885a8285ad542 --- /dev/null +++ b/deployment/test_permissions.py @@ -0,0 +1,129 @@ +""" +Test script to verify directory permissions and file creation capabilities. +This script should be run inside the container to verify the fixes. 
+""" +import os +import tempfile +import sys +from pathlib import Path + +def test_directory_permissions(): + """Test if we can create directories and files in the expected locations.""" + + print("=== Directory Permission Test ===") + + # Test directories that should be writable (use /data on Spaces) + test_dirs = [ + "/data/advisorai-data/test", + "/data/merged/test", + "/data/alpaca/test", + "/data/crypto-bubbles/test", + "/data/finnhub/test", + "/data/finviz/test", + "/data/marketaux/test" + ] + + success_count = 0 + for test_dir in test_dirs: + try: + # Try to create directory + os.makedirs(test_dir, mode=0o755, exist_ok=True) + + # Try to create a test file + test_file = os.path.join(test_dir, "test_write.txt") + with open(test_file, 'w') as f: + f.write(f"Test write successful at {test_dir}") + + # Try to read the file back + with open(test_file, 'r') as f: + content = f.read() + + # Clean up + os.remove(test_file) + os.rmdir(test_dir) + + print(f"✅ SUCCESS: {test_dir}") + success_count += 1 + + except Exception as e: + print(f"❌ FAILED: {test_dir} - {e}") + + print(f"\n📊 Results: {success_count}/{len(test_dirs)} directories passed the test") + + if success_count == len(test_dirs): + print("🎉 All directory permission tests PASSED!") + return True + else: + print("⚠️ Some directory permission tests FAILED!") + return False + +def test_user_info(): + """Display current user and process information.""" + print("\n=== User & Process Information ===") + + # Check if running on Windows or Unix + if hasattr(os, 'getuid'): + # Unix/Linux system + print(f"Current UID: {os.getuid()}") + print(f"Current GID: {os.getgid()}") + print(f"Effective UID: {os.geteuid()}") + print(f"Effective GID: {os.getegid()}") + + # Check if running as root + if os.getuid() == 0: + print("✅ Running as root user") + else: + print("ℹ️ Running as non-root user") + else: + # Windows system + print("ℹ️ Running on Windows system") + print(f"Current user: {os.getenv('USERNAME', 'Unknown')}") + + print(f"Process ID: {os.getpid()}") + print(f"Parent Process ID: {os.getppid()}") + +def test_filebase_connectivity(): + """Test if we can load environment variables needed for Filebase.""" + print("\n=== Environment Variables Test ===") + + required_vars = [ + 'FILEBASE_ENDPOINT', + 'FILEBASE_ACCESS_KEY', + 'FILEBASE_SECRET_KEY', + 'FILEBASE_BUCKET' + ] + + missing_vars = [] + for var in required_vars: + value = os.getenv(var) + if value: + # Don't print sensitive values, just show they exist + if 'KEY' in var: + print(f"✅ {var}: ***redacted*** (length: {len(value)})") + else: + print(f"✅ {var}: {value}") + else: + print(f"❌ {var}: NOT SET") + missing_vars.append(var) + + if missing_vars: + print(f"⚠️ Missing environment variables: {missing_vars}") + return False + else: + print("🎉 All required environment variables are set!") + return True + +if __name__ == "__main__": + print("Starting permission and environment tests...\n") + + test_user_info() + perm_test = test_directory_permissions() + env_test = test_filebase_connectivity() + + print(f"\n=== Final Results ===") + if perm_test and env_test: + print("🎉 ALL TESTS PASSED! The container should work correctly.") + sys.exit(0) + else: + print("❌ SOME TESTS FAILED! 
Check the output above for details.") + sys.exit(1) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4137d84810713fd81ae95ff6636b0a643fc2967b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,31 @@ +# feedparser +# crawl4ai +python-dotenv +requests>=2.25.0 +# pymongo +pandas>=1.3.0 +pyarrow +boto3==1.36.* +finnhub-python==2.4.24 +alpaca-py>=0.6.0 +pydantic-settings>=1.0.0 +sanpy>=0.1.0 +python-dateutil +plotly +nltk +Flask==2.2.2 +werkzeug==2.2.3 +fastapi +uvicorn[standard] +httpx +gradio>=4.0.0 +# trafilatura +rich +numpy +pydantic +# playwright +psutil +beautifulsoup4 +scikit-learn +python-multipart +aiofiles \ No newline at end of file diff --git a/santiment_frequency_controller.py b/santiment_frequency_controller.py new file mode 100644 index 0000000000000000000000000000000000000000..935b7deaf581c734ed8d26a4a56aa527583ea708 --- /dev/null +++ b/santiment_frequency_controller.py @@ -0,0 +1,118 @@ +""" +Santiment Frequency Controller +============================= + +This module provides frequency control for Santiment API calls to preserve API limits. +It tracks execution frequency and limits runs to avoid exceeding API quotas. +""" + +import json +import os +from datetime import datetime, timedelta +from pathlib import Path + + +class SantimentFrequencyController: + """Controls the frequency of Santiment API calls to preserve API limits""" + + def __init__(self, state_file: str = None): + """Initialize the frequency controller + + Args: + state_file: Path to the state file. If None, uses default location. + """ + if state_file is None: + # Try to find the state file in data/santiment directory + try: + from src.config import DATA_DIR + state_file = os.path.join(DATA_DIR, "santiment", "frequency_state.json") + except Exception: + # Fallback to local directory + state_file = "data/santiment/frequency_state.json" + + self.state_file = Path(state_file) + self.state_file.parent.mkdir(parents=True, exist_ok=True) + self._load_state() + + def _load_state(self): + """Load the current state from file""" + if self.state_file.exists(): + try: + with open(self.state_file, 'r') as f: + self.state = json.load(f) + except Exception: + self.state = {} + else: + self.state = {} + + # Ensure required fields exist + if 'last_run' not in self.state: + self.state['last_run'] = None + if 'runs_today' not in self.state: + self.state['runs_today'] = 0 + if 'date' not in self.state: + self.state['date'] = None + + def _save_state(self): + """Save the current state to file""" + try: + with open(self.state_file, 'w') as f: + json.dump(self.state, f, indent=2) + except Exception as e: + print(f"[WARN] Failed to save frequency state: {e}") + + def should_run_santiment(self, max_runs_per_day: int = 2) -> bool: + """Check if Santiment should be allowed to run + + Args: + max_runs_per_day: Maximum number of runs allowed per day + + Returns: + True if Santiment should run, False otherwise + """ + today = datetime.now().strftime("%Y-%m-%d") + + # Reset counter if it's a new day + if self.state.get('date') != today: + self.state['date'] = today + self.state['runs_today'] = 0 + self._save_state() + + # Check if we've exceeded the daily limit + return self.state['runs_today'] < max_runs_per_day + + def record_run(self): + """Record that Santiment has been run""" + today = datetime.now().strftime("%Y-%m-%d") + now = datetime.now().isoformat() + + # Update state + self.state['last_run'] = now + self.state['date'] = today + self.state['runs_today'] = 
self.state.get('runs_today', 0) + 1 + + # Save state + self._save_state() + + print(f"[SANTIMENT] Recorded run #{self.state['runs_today']} for {today}") + + def get_status(self) -> dict: + """Get the current status of the frequency controller + + Returns: + Dictionary with current status information + """ + return { + 'last_run': self.state.get('last_run'), + 'runs_today': self.state.get('runs_today', 0), + 'date': self.state.get('date'), + 'state_file': str(self.state_file) + } + + def reset_daily_count(self): + """Reset the daily run count (for testing or manual reset)""" + today = datetime.now().strftime("%Y-%m-%d") + self.state['date'] = today + self.state['runs_today'] = 0 + self._save_state() + print(f"[SANTIMENT] Reset daily count for {today}") diff --git a/scripts/push_hf_secrets.py b/scripts/push_hf_secrets.py new file mode 100644 index 0000000000000000000000000000000000000000..4f3ff08b0b03712303a224d4ec1b8ef88a890696 --- /dev/null +++ b/scripts/push_hf_secrets.py @@ -0,0 +1,186 @@ +""" +Push all variables from a .env file into a Hugging Face Space as secrets (or variables). + +Requirements: + - huggingface_hub (Python SDK) + Install: pip install -U huggingface_hub + +Usage examples: + python scripts/push_hf_secrets.py --repo your-username/your-space + python scripts/push_hf_secrets.py --repo your-username/your-space --env .env.production + python scripts/push_hf_secrets.py --repo your-username/your-space --dry-run + python scripts/push_hf_secrets.py --repo your-username/your-space --as-variables # send as public variables + +Notes: + - This script is intentionally simple and cross-platform. + - It parses common .env formats (KEY=VALUE, supports quoted values and export prefix). + - It won’t print secret values; only key names are logged. + - "Secrets" are private; "Variables" are public. See: Settings → Secrets and variables +""" + +from __future__ import annotations + +import argparse +import os +import re +import sys +from typing import Dict, Tuple + + +ENV_LINE_RE = re.compile(r"^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*)\s*$") + + +def _unquote(value: str) -> str: + """Strip matching single or double quotes and unescape simple escapes for double quotes. + + - If value is wrapped in double quotes, unescape common sequences (\\n, \\r, \\t, \\" , \\\\). + - If wrapped in single quotes, return inner content as-is (no escapes processing). + - Otherwise, return value trimmed of surrounding whitespace. + """ + if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'): + quote = value[0] + inner = value[1:-1] + if quote == '"': + # Process simple escape sequences + inner = ( + inner.replace(r"\\n", "\n") + .replace(r"\\r", "\r") + .replace(r"\\t", "\t") + .replace(r"\\\"", '"') + .replace(r"\\\\", "\\") + ) + return inner + return value.strip() + + +def parse_env_file(path: str) -> Dict[str, str]: + """Parse a .env-like file into a dict of {KEY: VALUE}. + + Skips blank lines and comments (lines starting with #, ignoring leading whitespace). + Supports lines like: + - KEY=VALUE + - export KEY=VALUE + Values can be quoted with single or double quotes. 
+ """ + if not os.path.isfile(path): + raise FileNotFoundError(f".env file not found: {path}") + + env: Dict[str, str] = {} + with open(path, "r", encoding="utf-8-sig") as f: + for idx, raw in enumerate(f, start=1): + line = raw.rstrip("\n\r") + stripped = line.strip() + if not stripped or stripped.startswith("#"): + continue + + m = ENV_LINE_RE.match(line) + if not m: + # Non-fatal: skip lines that don't match KEY=VALUE + continue + + key, raw_val = m.group(1), m.group(2).strip() + + # If value is unquoted, do not strip inline comments aggressively to avoid breaking tokens. + value = _unquote(raw_val) + env[key] = value + + return env + + +def get_hf_api(): + """Return an authenticated HfApi client or None with a helpful error. + + Uses locally saved token if you previously ran `huggingface-cli login` or + set HF_TOKEN environment variable. + """ + try: + from huggingface_hub import HfApi + except Exception: + sys.stderr.write( + "huggingface_hub is not installed. Install with: pip install -U huggingface_hub\n" + ) + return None + return HfApi() + +def set_secret(api, repo: str, key: str, value: str, dry_run: bool = False) -> int: + if dry_run: + print(f"[DRY RUN] Set secret: {key} -> (hidden) on {repo}") + return 0 + try: + api.add_space_secret(repo_id=repo, key=key, value=value) + print(f"Set secret: {key}") + return 0 + except Exception as e: + sys.stderr.write(f"Error setting secret {key!r} for repo {repo!r}: {e}\n") + return 1 + + +def set_variable(api, repo: str, key: str, value: str, dry_run: bool = False) -> int: + if dry_run: + print(f"[DRY RUN] Set variable: {key} -> (hidden) on {repo}") + return 0 + try: + api.add_space_variable(repo_id=repo, key=key, value=value) + print(f"Set variable: {key}") + return 0 + except Exception as e: + sys.stderr.write(f"Error setting variable {key!r} for repo {repo!r}: {e}\n") + return 1 + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Push .env variables to a Hugging Face Space as secrets or variables.") + parser.add_argument("--repo", required=True, help="Space repo id, e.g. 
your-username/your-space") + parser.add_argument("--env", default=".env", help="Path to .env file (default: .env)") + parser.add_argument("--dry-run", action="store_true", help="Print what would be set without applying changes") + parser.add_argument( + "--as-variables", + action="store_true", + help="Send entries as public variables instead of private secrets", + ) + parser.add_argument( + "--exclude", + action="append", + default=[], + help="Key(s) to exclude (can be repeated)", + ) + args = parser.parse_args(argv) + + api = get_hf_api() + if api is None: + return 127 + + try: + env_map = parse_env_file(args.env) + except Exception as e: + sys.stderr.write(f"Failed to read env file {args.env}: {e}\n") + return 2 + + if not env_map: + print("No variables found in .env; nothing to do.") + return 0 + + excluded = set(args.exclude or []) + total = 0 + failures = 0 + for key, value in env_map.items(): + if key in excluded: + continue + total += 1 + if args.as_variables: + rc = set_variable(api, args.repo, key, value, args.dry_run) + else: + rc = set_secret(api, args.repo, key, value, args.dry_run) + if rc != 0: + failures += 1 + + if failures: + sys.stderr.write(f"Completed with {failures}/{total} failures.\n") + return 1 + + print(f"Completed: {total} secrets {'validated' if args.dry_run else 'set'} for {args.repo}.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/api/gradio_main.py b/src/api/gradio_main.py new file mode 100644 index 0000000000000000000000000000000000000000..76aa9e4325d33162dee7d90177255f56ca521ff9 --- /dev/null +++ b/src/api/gradio_main.py @@ -0,0 +1,265 @@ +import gradio as gr +import json +import os +import sys +import logging +import pandas as pd +import time +from datetime import datetime, timedelta +import psutil +from pathlib import Path + +# Add src to Python path for imports +sys.path.insert(0, '/app/src') +sys.path.insert(0, '/app') + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[logging.StreamHandler(sys.stdout)] +) +logger = logging.getLogger(__name__) + +def get_health_status(): + """Get basic health status""" + try: + # Get process info + process = psutil.Process() + memory_mb = process.memory_info().rss / 1024 / 1024 + cpu_percent = process.cpu_percent() + + # Get system info + memory = psutil.virtual_memory() + disk = psutil.disk_usage('/') + + # Check scheduler status + scheduler_running = False + last_run_time = "Unknown" + try: + last_run_file = "/app/deployment/last_run.txt" + if os.path.exists(last_run_file): + with open(last_run_file, 'r') as f: + last_run_str = f.read().strip() + last_run = datetime.strptime(last_run_str, '%Y-%m-%d %H:%M:%S') + time_since_last_run = (datetime.now() - last_run).total_seconds() + scheduler_running = time_since_last_run < 2700 # 45 minutes + last_run_time = last_run_str + except Exception as e: + logger.warning(f"Could not check scheduler status: {e}") + + return { + "status": "healthy" if memory_mb < 400 else "warning", + "timestamp": datetime.now().isoformat(), + "process_memory_mb": round(memory_mb, 2), + "process_cpu_percent": round(cpu_percent, 2), + "system_memory_percent": round(memory.percent, 1), + "system_memory_available_gb": round(memory.available / (1024**3), 2), + "disk_free_gb": round(disk.free / (1024**3), 2), + "scheduler_running": scheduler_running, + "scheduler_last_run": last_run_time + } + except Exception as e: + logger.error(f"Health check failed: {e}") + return { + 
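+            # Minimal error payload so the dashboard JSON panel still renders when metrics collection fails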
"status": "error", + "error": str(e), + "timestamp": datetime.now().isoformat() + } + +def get_pipeline_status(): + """Get data pipeline status""" + try: + data_dirs = [ + "/data/merged/features", + "/data/merged/train", + "/data/alpaca", + "/data/advisorai-data" + ] + + recent_files = 0 + total_size = 0 + + for data_dir in data_dirs: + if os.path.exists(data_dir): + for root, dirs, files in os.walk(data_dir): + for file in files: + if file.endswith(('.json', '.parquet', '.csv')): + file_path = os.path.join(root, file) + try: + stat = os.stat(file_path) + # Count files modified in last 24 hours + if time.time() - stat.st_mtime < 86400: + recent_files += 1 + total_size += stat.st_size + except Exception: + continue + + return { + "status": "running" if recent_files > 0 else "stale", + "recent_files_24h": recent_files, + "total_data_size_gb": round(total_size / (1024**3), 2), + "last_check": datetime.now().isoformat() + } + except Exception as e: + logger.error(f"Pipeline status check failed: {e}") + return { + "status": "error", + "error": str(e), + "last_check": datetime.now().isoformat() + } + +def get_recent_files(): + """Get list of recent files in the data directories""" + try: + base_paths = [ + "/data/merged/features", + "/data/merged/train", + "/data/alpaca", + "/data/advisorai-data/features" + ] + + recent_files = [] + for base_path in base_paths: + if os.path.exists(base_path): + for root, dirs, files in os.walk(base_path): + for file in files[:10]: # Limit to 10 files per directory + file_path = os.path.join(root, file) + try: + stat = os.stat(file_path) + recent_files.append({ + "File": file, + "Path": file_path.replace("/data/", ""), + "Size": f"{stat.st_size / (1024**2):.2f} MB", + "Modified": datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M") + }) + except Exception: + continue + + # Sort by modification time and take most recent 20 + recent_files.sort(key=lambda x: x["Modified"], reverse=True) + return recent_files[:20] + + except Exception as e: + logger.error(f"Error getting recent files: {e}") + return [{"Error": str(e)}] + +def get_logs(): + """Get recent log entries""" + try: + log_files = [ + "/data/logs/scheduler.log", + "/data/logs/data_pipeline.log", + "/data/logs/monitor.log" + ] + + logs = [] + for log_file in log_files: + if os.path.exists(log_file): + try: + with open(log_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + # Get last 10 lines + recent_lines = lines[-10:] if len(lines) > 10 else lines + logs.append(f"=== {os.path.basename(log_file)} ===\n") + logs.extend(recent_lines) + logs.append("\n") + except Exception as e: + logs.append(f"Error reading {log_file}: {str(e)}\n") + + return "".join(logs) if logs else "No log files found" + + except Exception as e: + logger.error(f"Error getting logs: {e}") + return f"Error getting logs: {str(e)}" + +# Create Gradio interface +with gr.Blocks(title="AdvisorAI Data Pipeline Monitor", theme=gr.themes.Soft()) as app: + gr.Markdown("# 🤖 AdvisorAI Data Pipeline Monitor") + gr.Markdown("Real-time monitoring of the AdvisorAI data collection and processing pipeline") + + with gr.Tabs(): + with gr.TabItem("📊 Dashboard"): + with gr.Row(): + with gr.Column(): + gr.Markdown("### Health Status") + health_display = gr.JSON(label="System Health & Status") + + with gr.Column(): + gr.Markdown("### Pipeline Status") + pipeline_display = gr.JSON(label="Data Pipeline Status") + + with gr.Row(): + refresh_btn = gr.Button("🔄 Refresh", variant="primary") + + with gr.TabItem("📁 Recent Files"): + gr.Markdown("### 
Recently Modified Data Files") + files_display = gr.Dataframe( + headers=["File", "Path", "Size", "Modified"], + datatype=["str", "str", "str", "str"], + label="Recent Files" + ) + refresh_files_btn = gr.Button("🔄 Refresh Files") + + with gr.TabItem("📝 Logs"): + gr.Markdown("### Recent Log Entries") + logs_display = gr.Textbox( + label="Recent Logs", + lines=20, + max_lines=30, + show_copy_button=True + ) + refresh_logs_btn = gr.Button("🔄 Refresh Logs") + + # Event handlers + def refresh_dashboard(): + health = get_health_status() + pipeline = get_pipeline_status() + return json.dumps(health, indent=2), json.dumps(pipeline, indent=2) + + def refresh_files(): + files = get_recent_files() + if files and isinstance(files[0], dict) and "Error" not in files[0]: + return [[f["File"], f["Path"], f["Size"], f["Modified"]] for f in files] + else: + return [["Error", str(files), "", ""]] + + def refresh_logs(): + return get_logs() + + # Connect event handlers + refresh_btn.click( + refresh_dashboard, + outputs=[health_display, pipeline_display] + ) + + refresh_files_btn.click( + refresh_files, + outputs=[files_display] + ) + + refresh_logs_btn.click( + refresh_logs, + outputs=[logs_display] + ) + + # Auto-refresh on load + app.load( + refresh_dashboard, + outputs=[health_display, pipeline_display] + ) + + app.load( + refresh_files, + outputs=[files_display] + ) + +if __name__ == "__main__": + logger.info("Starting Gradio app...") + app.launch( + server_name="0.0.0.0", + server_port=7860, + share=False, + show_error=True, + quiet=False + ) diff --git a/src/api/main.py b/src/api/main.py new file mode 100644 index 0000000000000000000000000000000000000000..06b6e0d135b32e0bee1fec2f18e1e0c28032a8c2 --- /dev/null +++ b/src/api/main.py @@ -0,0 +1,114 @@ +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse, HTMLResponse +import uvicorn +import logging +import sys +from src.api.routes.health import health_status +from src.api.routes.isrunning import is_running + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(sys.stdout) + ] +) + +logger = logging.getLogger(__name__) + +app = FastAPI( + title="AdvisorAI Data API", + description="API for AdvisorAI data pipeline and health monitoring", + version="1.0.0" +) + +# Add CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +@app.exception_handler(Exception) +async def global_exception_handler(request, exc): + logger.error(f"Global exception handler caught: {exc}", exc_info=True) + return JSONResponse( + status_code=500, + content={"detail": "Internal server error", "error": str(exc)} + ) + +@app.get('/health') +def health(): + """Enhanced health check endpoint""" + try: + return health_status() + except Exception as e: + logger.error(f"Health check failed: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=f"Health check failed: {str(e)}") + +# Route to check if there are any JSON files under data/merged/features (relative path) +@app.get('/status') +def status(): + """Check if the data pipeline is running and has recent data""" + try: + return is_running() + except Exception as e: + logger.error(f"Status check failed: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=f"Status check failed: {str(e)}") + +@app.get('/', 
response_class=HTMLResponse) +def root(): + """Root endpoint returns simple HTML so HF Spaces iframe can render it.""" + html = """ + + + + + + AdvisorAI Data API + + + +

+      <h1>AdvisorAI Data API</h1>
+      <p>Service is running.</p>

+ + + + """ + return HTMLResponse(content=html, status_code=200) + +@app.get('/api') +def api_root(): + """JSON root for programmatic clients.""" + return { + "message": "AdvisorAI Data API", + "version": "1.0.0", + "endpoints": { + "/health": "Health check with system metrics", + "/status": "Data pipeline status", + "/api": "This JSON endpoint", + "/": "HTML landing page for Spaces" + } + } + +if __name__ == "__main__": + uvicorn.run( + "src.api.main:app", + host="0.0.0.0", + port=10000, + workers=1, + timeout_keep_alive=30, + access_log=True + ) \ No newline at end of file diff --git a/src/api/routes/health.py b/src/api/routes/health.py new file mode 100644 index 0000000000000000000000000000000000000000..944bd4adde0c7e92de15f998a0404982e6ab2c69 --- /dev/null +++ b/src/api/routes/health.py @@ -0,0 +1,67 @@ +import os +import psutil +import time +from datetime import datetime +from src.config import DATA_DIR, LAST_RUN_PATH + +def health_status(): + """Enhanced health check that monitors actual service health""" + try: + # Check memory usage + process = psutil.Process() + memory_mb = process.memory_info().rss / 1024 / 1024 + cpu_percent = process.cpu_percent() + + # Check if scheduler is running + scheduler_running = False + try: + with open(LAST_RUN_PATH, 'r') as f: + last_run_str = f.read().strip() + last_run = datetime.strptime(last_run_str, '%Y-%m-%d %H:%M:%S') + # Consider scheduler healthy if it ran within last 45 minutes + time_since_last_run = (datetime.now() - last_run).total_seconds() + scheduler_running = time_since_last_run < 2700 # 45 minutes + except Exception: + scheduler_running = False + + # Check disk space (prefer DATA_DIR) + disk_usage = psutil.disk_usage(DATA_DIR if os.path.exists(DATA_DIR) else '/') + disk_free_gb = disk_usage.free / (1024**3) + + # Determine overall health + health_issues = [] + # Memory checks + if memory_mb > 1024: # More than 1GB + health_issues.append(f"High memory usage: {memory_mb:.1f}MB (over 1GB)") + elif memory_mb > 512: # More than 512MB for free plan + health_issues.append(f"High memory usage: {memory_mb:.1f}MB (over 512MB)") + + if cpu_percent > 80: + health_issues.append(f"High CPU usage: {cpu_percent:.1f}%") + + if disk_free_gb < 1: # Less than 1GB free + health_issues.append(f"Low disk space: {disk_free_gb:.1f}GB free") + + if not scheduler_running: + health_issues.append("Scheduler not running or stale") + + status = "healthy" if not health_issues else "degraded" + + return { + "status": status, + "timestamp": datetime.now().isoformat(), + "metrics": { + "memory_mb": round(memory_mb, 1), + "cpu_percent": round(cpu_percent, 1), + "disk_free_gb": round(disk_free_gb, 1), + "scheduler_running": scheduler_running + }, + "issues": health_issues + } + + except Exception as e: + return { + "status": "error", + "timestamp": datetime.now().isoformat(), + "error": str(e) + } \ No newline at end of file diff --git a/src/api/routes/isrunning.py b/src/api/routes/isrunning.py new file mode 100644 index 0000000000000000000000000000000000000000..97bab94b8ee35d0d40ac5502a620663277937f6b --- /dev/null +++ b/src/api/routes/isrunning.py @@ -0,0 +1,34 @@ +import os +from datetime import datetime +from fastapi import APIRouter + +from ... 
import config as app_config + +router = APIRouter() + + +@router.get("/status") +def is_running(): + """Return a small status dict: whether pipeline appears to be running and last run time.""" + json_folder = os.path.join(app_config.DATA_DIR, 'merged', 'features') + has_json = False + if os.path.exists(json_folder): + try: + has_json = any(f.endswith('.json') for f in os.listdir(json_folder)) + except Exception: + has_json = False + + last_run_file = app_config.LAST_RUN_PATH + last_run_display = 'Unknown' + try: + if os.path.exists(last_run_file): + with open(last_run_file, 'r') as f: + last_run_str = f.read().strip() + last_run_dt = datetime.strptime(last_run_str, '%Y-%m-%d %H:%M:%S') + minutes_ago = int((datetime.now() - last_run_dt).total_seconds() // 60) + last_run_display = f"{minutes_ago} minutes ago" + except Exception: + last_run_display = 'Unknown' + + status = "Running" if not has_json else "Not Running" + return {"status": status, "last_run": last_run_display} \ No newline at end of file diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000000000000000000000000000000000000..64ed7d60fbdb3242d94bc7863341fa3c374ba723 --- /dev/null +++ b/src/config.py @@ -0,0 +1,66 @@ +import os +import tempfile + + +def _is_writable(path: str) -> bool: + try: + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + test_fd, test_path = tempfile.mkstemp(prefix='.wtest_', dir=path) + os.close(test_fd) + os.unlink(test_path) + return True + except Exception: + return False + + +def _detect_data_dir() -> str: + # 1) Respect DATA_DIR env only if writable + env = os.getenv('DATA_DIR') + if env and _is_writable(env): + return env + # 2) Prefer /data if writable (Spaces) + if _is_writable('/data'): + return '/data' + # 3) Local dev fallback: /app/data if writable + if _is_writable('/app/data'): + return '/app/data' + # 4) Final fallback: /tmp + return '/tmp' + + +DATA_DIR = _detect_data_dir() + +# Logs: prefer DATA_DIR/logs, fallback to /tmp/logs +_preferred_logs = os.getenv('LOG_DIR') or os.path.join(DATA_DIR, 'logs') +try: + os.makedirs(_preferred_logs, exist_ok=True) + # sanity: try to write + if not _is_writable(_preferred_logs): + raise PermissionError("Log dir not writable") +except Exception: + _preferred_logs = '/tmp/logs' + os.makedirs(_preferred_logs, exist_ok=True) + +LOG_DIR = _preferred_logs + +# Path for scheduler's last_run marker +def _compute_last_run_path(base_dir: str) -> str: + candidates = [ + os.path.join(base_dir, 'deployment', 'last_run.txt'), + os.path.join(base_dir, 'last_run.txt'), + '/tmp/last_run.txt', + ] + for p in candidates: + try: + os.makedirs(os.path.dirname(p), exist_ok=True) + # test write + with open(p, 'a'): + pass + return p + except Exception: + continue + return '/tmp/last_run.txt' + + +LAST_RUN_PATH = _compute_last_run_path(DATA_DIR) diff --git a/src/data_cloud/cloud_utils.py b/src/data_cloud/cloud_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..78eeeb352d6071965775f835f93b84fb2e8c9ce6 --- /dev/null +++ b/src/data_cloud/cloud_utils.py @@ -0,0 +1,163 @@ +""" +cloud_utils.py – Unified utilities for HTTP fetch and cloud/local storage operations. 
+ +Provides: + • fetch_content / fetch_json for HTTP GET + • StorageHandler class with upload/download and fallback to local filesystem + - Methods set self.last_mode to 'cloud' or 'local' + - Local files are stored under a base directory + +Usage: + from cloud_utils import StorageHandler, fetch_json + +Requirements: + • boto3 and botocore + • requests + • ENV vars for cloud credentials (e.g. FILEBASE_*) +""" +import os +import errno +import requests +import boto3 +from botocore.config import Config +from botocore.exceptions import BotoCoreError, ClientError + +# HTTP Fetch utilities --------------------------------------------------------- +def fetch_content(url, headers=None, timeout=15): + """Fetch binary content via HTTP GET.""" + resp = requests.get(url, headers=headers, timeout=timeout, stream=False) + resp.raise_for_status() + return resp.content + +def fetch_json(url, headers=None, timeout=15): + """Fetch JSON data via HTTP GET.""" + resp = requests.get(url, headers=headers, timeout=timeout) + resp.raise_for_status() + data = resp.json() + return data.get("data", data) if isinstance(data, dict) else data + +def fetch_text(url, headers=None, timeout=15, encoding='utf-8'): + """Fetch text content via HTTP GET.""" + resp = requests.get(url, headers=headers, timeout=timeout) + resp.raise_for_status() + resp.encoding = encoding + return resp.text + +# Storage Handler --------------------------------------------------------------- +class StorageHandler: + def list_prefix(self, prefix): + """List all object keys in the given S3 prefix. Returns a list of keys. Local fallback returns empty list.""" + if self.s3 and self.bucket: + paginator = self.s3.get_paginator('list_objects_v2') + keys = [] + for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix): + for obj in page.get('Contents', []): + keys.append(obj['Key']) + return keys + # Local fallback: not implemented (could walk local filesystem if needed) + return [] + def __init__(self, endpoint_url, access_key, secret_key, bucket_name, local_base="data"): + """ + Initialize cloud storage client and local base path. + endpoint_url: S3-compatible endpoint URL + bucket_name: target bucket name (if None/empty, operate in local-only mode) + local_base: directory prefix for local fallback files + """ + self.bucket = bucket_name + self.local_base = local_base.rstrip(os.sep) + self.last_mode = None # 'cloud' or 'local' + if bucket_name: + # boto3 client config + cfg = Config(signature_version="s3v4", s3={"addressing_style": "path"}) + self.s3 = boto3.client( + "s3", + endpoint_url=endpoint_url, + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + config=cfg, + region_name='us-east-1' + ) + else: + self.s3 = None + + def _ensure_local_dir(self, key): + path = os.path.join(self.local_base, key) + os.makedirs(os.path.dirname(path), exist_ok=True) + return path + + def download(self, key): + """Download object by key. Returns bytes, sets last_mode. 
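+        Falls back to the local copy under local_base when S3 is not configured or the S3 fetch fails.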
Raises FileNotFoundError if not found.""" + if self.s3 and self.bucket: + try: + resp = self.s3.get_object(Bucket=self.bucket, Key=key) + data = resp['Body'].read() + self.last_mode = 'cloud' + print(f"[OK] Downloaded {key} from s3://{self.bucket}/{key}") + return data + except (ClientError, BotoCoreError) as e: + print(f"[WARN] Could not download {key} from S3: {e}") + # Always fallback to local if S3 is not configured or download fails + local_path = self._ensure_local_dir(key) + try: + with open(local_path, 'rb') as f: + data = f.read() + self.last_mode = 'local' + print(f"[FALLBACK] Loaded {key} from local {local_path}") + return data + except FileNotFoundError: + print(f"[ERROR] {key} not found in S3 or locally at {local_path}") + raise + + def upload(self, key, data, content_type='application/octet-stream'): + """Upload bytes to cloud, fallback to local. Sets last_mode. Returns True if cloud, False if local.""" + if self.s3 and self.bucket: + try: + self.s3.put_object(Bucket=self.bucket, Key=key, Body=data, ContentType=content_type) + self.last_mode = 'cloud' + print(f"[OK] Uploaded {key} -> s3://{self.bucket}/{key}") + return True + except (ClientError, BotoCoreError) as e: + print(f"[ERROR] Failed uploading {key}: {e}") + # Always fallback to local if S3 is not configured or upload fails + local_path = self._ensure_local_dir(key) + with open(local_path, 'wb') as f: + f.write(data) + self.last_mode = 'local' + print(f"[FALLBACK] Saved {key} locally -> {local_path}") + return False + + def exists(self, key): + """Check for existence of object. Returns True if found in cloud or local.""" + if self.s3 and self.bucket: + try: + self.s3.head_object(Bucket=self.bucket, Key=key) + return True + except (ClientError, BotoCoreError): + pass + local_path = os.path.join(self.local_base, key) + return os.path.exists(local_path) + + def delete(self, key): + """Delete object in cloud or local fallback.""" + if self.s3 and self.bucket: + try: + self.s3.delete_object(Bucket=self.bucket, Key=key) + self.last_mode = 'cloud' + print(f"[OK] Deleted {key} from s3://{self.bucket}/{key}") + return + except Exception: + pass + local_path = os.path.join(self.local_base, key) + try: + os.remove(local_path) + self.last_mode = 'local' + print(f"[FALLBACK] Deleted {key} locally -> {local_path}") + except OSError as e: + if e.errno != errno.ENOENT: + raise + + def get_last_mode(self): + """Return 'cloud' or 'local' depending on last operation.""" + return self.last_mode + +# End of cloud_utils.py diff --git a/src/fetchers/advisorai_data/advisorai_data_fetcher.py b/src/fetchers/advisorai_data/advisorai_data_fetcher.py new file mode 100644 index 0000000000000000000000000000000000000000..888e5306bea158438a8ec5be3491b27cd439f55b --- /dev/null +++ b/src/fetchers/advisorai_data/advisorai_data_fetcher.py @@ -0,0 +1,226 @@ +""" +advisorai_data_fetcher.py – Fetches feature files from AdvisorAI Data API and MongoDB, +then uploads them to Filebase S3 instead of local storage. + +✱ 2025-07-11 – switched backend from local filesystem to Filebase S3 + • Uses boto3 against FILEBASE_ENDPOINT + • No local disk writes; everything streams directly to S3 + +Requirements: + • FILEBASE_ENDPOINT env var, e.g. 
https://s3.filebase.com + • FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY env vars + • FILEBASE_BUCKET env var (your bucket name) + • ADVISORAI_data_API_URL and ADVISORAI_data_API_KEY env vars for the Data API + • MONGODB_URI, MONGODB_DATABASE, MONGODB_COLLECTION_FEATURES env vars for archive fetch +""" + +import os +import sys +import requests +import asyncio +from io import BytesIO + +from dotenv import load_dotenv +import pandas as pd +# from pymongo import MongoClient + + +# Ensure src is in sys.path for direct script execution +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) +from data_cloud.cloud_utils import StorageHandler + +# ─── Configuration ──────────────────────────────────────────────────────────── +load_dotenv() + +# AdvisorAI Data API +API_BASE_URL = os.getenv("ADVISORAI_data_API_URL", "http://localhost:8000") +API_KEY = os.getenv("ADVISORAI_data_API_KEY") +if not API_KEY: + print("[ERROR] ADVISORAI_data_API_KEY must be set") + sys.exit(1) +HEADERS = {"Authorization": f"Bearer {API_KEY}"} + +# MongoDB for archive features +MONGODB_URI = os.getenv("MONGODB_URI", "mongodb://localhost:27017") +MONGODB_DATABASE = os.getenv("MONGODB_DATABASE", "AdvisorAI") +MONGODB_COLLECTION_FEATURES = os.getenv("MONGODB_COLLECTION_FEATURES", "arch_features") + +# Filebase S3 credentials +FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT") +FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY") +FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY") +FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET") +if not all([FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, FILEBASE_BUCKET]): + print("[ERROR] FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, and FILEBASE_BUCKET must be set") + sys.exit(1) + + + +# ─── Fetch and upload functions ─────────────────────────────────────────────── + +def fetch_and_upload_latest_parquet(storage): + """Fetch latest Parquet from API and upload to S3 bucket at features/latest_features.parquet""" + url = f"{API_BASE_URL}/features/latest" + resp = requests.get(url, headers=HEADERS, stream=True) + resp.raise_for_status() + data = resp.content + key = "advisorai-data/features/latest_features.parquet" + try: + storage.upload(key, data, content_type="application/octet-stream") + print(f"[OK] Uploaded latest_features.parquet -> {storage.get_last_mode()}:{key}") + # Also save locally + local_path = os.path.join("data", key) + os.makedirs(os.path.dirname(local_path), exist_ok=True) + with open(local_path, "wb") as f: + f.write(data) + print(f"[OK] Saved locally: {local_path}") + except Exception as e: + print(f"[ERROR] Failed uploading latest_features.parquet: {e}", file=sys.stderr) + +async def fetch_and_upload_jsons(storage): + """List JSON feature files, fetch them, and upload to S3 under features/""" + url = f"{API_BASE_URL}/features" + resp = requests.get(url, headers=HEADERS) + resp.raise_for_status() + files = resp.json().get("files", []) + json_files = [f["filename"] for f in files if f.get("file_type") == "json"] + if not json_files: + print("[INFO] No JSON feature files to upload.") + return + # Delete all old feature_report_*.json files before saving any new ones (both locally and on S3) + import glob + import os + # Local delete (as before) + features_dir = os.path.join("data", "advisorai-data", "features") + report_files = glob.glob(os.path.join(features_dir, "feature_report_*.json")) + for old_report in report_files: + try: + os.remove(old_report) + print(f"[INFO] Deleted old local 
report: {old_report}") + except Exception as e: + print(f"[WARN] Could not delete local {old_report}: {e}", file=sys.stderr) + + # S3 delete (list all files in the prefix and filter manually) + try: + s3_files = storage.list_prefix("advisorai-data/features/") + s3_report_files = [f for f in s3_files if f.startswith("advisorai-data/features/feature_report_") and f.endswith(".json")] + for s3_report in s3_report_files: + try: + storage.delete(s3_report) + print(f"[INFO] Deleted old S3 report: {s3_report}") + except Exception as e: + print(f"[WARN] Could not delete S3 {s3_report}: {e}", file=sys.stderr) + except Exception as e: + print(f"[WARN] Could not list/delete S3 feature_report_*.json: {e}", file=sys.stderr) + + for fname in json_files: + dl_url = f"{API_BASE_URL}/features/{fname}" + r = requests.get(dl_url, headers=HEADERS, stream=True) + r.raise_for_status() + data = r.content + key = f"advisorai-data/features/{fname}" + try: + storage.upload(key, data, content_type="application/json") + print(f"[OK] Uploaded {fname} -> {storage.get_last_mode()}:{key}") + # Also save locally + local_path = os.path.join("data", key) + os.makedirs(os.path.dirname(local_path), exist_ok=True) + with open(local_path, "wb") as f: + f.write(data) + print(f"[OK] Saved locally: {local_path}") + except Exception as e: + print(f"[ERROR] Failed uploading {fname}: {e}", file=sys.stderr) + +# async def fetch_and_upload_archive_parquet(storage): +# """Fetch archive from MongoDB, convert to Parquet, and upload to S3 at archive/merged_features.parquet""" +# client = MongoClient(MONGODB_URI) +# db = client[MONGODB_DATABASE] +# coll = db[MONGODB_COLLECTION_FEATURES] +# docs = list(coll.find()) +# if not docs: +# print("[INFO] No documents in archive collection.") +# return +# for d in docs: +# d.pop("_id", None) +# df = pd.DataFrame(docs) +# buf = BytesIO() +# df.to_parquet(buf, index=False) +# data = buf.getvalue() +# key = "advisorai-data/archive/merged_features.parquet" +# try: +# storage.upload(key, data, content_type="application/octet-stream") +# print(f"[OK] Uploaded archive Parquet -> {storage.get_last_mode()}:{key}") +# # Also save locally +# local_path = os.path.join("data", key) +# os.makedirs(os.path.dirname(local_path), exist_ok=True) +# with open(local_path, "wb") as f: +# f.write(data) +# print(f"[OK] Saved locally: {local_path}") +# except Exception as e: +# print(f"[ERROR] Failed uploading archive Parquet: {e}", file=sys.stderr) + +def create_train_merged_parquet(storage): + """Create advisorai-data/train/merged_features.parquet by merging archive and latest features, deduping by (symbol, interval_timestamp).""" + # Download archive/merged_features.parquet + from io import BytesIO + import pandas as pd + archive_key = "advisorai-data/archive/merged_features.parquet" + latest_key = "advisorai-data/features/latest_features.parquet" + train_key = "advisorai-data/train/merged_features.parquet" + try: + archive_buf = BytesIO(storage.download(archive_key)) + df_archive = pd.read_parquet(archive_buf) + except Exception as e: + print(f"[WARN] Could not load archive parquet: {e}", file=sys.stderr) + df_archive = pd.DataFrame() + try: + latest_buf = BytesIO(storage.download(latest_key)) + df_latest = pd.read_parquet(latest_buf) + except Exception as e: + print(f"[WARN] Could not load latest features parquet: {e}", file=sys.stderr) + df_latest = pd.DataFrame() + if df_archive.empty and df_latest.empty: + print("[INFO] No data to merge for train/merged_features.parquet.") + return + # Concatenate and deduplicate by 
(symbol, interval_timestamp) + df_all = pd.concat([df_archive, df_latest], ignore_index=True) + if 'symbol' in df_all.columns and 'interval_timestamp' in df_all.columns: + df_all = df_all.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last") + else: + print("[WARN] 'symbol' or 'interval_timestamp' column missing, skipping deduplication.") + # Save to train/merged_features.parquet + buf = BytesIO() + df_all.to_parquet(buf, index=False) + data = buf.getvalue() + try: + storage.upload(train_key, data, content_type="application/octet-stream") + print(f"[OK] Uploaded train merged features -> {storage.get_last_mode()}:{train_key}") + # Also save locally + local_path = os.path.join("data", train_key) + os.makedirs(os.path.dirname(local_path), exist_ok=True) + with open(local_path, "wb") as f: + f.write(data) + print(f"[OK] Saved locally: {local_path}") + except Exception as e: + print(f"[ERROR] Failed uploading train merged features: {e}", file=sys.stderr) + +# ─── Main entrypoint ───────────────────────────────────────────────────────── + +def main(): + # Use StorageHandler with both S3 and local enabled + storage = StorageHandler( + endpoint_url=FILEBASE_ENDPOINT, + access_key=FILEBASE_ACCESS_KEY, + secret_key=FILEBASE_SECRET_KEY, + bucket_name=FILEBASE_BUCKET, + local_base="data" + ) + fetch_and_upload_latest_parquet(storage) + asyncio.run(fetch_and_upload_jsons(storage)) + # asyncio.run(fetch_and_upload_archive_parquet(storage)) + create_train_merged_parquet(storage) + +if __name__ == "__main__": + main() diff --git a/src/fetchers/alpaca_api/__init__.py b/src/fetchers/alpaca_api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..816f6d36487f974eb8071b8c265b9ceabc1ce140 --- /dev/null +++ b/src/fetchers/alpaca_api/__init__.py @@ -0,0 +1,32 @@ +# alpaca/__init__.py + +from .config import settings +from .clients import StocksClient, CryptoClient, OptionsClient +from .fetchers import ( + fetch_stock_bars, + fetch_crypto_bars, + fetch_option_bars, + fetch_stock_trades, + fetch_crypto_trades, + fetch_stock_quotes, + fetch_crypto_quotes, +) +from .utils import logger, backoff, to_rfc3339, parse_rfc3339 + +__all__ = [ + "settings", + "StocksClient", + "CryptoClient", + "OptionsClient", + "fetch_stock_bars", + "fetch_crypto_bars", + "fetch_option_bars", + "fetch_stock_trades", + "fetch_crypto_trades", + "fetch_stock_quotes", + "fetch_crypto_quotes", + "logger", + "backoff", + "to_rfc3339", + "parse_rfc3339", +] diff --git a/src/fetchers/alpaca_api/clients/__init__.py b/src/fetchers/alpaca_api/clients/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..562a80eb157b30ca070d6cb8d3c71641cda96ea9 --- /dev/null +++ b/src/fetchers/alpaca_api/clients/__init__.py @@ -0,0 +1,7 @@ +# alpaca/clients/__init__.py + +from .stocks import StocksClient +from .crypto import CryptoClient +from .options import OptionsClient + +__all__ = ["StocksClient", "CryptoClient", "OptionsClient"] diff --git a/src/fetchers/alpaca_api/clients/crypto.py b/src/fetchers/alpaca_api/clients/crypto.py new file mode 100644 index 0000000000000000000000000000000000000000..42faa5e6b7f65f4bba0c2adaf83bfb1431caf5b9 --- /dev/null +++ b/src/fetchers/alpaca_api/clients/crypto.py @@ -0,0 +1,95 @@ +# alpaca/clients/crypto.py + +from datetime import datetime +from typing import Optional +import re +from alpaca.data.historical import CryptoHistoricalDataClient +from alpaca.data.requests import ( + CryptoBarsRequest, + CryptoTradesRequest, + CryptoQuoteRequest, +) +from 
alpaca.data.timeframe import TimeFrame, TimeFrameUnit +from ..config import settings + +class CryptoClient: + def __init__(self): + # You can omit api_key/secret for crypto, but providing them raises rate limits + self.client = CryptoHistoricalDataClient( + api_key=settings.ALPACA_API_KEY, + secret_key=settings.ALPACA_API_SECRET, + ) + + def get_bars( + self, + symbol: str, + timeframe: str | TimeFrame, + start: datetime, + end: datetime, + limit: int = 1000, + feed: Optional[str] = None, + ): + """ + Fetch historical OHLCV bars for a given crypto symbol. + Accepts either a TimeFrame enum or a string like "1Day", "5Minute", etc. + """ + if isinstance(timeframe, str): + m = re.fullmatch(r"(\d+)([A-Za-z]+)", timeframe) + if not m: + raise ValueError(f"Invalid timeframe format: {timeframe!r}") + amt, unit_str = m.groups() + unit_key = unit_str.capitalize().rstrip("s") + unit = TimeFrameUnit[unit_key] + timeframe = TimeFrame(int(amt), unit) + req = CryptoBarsRequest( + symbol_or_symbols=symbol, + timeframe=timeframe, + start=start, + end=end, + limit=limit, + feed=feed, + ) + return self.client.get_crypto_bars(req) + # ↳ uses CryptoBarsRequest(symbol_or_symbols, timeframe, start, end, limit, feed) :contentReference[oaicite:0]{index=0} + + def get_trades( + self, + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: Optional[str] = None, + ): + """ + Fetch historical trade ticks for a given crypto symbol. + """ + req = CryptoTradesRequest( + symbol_or_symbols=symbol, + start=start, + end=end, + limit=limit, + sort=sort, + ) + return self.client.get_crypto_trades(req) + # ↳ uses CryptoTradesRequest(symbol_or_symbols, start, end, limit, sort) :contentReference[oaicite:1]{index=1} + + def get_quotes( + self, + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: Optional[str] = None, + ): + """ + Fetch historical Level-1 quotes for a given crypto symbol. 
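+
+        Illustrative example (assumes start/end are datetimes):
+            quotes = CryptoClient().get_quotes("BTC/USD", start, end, limit=100)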
+ """ + req = CryptoQuoteRequest( + symbol_or_symbols=symbol, + start=start, + end=end, + limit=limit, + sort=sort, + ) + return self.client.get_crypto_quotes(req) + # ↳ uses CryptoQuoteRequest(symbol_or_symbols, start, end, limit, sort) :contentReference[oaicite:2]{index=2} diff --git a/src/fetchers/alpaca_api/clients/main.py b/src/fetchers/alpaca_api/clients/main.py new file mode 100644 index 0000000000000000000000000000000000000000..c9e2adab8cfa53ef72013d4eab9fb4051b3e3b78 --- /dev/null +++ b/src/fetchers/alpaca_api/clients/main.py @@ -0,0 +1,45 @@ +# from datetime import datetime, timedelta +# import sys +# import os +# import pandas as pd +# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) +# from alpaca_api.clients.stocks import StocksClient + +# def normalize_records(records): +# dicts = [rec.model_dump() for rec in records] +# for rec in dicts: +# for k, v in rec.items(): +# if hasattr(v, 'isoformat'): +# rec[k] = v.isoformat() +# return dicts + +# if __name__ == "__main__": +# client = StocksClient() +# symbol = "AAPL" +# timeframe = "1Day" +# end = datetime.utcnow() +# start = end - timedelta(days=7) + +# output_dir = os.path.join("..", "..", "..", "data", "alpaca") +# os.makedirs(output_dir, exist_ok=True) + +# print(f"Testing get_bars for {symbol} from {start} to {end}") +# bars = client.get_bars(symbol, timeframe, start, end, limit=10) +# # print("Bars:", bars) +# bars_records = normalize_records(bars.data[symbol]) +# bars_df = pd.DataFrame(bars_records) +# bars_df.to_parquet(os.path.join(output_dir, f"{symbol}_bars.parquet"), index=False) + +# print(f"Testing get_trades for {symbol} from {start} to {end}") +# trades = client.get_trades(symbol, start, end, limit=10) +# # print("Trades:", trades) +# trades_records = normalize_records(trades.data[symbol]) +# trades_df = pd.DataFrame(trades_records) +# trades_df.to_parquet(os.path.join(output_dir, f"{symbol}_trades.parquet"), index=False) + +# print(f"Testing get_quotes for {symbol} from {start} to {end}") +# quotes = client.get_quotes(symbol, start, end, limit=10) +# # print("Quotes:", quotes) +# quotes_records = normalize_records(quotes.data[symbol]) +# quotes_df = pd.DataFrame(quotes_records) +# quotes_df.to_parquet(os.path.join(output_dir, f"{symbol}_quotes.parquet"), index=False) diff --git a/src/fetchers/alpaca_api/clients/options.py b/src/fetchers/alpaca_api/clients/options.py new file mode 100644 index 0000000000000000000000000000000000000000..acdd3ef2bcfad85fdefb0700efb50d191450b4b5 --- /dev/null +++ b/src/fetchers/alpaca_api/clients/options.py @@ -0,0 +1,72 @@ +# alpaca/clients/options.py + +from datetime import datetime +from typing import Optional, Union +import re +from alpaca.data.historical import OptionHistoricalDataClient +from alpaca.data.requests import ( + OptionBarsRequest, + OptionTradesRequest, +) +from alpaca.data.timeframe import TimeFrame, TimeFrameUnit +from ..config import settings + +class OptionsClient: + def __init__(self): + self.client = OptionHistoricalDataClient( + api_key=settings.ALPACA_API_KEY, + secret_key=settings.ALPACA_API_SECRET, + ) + + def get_bars( + self, + symbol: str, + timeframe: Union[str, TimeFrame], + start: datetime, + end: datetime, + limit: int = 1000, + sort: Optional[str] = None, + ): + """ + Fetch historical OHLCV bars for a given option contract. + Accepts either a TimeFrame enum or a string like "1Day", "5Minute", etc. 
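+
+        Illustrative example (OCC-style contract symbol assumed):
+            bars = OptionsClient().get_bars("AAPL250718C00150000", "1Day", start, end)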
+ """ + if isinstance(timeframe, str): + m = re.fullmatch(r"(\d+)([A-Za-z]+)", timeframe) + if not m: + raise ValueError(f"Invalid timeframe format: {timeframe!r}") + amount, unit_str = m.groups() + unit_key = unit_str.capitalize().rstrip("s") + unit = TimeFrameUnit[unit_key] + timeframe = TimeFrame(int(amount), unit) + req = OptionBarsRequest( + symbol_or_symbols=symbol, + timeframe=timeframe, + start=start, + end=end, + limit=limit, + sort=sort, + ) + return self.client.get_option_bars(req) + # ↳ uses OptionBarsRequest(symbol_or_symbols, timeframe, start, end, limit, sort) :contentReference[oaicite:0]{index=0} + + def get_trades( + self, + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: Optional[str] = None, + ): + """ + Fetch historical trade ticks for a given option contract. + """ + req = OptionTradesRequest( + symbol_or_symbols=symbol, + start=start, + end=end, + limit=limit, + sort=sort, + ) + return self.client.get_option_trades(req) + # ↳ uses OptionTradesRequest(symbol_or_symbols, start, end, limit, sort) :contentReference[oaicite:1]{index=1} diff --git a/src/fetchers/alpaca_api/clients/stocks.py b/src/fetchers/alpaca_api/clients/stocks.py new file mode 100644 index 0000000000000000000000000000000000000000..f97b3e034e3710f61d0dbc546554a1488f7799db --- /dev/null +++ b/src/fetchers/alpaca_api/clients/stocks.py @@ -0,0 +1,90 @@ +# alpaca_api/clients/stocks.py + +from datetime import datetime +import re +from alpaca.data.historical import StockHistoricalDataClient +from alpaca.data.timeframe import TimeFrame, TimeFrameUnit +from alpaca.data.requests import StockBarsRequest, StockTradesRequest, StockQuotesRequest, DataFeed +import sys, os +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) +from alpaca_api.config import settings + +class StocksClient: + def __init__(self): + self.client = StockHistoricalDataClient( + api_key=settings.ALPACA_API_KEY, + secret_key=settings.ALPACA_API_SECRET, + ) + + def get_bars( + self, + symbol: str, + timeframe: str | TimeFrame, + start: datetime, + end: datetime, + limit: int = 1000, + ): + """ + Fetch historical OHLCV bars for a given stock. + Accepts either a TimeFrame enum or a string like "1Day", "5Minute", etc. + """ + if isinstance(timeframe, str): + m = re.fullmatch(r"(\d+)([A-Za-z]+)", timeframe) + if not m: + raise ValueError(f"Invalid timeframe format: {timeframe!r}") + amount_str, unit_str = m.groups() + # Normalize unit name to match TimeFrameUnit keys (Minute, Hour, Day, Week, Month) + unit_key = unit_str.capitalize().rstrip("s") + unit = TimeFrameUnit[unit_key] + timeframe = TimeFrame(int(amount_str), unit) + # Now we have a proper TimeFrame instance + req = StockBarsRequest( + symbol_or_symbols=symbol, + timeframe=timeframe, + start=start, + end=end, + limit=limit, + feed=DataFeed.IEX, # use IEX for free delayed data + ) + return self.client.get_stock_bars(req) + # ↳ requires StockBarsRequest(symbol_or_symbols, timeframe, start, end, limit) :contentReference[oaicite:0]{index=0} + + def get_trades( + self, + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + ): + """ + Fetch historical trade ticks for a given stock. 
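+
+        Illustrative example (assumes start/end are datetimes):
+            trades = StocksClient().get_trades("AAPL", start, end, limit=500)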
+ """ + req = StockTradesRequest( + symbol_or_symbols=symbol, + start=start, + end=end, + limit=limit, + feed=DataFeed.IEX, # use IEX for free delayed trade data + ) + return self.client.get_stock_trades(req) + # ↳ takes symbol_or_symbols, start, end, limit :contentReference[oaicite:1]{index=1} + + def get_quotes( + self, + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + ): + """ + Fetch historical Level-1 quotes (bid/ask) for a given stock. + """ + req = StockQuotesRequest( + symbol_or_symbols=symbol, + start=start, + end=end, + limit=limit, + feed=DataFeed.IEX, # use IEX for free delayed quote data + ) + return self.client.get_stock_quotes(req) + # ↳ takes symbol_or_symbols, start, end, limit :contentReference[oaicite:2]{index=2} diff --git a/src/fetchers/alpaca_api/config.py b/src/fetchers/alpaca_api/config.py new file mode 100644 index 0000000000000000000000000000000000000000..21658cec2f15140699b4ee6e8f65f3b67a48fc04 --- /dev/null +++ b/src/fetchers/alpaca_api/config.py @@ -0,0 +1,17 @@ +# alpaca/config.py + +from pydantic_settings import BaseSettings, SettingsConfigDict + +class Settings(BaseSettings): + ALPACA_API_KEY: str + ALPACA_API_SECRET: str + ALPACA_BASE_URL: str = "https://paper-api.alpaca.markets/v2" + PAPER: bool = True + + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + extra="ignore", # allow all other .env keys without error + ) + +settings = Settings() diff --git a/src/fetchers/alpaca_api/fetchers/__init__.py b/src/fetchers/alpaca_api/fetchers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b974a4828d7382e8c2b7b7222005e652f1bfc974 --- /dev/null +++ b/src/fetchers/alpaca_api/fetchers/__init__.py @@ -0,0 +1,15 @@ +# alpaca/fetchers/__init__.py + +from .bars import fetch_stock_bars, fetch_crypto_bars, fetch_option_bars +from .trades import fetch_stock_trades, fetch_crypto_trades +from .quotes import fetch_stock_quotes, fetch_crypto_quotes + +__all__ = [ + "fetch_stock_bars", + "fetch_crypto_bars", + "fetch_option_bars", + "fetch_stock_trades", + "fetch_crypto_trades", + "fetch_stock_quotes", + "fetch_crypto_quotes", +] diff --git a/src/fetchers/alpaca_api/fetchers/bars.py b/src/fetchers/alpaca_api/fetchers/bars.py new file mode 100644 index 0000000000000000000000000000000000000000..cdae456766c66e26ab30bc2d2bae21e32ddaa844 --- /dev/null +++ b/src/fetchers/alpaca_api/fetchers/bars.py @@ -0,0 +1,58 @@ +# alpaca/fetchers/bars.py + +from datetime import datetime +from ..clients.stocks import StocksClient +from ..clients.crypto import CryptoClient +from ..clients.options import OptionsClient +from ..utils import backoff, logger + +# instantiate once +stocks_client = StocksClient() +crypto_client = CryptoClient() +options_client = OptionsClient() + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_stock_bars( + symbol: str, + start: datetime, + end: datetime, + timeframe: str, + limit: int = 1000, +): + """ + Fetch OHLCV bars for a stock, with retry/back-off and logging. + """ + logger.info(f"Fetching stock bars: {symbol=} {timeframe=} {start=} to {end=} limit={limit}") + return stocks_client.get_bars(symbol, timeframe, start, end, limit) + + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_crypto_bars( + symbol: str, + start: datetime, + end: datetime, + timeframe: str, + limit: int = 1000, + feed: str | None = None, +): + """ + Fetch OHLCV bars for a crypto, with retry/back-off and logging. 
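+
+    Illustrative example (note the symbol/start/end/timeframe argument order):
+        bars = fetch_crypto_bars("BTC/USD", start, end, "1Day", limit=1000)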
+ """ + logger.info(f"Fetching crypto bars: {symbol=} {timeframe=} {start=} to {end=} limit={limit} feed={feed}") + return crypto_client.get_bars(symbol, timeframe, start, end, limit, feed) + + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_option_bars( + symbol: str, + start: datetime, + end: datetime, + timeframe: str, + limit: int = 1000, + sort: str | None = None, +): + """ + Fetch OHLCV bars for an option contract, with retry/back-off and logging. + """ + logger.info(f"Fetching option bars: {symbol=} {timeframe=} {start=} to {end=} limit={limit} sort={sort}") + return options_client.get_bars(symbol, timeframe, start, end, limit, sort) diff --git a/src/fetchers/alpaca_api/fetchers/quotes.py b/src/fetchers/alpaca_api/fetchers/quotes.py new file mode 100644 index 0000000000000000000000000000000000000000..89af096c777550586bc958b128e54139b590714b --- /dev/null +++ b/src/fetchers/alpaca_api/fetchers/quotes.py @@ -0,0 +1,40 @@ +# alpaca/fetchers/quotes.py + +from datetime import datetime +from ..clients.stocks import StocksClient +from ..clients.crypto import CryptoClient +from ..utils import backoff, logger + +# instantiate clients once +stocks_client = StocksClient() +crypto_client = CryptoClient() + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_stock_quotes( + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: str | None = None, +): + """ + Fetch historical Level-1 quotes (bid/ask) for a stock, with retry/back-off and logging. + """ + logger.info(f"Fetching stock quotes: symbol={symbol}, start={start}, end={end}, limit={limit}, sort={sort}") + return stocks_client.get_quotes(symbol, start, end, limit) + # ↳ uses StockQuotesRequest(symbol_or_symbols, start, end, limit) :contentReference[oaicite:0]{index=0} + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_crypto_quotes( + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: str | None = None, +): + """ + Fetch historical Level-1 quotes for a crypto symbol, with retry/back-off and logging. + """ + logger.info(f"Fetching crypto quotes: symbol={symbol}, start={start}, end={end}, limit={limit}, sort={sort}") + return crypto_client.get_quotes(symbol, start, end, limit) + # ↳ uses CryptoQuoteRequest(symbol_or_symbols, start, end, limit, sort) :contentReference[oaicite:1]{index=1} diff --git a/src/fetchers/alpaca_api/fetchers/trades.py b/src/fetchers/alpaca_api/fetchers/trades.py new file mode 100644 index 0000000000000000000000000000000000000000..060ab1d798530c007b1a1082a9e750caca57706e --- /dev/null +++ b/src/fetchers/alpaca_api/fetchers/trades.py @@ -0,0 +1,38 @@ +# alpaca/fetchers/trades.py + +from datetime import datetime +from ..clients.stocks import StocksClient +from ..clients.crypto import CryptoClient +from ..utils import backoff, logger + +# instantiate clients once +stocks_client = StocksClient() +crypto_client = CryptoClient() + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_stock_trades( + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: str | None = None, +): + """ + Fetch historical trade ticks for a stock, with retry/back-off and logging. 
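+
+    Illustrative example (assumes start/end are datetimes):
+        trades = fetch_stock_trades("AAPL", start, end, limit=1000)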
+ """ + logger.info(f"Fetching stock trades: symbol={symbol}, start={start}, end={end}, limit={limit}, sort={sort}") + return stocks_client.get_trades(symbol, start, end, limit) + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_crypto_trades( + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: str | None = None, +): + """ + Fetch historical trade ticks for a crypto symbol, with retry/back-off and logging. + """ + logger.info(f"Fetching crypto trades: symbol={symbol}, start={start}, end={end}, limit={limit}, sort={sort}") + return crypto_client.get_trades(symbol, start, end, limit) diff --git a/src/fetchers/alpaca_api/main.py b/src/fetchers/alpaca_api/main.py new file mode 100644 index 0000000000000000000000000000000000000000..5ef5e0592065c30d27e2428097ca895589f6410d --- /dev/null +++ b/src/fetchers/alpaca_api/main.py @@ -0,0 +1,193 @@ +def normalize_crypto_symbol(sym: str) -> str: + return sym if "/" in sym else f"{sym}/USD" +import os +import sys +from datetime import datetime, timedelta + +import pandas as pd + + +# Add src/fetchers to sys.path for direct execution +base = os.path.dirname(__file__) +src_fetchers = os.path.abspath(os.path.join(base, "..")) +sys.path.insert(0, src_fetchers) + +from alpaca_api.fetchers import ( + fetch_stock_bars, + fetch_stock_trades, + fetch_stock_quotes, + fetch_crypto_bars, + fetch_crypto_trades, + fetch_option_bars, +) +from alpaca_api.config import settings + +def normalize_records(records): + """Convert Pydantic models to ISO-format dicts.""" + dicts = [rec.model_dump() for rec in records] + for rec in dicts: + for k, v in rec.items(): + if hasattr(v, "isoformat"): + rec[k] = v.isoformat() + return dicts + +def save_df(df: pd.DataFrame, fname: str): + out = os.path.join("data", "alpaca", fname) + os.makedirs(os.path.dirname(out), exist_ok=True) + + # Check if file exists and implement incremental loading + if os.path.exists(out): + try: + existing_df = pd.read_parquet(out) + print(f"-> existing data has {len(existing_df)} records") + + # Combine and remove duplicates based on timestamp and symbol + combined_df = pd.concat([existing_df, df], ignore_index=True) + + # Remove duplicates keeping the latest record + if 'timestamp' in combined_df.columns and 'symbol' in combined_df.columns: + combined_df = combined_df.drop_duplicates(subset=['timestamp', 'symbol'], keep='last') + elif 'timestamp' in combined_df.columns: + combined_df = combined_df.drop_duplicates(subset=['timestamp'], keep='last') + + # Sort by timestamp for consistency + if 'timestamp' in combined_df.columns: + combined_df = combined_df.sort_values('timestamp') + + combined_df.to_parquet(out, index=False) + print(f"-> updated {out} with {len(combined_df)} total records ({len(df)} new)") + except Exception as e: + print(f"-> error merging with existing data: {e}, overwriting") + df.to_parquet(out, index=False) + print(f"-> wrote {out} with {len(df)} records") + else: + df.to_parquet(out, index=False) + print(f"-> wrote {out} with {len(df)} records") + +def main(): + # you can also read these from os.getenv or settings if you prefer + stock_symbols = ["AAPL", "TSLA", "GOOGL", "MSFT", "NVDA", "COIN"] # Added COIN + crypto_symbols = ["BTC", "ETH", "SOL", "ADA", "XRP"] + # option symbols use the Alpaca format: "___" + # option_symbols = ["AAPL_20250718_150_C", "TSLA_20250718_700_P"] + + def normalize_option_symbol(sym: str) -> str: + # expects “UNDERLYING_YYYYMMDD_STRIKE_C” or “P” + underlying, ymd, strike, cp = sym.split("_") + yymmdd = ymd[2:] # 
“20250718” → “250718” + amt = int(float(strike) * 1000) + strike_str = f"{amt:08d}" + return f"{underlying}{yymmdd}{cp}{strike_str}" + days = "1Day" + + end = datetime.utcnow() + + # Check for existing data to determine start date + def get_start_date_for_symbol(symbol, data_type="bars"): + fname = f"{symbol}_{data_type}.parquet" + out = os.path.join("data", "alpaca", fname) + + if os.path.exists(out): + try: + existing_df = pd.read_parquet(out) + if not existing_df.empty and 'timestamp' in existing_df.columns: + # Get the latest timestamp and add 1 day to avoid duplicates + latest_timestamp = pd.to_datetime(existing_df['timestamp'].max()) + start_from_latest = latest_timestamp + timedelta(days=1) + + # Don't go back more than 30 days from now to limit data size + max_lookback = end - timedelta(days=30) + start_date = max(start_from_latest, max_lookback) + + print(f"-> {symbol} {data_type}: continuing from {start_date}") + return start_date + except Exception as e: + print(f"-> error reading existing {fname}: {e}") + + # Default: get last 30 days for new symbols + default_start = end - timedelta(days=30) + print(f"-> {symbol} {data_type}: starting fresh from {default_start}") + return default_start + + # STOCKS: bars, trades, quotes + for sym in stock_symbols: + print(f"\nFetching stock data for {sym}:") + + # Get appropriate start dates for each data type + start_bars = get_start_date_for_symbol(sym, "bars") + start_trades = get_start_date_for_symbol(sym, "trades") + start_quotes = get_start_date_for_symbol(sym, "quotes") + + # Only fetch if there's a meaningful time range + if start_bars < end: + bars = fetch_stock_bars(sym, start_bars, end, days, limit=1000) # Increased limit + save_df(pd.DataFrame(normalize_records(bars.data[sym])), f"{sym}_bars.parquet") + else: + print(f"-> {sym} bars: no new data to fetch") + + if start_trades < end: + trades = fetch_stock_trades(sym, start_trades, end, limit=1000) # Increased limit + save_df(pd.DataFrame(normalize_records(trades.data[sym])), f"{sym}_trades.parquet") + else: + print(f"-> {sym} trades: no new data to fetch") + + if start_quotes < end: + quotes = fetch_stock_quotes(sym, start_quotes, end, limit=1000) # Increased limit + save_df(pd.DataFrame(normalize_records(quotes.data[sym])), f"{sym}_quotes.parquet") + else: + print(f"-> {sym} quotes: no new data to fetch") + + # CRYPTO: bars, trades + for sym in crypto_symbols: + pair = normalize_crypto_symbol(sym) + print(f"\nFetching crypto data for {pair}:") + try: + # Get appropriate start dates for crypto data + start_bars = get_start_date_for_symbol(pair.replace('/', '_'), "bars") + start_trades = get_start_date_for_symbol(pair.replace('/', '_'), "trades") + + # Only fetch if there's a meaningful time range + bar_records = [] + trade_records = [] + + if start_bars < end: + bars = fetch_crypto_bars(pair, start_bars, end, days, limit=1000) # Increased limit + bar_records = bars.data.get(pair, []) + else: + print(f"-> {pair} bars: no new data to fetch") + + if start_trades < end: + trades = fetch_crypto_trades(pair, start_trades, end, limit=1000) # Increased limit + trade_records = trades.data.get(pair, []) + else: + print(f"-> {pair} trades: no new data to fetch") + + if bar_records: + save_df( + pd.DataFrame(normalize_records(bar_records)), + f"{pair.replace('/', '_')}_bars.parquet", + ) + else: + print(f"-> no bar data for {pair}, skipping") + + if trade_records: + save_df( + pd.DataFrame(normalize_records(trade_records)), + f"{pair.replace('/', '_')}_trades.parquet", + ) + else: + print(f"-> 
no trade data for {pair}, skipping") + + except Exception as e: + print(f"⚠️ error fetching {pair}: {e!r}, skipping") + continue + + # # OPTIONS: bars only + # for sym in option_symbols: + # occ = normalize_option_symbol(sym) + # print(f"\nFetching option bars for {occ}:") + # bars = fetch_option_bars(occ, start, end, days, limit=10) + # save_df(pd.DataFrame(normalize_records(bars.data[occ])), f"{occ}_bars.parquet") + +if __name__ == "__main__": + main() diff --git a/src/fetchers/alpaca_api/merge/alpaca_features.py b/src/fetchers/alpaca_api/merge/alpaca_features.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/fetchers/alpaca_api/utils.py b/src/fetchers/alpaca_api/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..632ff0a59a019048f88c792ac435c166a98d8361 --- /dev/null +++ b/src/fetchers/alpaca_api/utils.py @@ -0,0 +1,83 @@ +# alpaca/utils.py + +import time +import functools +import logging +from datetime import datetime, timezone +from typing import Callable, Type, Tuple, Any + +# ----------------------------- +# Structured logger +# ----------------------------- +logger = logging.getLogger("alpaca") +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s [%(levelname)s] %(name)s: %(message)s", + datefmt="%Y-%m-%dT%H:%M:%S%z", +) +handler.setFormatter(formatter) +if not logger.handlers: + logger.addHandler(handler) + + +# ----------------------------- +# Exponential back-off decorator +# ----------------------------- +def backoff( + max_retries: int = 5, + base_delay: float = 1.0, + factor: float = 2.0, + exceptions: Tuple[Type[BaseException], ...] = (Exception,), +) -> Callable: + """ + Decorator to retry a function with exponential back-off upon specified exceptions. + + :param max_retries: maximum number of retries before giving up + :param base_delay: initial delay between retries (in seconds) + :param factor: multiplier for delay on each retry + :param exceptions: tuple of exception classes that should trigger a retry + """ + def decorator(func: Callable) -> Callable: + @functools.wraps(func) + def wrapper(*args: Any, **kwargs: Any) -> Any: + retries = 0 + delay = base_delay + while True: + try: + return func(*args, **kwargs) + except exceptions as e: + if retries >= max_retries: + logger.error( + f"{func.__name__}: exceeded {max_retries} retries – giving up: {e}" + ) + raise + logger.warning( + f"{func.__name__}: error {e!r}, retrying in {delay:.1f}s " + f"(retry {retries + 1}/{max_retries})" + ) + time.sleep(delay) + retries += 1 + delay *= factor + return wrapper + return decorator + + +# ----------------------------- +# Time helpers +# ----------------------------- +def to_rfc3339(dt: datetime) -> str: + """ + Convert a datetime to an RFC 3339–formatted string. + If no tzinfo is present, UTC is assumed. + """ + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.isoformat() + + +def parse_rfc3339(timestamp: str) -> datetime: + """ + Parse an RFC 3339–formatted string into a datetime. 
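+
+    Illustrative example:
+        parse_rfc3339("2025-07-11T12:00:00+00:00")
+    Note: a trailing "Z" is only accepted by datetime.fromisoformat on Python 3.11+.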
+ """ + return datetime.fromisoformat(timestamp) diff --git a/src/fetchers/coindesk_client/asset_metadata.py b/src/fetchers/coindesk_client/asset_metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..92c7d3c54edcf765235d7bfb49c419dcd1ff561d --- /dev/null +++ b/src/fetchers/coindesk_client/asset_metadata.py @@ -0,0 +1,26 @@ +""" +asset_metadata.py – Asset metadata endpoints for CoinDesk API client. + +- list_assets(): List all supported assets with basic metadata. +- get_asset_details(symbol): Fetch detailed metadata for a specific asset. +""" + +from client import BaseClient + +class AssetMetadataClient(BaseClient): + def list_assets(self): + """ + Get a list of all supported assets and their basic metadata. + + :return: JSON response containing assets list. + """ + return self._get("assets") + + def get_asset_details(self, symbol): + """ + Get detailed metadata for a specific asset. + + :param symbol: Asset symbol, e.g., "BTC" or "ETH". + :return: JSON response with asset details. + """ + return self._get(f"assets/{symbol}") diff --git a/src/fetchers/coindesk_client/client.py b/src/fetchers/coindesk_client/client.py new file mode 100644 index 0000000000000000000000000000000000000000..d5df5d4029a72d7da2b3f28fe5a27b31e5657ce1 --- /dev/null +++ b/src/fetchers/coindesk_client/client.py @@ -0,0 +1,218 @@ +""" +client.py – Base HTTP client for CoinDesk API. + +This module provides the BaseClient class that handles HTTP requests +to the CoinDesk API with proper authentication and error handling. +""" + +import requests +import json +from typing import Dict, Any, Optional +from urllib.parse import urljoin, urlencode +import config + + +class APIError(Exception): + """Custom exception for API errors.""" + def __init__(self, message: str, status_code: int = None, response: Any = None): + self.message = message + self.status_code = status_code + self.response = response + super().__init__(self.message) + + +class BaseClient: + """ + Base HTTP client for CoinDesk API requests. + + Handles authentication, request formatting, and error handling. + """ + + def __init__(self, base_url: str = None, headers: Dict[str, str] = None): + """ + Initialize the base client. + + Args: + base_url: Base URL for the API (defaults to config.BASE_URL) + headers: Default headers (defaults to config.HEADERS) + """ + self.base_url = base_url or config.BASE_URL + self.headers = headers or config.HEADERS.copy() + self.session = requests.Session() + self.session.headers.update(self.headers) + + def _make_request(self, method: str, endpoint: str, params: Dict[str, Any] = None, + data: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]: + """ + Make an HTTP request to the API. 
+ + Args: + method: HTTP method (GET, POST, PUT, DELETE) + endpoint: API endpoint path + params: URL parameters + data: Request body data + **kwargs: Additional arguments for requests + + Returns: + dict: JSON response from the API + + Raises: + APIError: If the request fails or returns an error status + """ + # Construct full URL + url = urljoin(self.base_url, endpoint.lstrip('/')) + + # Clean up parameters (remove None values) + if params: + params = {k: v for k, v in params.items() if v is not None} + + try: + # Make the request + response = self.session.request( + method=method, + url=url, + params=params, + json=data, + **kwargs + ) + + # Log the request for debugging + print(f"[DEBUG] {method} {url}") + if params: + print(f"[DEBUG] Params: {params}") + print(f"[DEBUG] Status: {response.status_code}") + + # Check if request was successful + if response.status_code == 200: + try: + return response.json() + except json.JSONDecodeError: + # If response is not JSON, return the text + return {"data": response.text, "status": "success"} + else: + # Handle different error status codes + error_message = f"API request failed with status {response.status_code}" + + try: + error_data = response.json() + if 'error' in error_data: + error_message = error_data['error'] + elif 'message' in error_data: + error_message = error_data['message'] + except json.JSONDecodeError: + error_message = f"{error_message}: {response.text}" + + raise APIError( + message=error_message, + status_code=response.status_code, + response=response + ) + + except requests.exceptions.RequestException as e: + raise APIError(f"Request failed: {str(e)}") + + def get(self, endpoint: str, params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]: + """ + Make a GET request. + + Args: + endpoint: API endpoint path + params: URL parameters + **kwargs: Additional arguments for requests + + Returns: + dict: JSON response from the API + """ + return self._make_request('GET', endpoint, params=params, **kwargs) + + def post(self, endpoint: str, data: Dict[str, Any] = None, + params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]: + """ + Make a POST request. + + Args: + endpoint: API endpoint path + data: Request body data + params: URL parameters + **kwargs: Additional arguments for requests + + Returns: + dict: JSON response from the API + """ + return self._make_request('POST', endpoint, params=params, data=data, **kwargs) + + def put(self, endpoint: str, data: Dict[str, Any] = None, + params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]: + """ + Make a PUT request. + + Args: + endpoint: API endpoint path + data: Request body data + params: URL parameters + **kwargs: Additional arguments for requests + + Returns: + dict: JSON response from the API + """ + return self._make_request('PUT', endpoint, params=params, data=data, **kwargs) + + def delete(self, endpoint: str, params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]: + """ + Make a DELETE request. 
+
+        Args:
+            endpoint: API endpoint path
+            params: URL parameters
+            **kwargs: Additional arguments for requests
+
+        Returns:
+            dict: JSON response from the API
+        """
+        return self._make_request('DELETE', endpoint, params=params, **kwargs)
+
+    # Compatibility alias: the endpoint modules (asset_metadata.py, derivatives.py,
+    # marketcap.py, onchain.py) call self._get(...); keep a thin wrapper around
+    # get() so those clients work against this BaseClient.
+    def _get(self, endpoint: str, params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]:
+        """Alias for get(), used by the endpoint client modules."""
+        return self.get(endpoint, params=params, **kwargs)
+
+    def close(self):
+        """Close the HTTP session."""
+        self.session.close()
+
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit."""
+        self.close()
+
+
+# Convenience function to create a client instance
+def create_client(base_url: str = None, headers: Dict[str, str] = None) -> BaseClient:
+    """
+    Create a new BaseClient instance.
+
+    Args:
+        base_url: Base URL for the API
+        headers: Default headers
+
+    Returns:
+        BaseClient: Configured client instance
+    """
+    return BaseClient(base_url=base_url, headers=headers)
+
+
+# Test function to verify the client works
+def test_client():
+    """Test the base client functionality."""
+    try:
+        with create_client() as client:
+            # Test a simple endpoint (you might need to adjust this based on your API)
+            response = client.get("/index/cc/v1/markets")
+            print("Client test successful!")
+            print(f"Response keys: {list(response.keys()) if isinstance(response, dict) else 'Not a dict'}")
+            return True
+    except Exception as e:
+        print(f"Client test failed: {e}")
+        return False
+
+
+if __name__ == "__main__":
+    test_client()
\ No newline at end of file
diff --git a/src/fetchers/coindesk_client/coindesk_utils.py b/src/fetchers/coindesk_client/coindesk_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6543eb4da1da75b7a0d49ce19c6db462673838b4
--- /dev/null
+++ b/src/fetchers/coindesk_client/coindesk_utils.py
@@ -0,0 +1,49 @@
+"""
+coindesk_utils.py – Utilities for saving, merging, and managing CoinDesk data as Parquet using StorageHandler.
+
+Features:
+- save_and_merge_parquet: Save new data, merge with existing Parquet, dedupe by date, keep N days.
+"""
+import os
+import pandas as pd
+from datetime import datetime, timedelta
+from src.data_cloud.cloud_utils import StorageHandler
+
+
+def save_and_merge_parquet(
+    storage: StorageHandler,
+    key: str,
+    new_data: pd.DataFrame,
+    date_col: str = "timestamp",
+    days: int = 7,
+    content_type: str = "application/octet-stream",
+):
+    """
+    Save new_data as Parquet, merging with existing file by date_col, keeping only the last N days.
+ - storage: StorageHandler instance + - key: storage key (e.g., 'coindesk/spot_markets.parquet') + - new_data: DataFrame to save + - date_col: column to use for date filtering (must be datetime-like) + - days: keep only this many days of data + - content_type: MIME type for Parquet + """ + # Try to load existing data + try: + existing_bytes = storage.download(key) + df_old = pd.read_parquet(pd.io.common.BytesIO(existing_bytes)) + except Exception: + df_old = pd.DataFrame() + + # Combine and dedupe + df_all = pd.concat([df_old, new_data], ignore_index=True) + if date_col in df_all.columns: + df_all[date_col] = pd.to_datetime(df_all[date_col], errors="coerce") + cutoff = datetime.utcnow() - timedelta(days=days) + df_all = df_all[df_all[date_col] >= cutoff] + df_all = df_all.sort_values(date_col).drop_duplicates() + + # Save merged Parquet + buf = pd.io.common.BytesIO() + df_all.to_parquet(buf, index=False) + storage.upload(key, buf.getvalue(), content_type=content_type) + return df_all diff --git a/src/fetchers/coindesk_client/config.py b/src/fetchers/coindesk_client/config.py new file mode 100644 index 0000000000000000000000000000000000000000..e77b522e5180ecd9efeb3c9f9de5a8d7c4fa2c11 --- /dev/null +++ b/src/fetchers/coindesk_client/config.py @@ -0,0 +1,30 @@ +""" +config.py – Configuration and secrets for CoinDesk API client. + +- Defines API_KEY, BASE_URL, and optional TIMEZONE constants +- Loads environment variables securely (e.g., via python-dotenv) +- Configures default headers (Authorization, Content-Type) +""" + +import os +from dotenv import load_dotenv + +load_dotenv() + +API_KEY = os.getenv("COINDESK_API_KEY") +BASE_URL = os.getenv("COINDESK_BASE_URL", "https://data-api.coindesk.com/").rstrip('/') +TIMEZONE = os.getenv("COINDESK_TIMEZONE", "UTC") + +# Flexible parameters for data collection +MARKET = os.getenv("COINDESK_MARKET", "binance") +SYMBOL = os.getenv("COINDESK_SYMBOL", "BTC-USD") +INSTRUMENTS = os.getenv("COINDESK_INSTRUMENTS", "BTC-USD").split(",") +DAYS = int(os.getenv("COINDESK_DAYS_OLD", 7)) +FUTURES_LIMIT = int(os.getenv("COINDESK_FUTURES_LIMIT", 50)) +SENTIMENT_LIMIT = int(os.getenv("COINDESK_SENTIMENT_LIMIT", 50)) +BLOCK_NUMBER = int(os.getenv("COINDESK_BLOCK_NUMBER", 100000)) + +HEADERS = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json" +} diff --git a/src/fetchers/coindesk_client/d.txt b/src/fetchers/coindesk_client/d.txt new file mode 100644 index 0000000000000000000000000000000000000000..98ff7af86c9fc478c85d0a4174e1a2a42dd0adda --- /dev/null +++ b/src/fetchers/coindesk_client/d.txt @@ -0,0 +1,12 @@ +Latest Tick:/index/cc/v1/latest/tick?market=cadli&instruments=BTC-USD,ETH-USD&apply_mapping=true +Historical OHLCV+:/index/cc/v1/historical/days?market=cadli&instrument=BTC-USD&limit=30&aggregate=1&fill=true&apply_mapping=true&response_format=JSON +DA Fixings:/index/cc/v1/historical/days/ccda?instrument=BTC-USD&timezone=Europe/London&date=2023-10-30&close_time=16:00&limit=5&response_format=JSON +Index Updates:/index/cc/v2/historical/messages/hour?market=cadli&instrument=BTC-USD&hour_ts=1701176400&apply_mapping=true&response_format=JSON +Index Composition:/index/cc/v1/historical/days/composition?market=cd_mc&instrument=CD20-USD&timezone=Europe/London&date=2025-05-09&close_time=16:00&limit=5&response_format=JSON +Instrument Metadata:/index/cc/v1/latest/instrument/metadata?market=cadli&instruments=BTC-USD,ETH-USD&apply_mapping=true +Markets:/index/cc/v1/markets?market=cadli +Markets + 
Instruments:/index/cc/v1/markets/instruments?market=cadli&instruments=BTC-USD,ETH-USD&instrument_status=ACTIVE +Forex Rates: /index/cc/v1/latest/tick/forex?instruments=GBP-USD,MYR-USD +EOD Markets + Instruments: /index/cc/v1/markets/instruments/unmapped/eod?market=cdifti&instruments=BTIUSF-USD&instrument_status=ACTIVE +EOD Historical OHLCV+ Day:/index/cc/v1/historical/days/eod?market=cdifti&instrument=BTIUSF-USD&limit=5&response_format=JSON +Index Reconstitution: /index/cc/v1/reconstitution?market=cd_mc&instrument=CD20-USD \ No newline at end of file diff --git a/src/fetchers/coindesk_client/derivatives.py b/src/fetchers/coindesk_client/derivatives.py new file mode 100644 index 0000000000000000000000000000000000000000..2a7f7ffc7ffe0d15153eae68d769856c647104b8 --- /dev/null +++ b/src/fetchers/coindesk_client/derivatives.py @@ -0,0 +1,68 @@ +""" +derivatives.py – Derivatives endpoints for CoinDesk API client. + +- list_markets(): List all available derivatives markets. +- get_latest_futures(symbol=None): Fetch the latest futures data, optionally for a symbol. +- get_futures_historical(days, limit=None): Retrieve futures historical data over N days. +- list_options(symbol=None): List available options or option chain for a given asset. +- get_options_historical(symbol, start, end=None, limit=None): Fetch options historical data over a timeframe. +""" + +from client import BaseClient + +class DerivativesClient(BaseClient): + def list_markets(self): + """ + List all available derivatives markets. + """ + return self._get("derivatives/markets") + + def get_latest_futures(self, symbol=None): + """ + Get the most recent futures data. If `symbol` is provided, returns data for that symbol. + + :param symbol: Futures symbol, e.g., "BTC-USD" (optional). + """ + path = "derivatives/futures" + if symbol: + path += f"/{symbol}" + return self._get(path) + + def get_futures_historical(self, days, limit=None): + """ + Fetch historical futures data for the past `days` days. + + :param days: Number of days of history to retrieve. + :param limit: Maximum number of records to return (optional). + """ + params = {"days": days} + if limit is not None: + params["limit"] = limit + return self._get("derivatives/futures/historical", params=params) + + def list_options(self, symbol=None): + """ + List all available options or get the option chain for a symbol. + + :param symbol: Asset symbol for option chain, e.g., "BTC-USD" (optional). + """ + path = "derivatives/options" + if symbol: + path += f"/{symbol}" + return self._get(path) + + def get_options_historical(self, symbol, start, end=None, limit=None): + """ + Fetch historical options data for a symbol over a timeframe. + + :param symbol: Asset symbol, e.g., "BTC-USD". + :param start: ISO8601 start datetime string. + :param end: ISO8601 end datetime string (optional). + :param limit: Maximum number of records to return (optional). + """ + params = {"start": start} + if end: + params["end"] = end + if limit is not None: + params["limit"] = limit + return self._get(f"derivatives/options/{symbol}/historical", params=params) diff --git a/src/fetchers/coindesk_client/doc.txt b/src/fetchers/coindesk_client/doc.txt new file mode 100644 index 0000000000000000000000000000000000000000..b60272b10a7b4123222aac9dcea4d53edeeac522 --- /dev/null +++ b/src/fetchers/coindesk_client/doc.txt @@ -0,0 +1,122 @@ +Below is the complete sidebar navigation structure under Data API → Introduction, with each endpoint’s link text and URL path. 
+ +## Introduction + +* [Introduction](https://developers.coindesk.com/documentation/data-api/introduction) + +## Indices & Ref. Rates + +* [Latest Tick](https://developers.coindesk.com/documentation/data-api/index_cc_v1_latest_tick) +* [Historical OHLCV+](https://developers.coindesk.com/documentation/data-api/index_cc_v1_historical_days) +* [DA Fixings](https://developers.coindesk.com/documentation/data-api/index_cc_v1_historical_days_ccda) +* [Index Updates](https://developers.coindesk.com/documentation/data-api/index_cc_v2_historical_messages_hour) +* [Index Composition](https://developers.coindesk.com/documentation/data-api/index_cc_v1_historical_days_composition) +* [Instrument Metadata](https://developers.coindesk.com/documentation/data-api/index_cc_v1_latest_instrument_metadata) +* [Markets](https://developers.coindesk.com/documentation/data-api/index_cc_v1_markets) +* [Markets + Instruments](https://developers.coindesk.com/documentation/data-api/index_cc_v1_markets_instruments) +* [Forex Rates](https://developers.coindesk.com/documentation/data-api/index_cc_v1_latest_tick_forex) +* [EOD Markets + Instruments](https://developers.coindesk.com/documentation/data-api/index_cc_v1_markets_instruments_unmapped_eod) +* [EOD Historical OHLCV+ Day](https://developers.coindesk.com/documentation/data-api/index_cc_v1_historical_days_eod) +* [Index Reconstitution](https://developers.coindesk.com/documentation/data-api/index_v1_reconstitution) + +## Spot + +* [Latest Tick](https://developers.coindesk.com/documentation/data-api/spot_v1_latest_tick) +* [Historical OHLCV+](https://developers.coindesk.com/documentation/data-api/spot_v1_historical_days) +* [Trades](https://developers.coindesk.com/documentation/data-api/spot_v2_historical_trades_hour) +* [Order Book](https://developers.coindesk.com/documentation/data-api/spot_v1_historical_orderbook_l2_metrics_minute) +* [Instrument Metadata](https://developers.coindesk.com/documentation/data-api/spot_v1_latest_instrument_metadata) +* [Markets](https://developers.coindesk.com/documentation/data-api/spot_v1_markets) +* [Markets + Instruments](https://developers.coindesk.com/documentation/data-api/spot_v1_markets_instruments) + +## Futures + +* [Latest Tick](https://developers.coindesk.com/documentation/data-api/futures_v1_latest_tick) +* [Historical OHLCV+](https://developers.coindesk.com/documentation/data-api/futures_v1_historical_days) +* [Trades](https://developers.coindesk.com/documentation/data-api/futures_v2_historical_trades_hour) +* [Order Book](https://developers.coindesk.com/documentation/data-api/futures_v2_historical_orderbook_l2_metrics_minute) +* [Latest Tick (OI)](https://developers.coindesk.com/documentation/data-api/futures_v1_latest_open_interest_tick) +* [Historical OHLC+ (OI)](https://developers.coindesk.com/documentation/data-api/futures_v1_historical_open_interest_days) +* [Updates (OI)](https://developers.coindesk.com/documentation/data-api/futures_v2_historical_open_interest_messages_hour) +* [Latest Tick (FR)](https://developers.coindesk.com/documentation/data-api/futures_v1_latest_funding_rate_tick) +* [Historical OHLC+ (FR)](https://developers.coindesk.com/documentation/data-api/futures_v1_historical_funding_rate_days) +* [Updates (FR)](https://developers.coindesk.com/documentation/data-api/futures_v2_historical_funding_rate_messages_hour) +* [Instrument Metadata](https://developers.coindesk.com/documentation/data-api/futures_v1_latest_instrument_metadata) +* 
[Markets](https://developers.coindesk.com/documentation/data-api/futures_v1_markets) +* [Markets + Instruments](https://developers.coindesk.com/documentation/data-api/futures_v1_markets_instruments) + +## Options + +* [Latest Tick](https://developers.coindesk.com/documentation/data-api/options_v1_latest_tick) +* [Historical OHLCV+](https://developers.coindesk.com/documentation/data-api/options_v1_historical_days) +* [Trades](https://developers.coindesk.com/documentation/data-api/options_v2_historical_trades_hour) +* [Order Book](https://developers.coindesk.com/documentation/data-api/options_v1_historical_orderbook_l2_metrics_minute) +* [Latest Tick (OI)](https://developers.coindesk.com/documentation/data-api/options_v1_latest_open_interest_tick) +* [Historical OHLC+ (OI)](https://developers.coindesk.com/documentation/data-api/options_v1_historical_open_interest_days) +* [Updates (OI)](https://developers.coindesk.com/documentation/data-api/options_v2_historical_open_interest_messages_hour) +* [Instrument Metadata](https://developers.coindesk.com/documentation/data-api/options_v1_latest_instrument_metadata) +* [Markets](https://developers.coindesk.com/documentation/data-api/options_v1_markets) +* [Markets + Instruments](https://developers.coindesk.com/documentation/data-api/options_v1_markets_instruments) + +## Derivatives Indices + +* [Latest Tick](https://developers.coindesk.com/documentation/data-api/index_v1_latest_tick) +* [Historical OHLC+](https://developers.coindesk.com/documentation/data-api/index_v1_historical_days) +* [Index Updates](https://developers.coindesk.com/documentation/data-api/index_v2_historical_messages_hour) +* [Instrument Metadata](https://developers.coindesk.com/documentation/data-api/index_v1_latest_instrument_metadata) +* [Markets](https://developers.coindesk.com/documentation/data-api/index_v1_markets) +* [Markets + Instruments](https://developers.coindesk.com/documentation/data-api/index_v1_markets_instruments) + +## On-Chain DEX + +* [Latest Tick (Swap)](https://developers.coindesk.com/documentation/data-api/onchain_v1_amm_latest_swap_tick) +* [Historical OHLCV+ (Swap)](https://developers.coindesk.com/documentation/data-api/onchain_v1_amm_historical_swap_days) +* [Swaps](https://developers.coindesk.com/documentation/data-api/onchain_v2_amm_historical_swap_messages_hour) +* [Liquidity Updates](https://developers.coindesk.com/documentation/data-api/onchain_v2_amm_historical_liquidity_update_messages_hour) +* [Instrument Metadata](https://developers.coindesk.com/documentation/data-api/onchain_v1_amm_latest_instrument_metadata) +* [Markets](https://developers.coindesk.com/documentation/data-api/onchain_v1_amm_markets) +* [Markets + Instruments](https://developers.coindesk.com/documentation/data-api/onchain_v1_amm_markets_instruments) + +## On-Chain Core + +* [ETH Blocks](https://developers.coindesk.com/documentation/data-api/onchain_v1_block_2) +* [BSC Blocks](https://developers.coindesk.com/documentation/data-api/onchain_v1_block_8) +* [BTC Blocks](https://developers.coindesk.com/documentation/data-api/onchain_v1_block_1) +* [BASE Blocks](https://developers.coindesk.com/documentation/data-api/onchain_v1_block_2410) +* [ARB Blocks](https://developers.coindesk.com/documentation/data-api/onchain_v1_block_808) +* [ETH Address](https://developers.coindesk.com/documentation/data-api/onchain_v1_address_metadata_2) +* [Assets By Chain](https://developers.coindesk.com/documentation/data-api/onchain_v3_summary_by_chain) +* [Asset By 
Address](https://developers.coindesk.com/documentation/data-api/onchain_v2_data_by_address)
+* [Historical Supply](https://developers.coindesk.com/documentation/data-api/onchain_v2_historical_supply_days)
+
+## Asset
+
+* [Metadata](https://developers.coindesk.com/documentation/data-api/asset_v2_metadata)
+* [Top List](https://developers.coindesk.com/documentation/data-api/asset_v1_top_list)
+* [Search](https://developers.coindesk.com/documentation/data-api/asset_v1_search)
+* [Summary List](https://developers.coindesk.com/documentation/data-api/asset_v1_summary_list)
+* [Events](https://developers.coindesk.com/documentation/data-api/asset_v1_events)
+* [Historical Social](https://developers.coindesk.com/documentation/data-api/asset_v1_historical_code_repository_days)
+
+## News
+
+* [Latest Articles](https://developers.coindesk.com/documentation/data-api/news_v1_article_list)
+* [Sources](https://developers.coindesk.com/documentation/data-api/news_v1_source_list)
+* [Categories](https://developers.coindesk.com/documentation/data-api/news_v1_category_list)
+* [Single Article](https://developers.coindesk.com/documentation/data-api/news_v1_article_get)
+* [Search](https://developers.coindesk.com/documentation/data-api/news_v1_search)
+
+## Overview
+
+* [MktCap Latest Tick](https://developers.coindesk.com/documentation/data-api/overview_v1_latest_marketcap_all_tick)
+* [MktCap Historical OHLCV](https://developers.coindesk.com/documentation/data-api/overview_v1_historical_marketcap_all_assets_days)
+
+## Utilities
+
+* [Rate Limit Verification](https://developers.coindesk.com/documentation/data-api/admin_v2_rate_limit)
+* [Version](https://developers.coindesk.com/documentation/data-api/info_v1_version)
+* [OpenAPI](https://developers.coindesk.com/documentation/data-api/info_v1_openapi)
+
+## Deprecated
+
+*(toggle to view deprecated endpoints)*
diff --git a/src/fetchers/coindesk_client/index.py b/src/fetchers/coindesk_client/index.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ea695c28a4a0de0ac9162ebc3f90c2d0c761e4
--- /dev/null
+++ b/src/fetchers/coindesk_client/index.py
@@ -0,0 +1,552 @@
+"""
+Optimized CoinDesk API Client with Smart Market Discovery and Endpoint Compatibility
+Enhanced version with improved error handling and market validation
+"""
+
+import argparse
+import json
+import os
+from client import BaseClient, APIError
+from typing import Union, List, Optional, Dict, Tuple
+from datetime import datetime, timedelta
+import time
+from dataclasses import dataclass
+from enum import Enum
+import sys
+
+def safe_print(*args, **kwargs):
+    """Prints unicode safely even if the terminal encoding is not UTF-8."""
+    text = " ".join(str(arg) for arg in args)
+    try:
+        sys.stdout.buffer.write((text + '\n').encode('utf-8', errors='replace'))
+    except Exception:
+        # Fallback to plain print if all else fails
+        print(text.encode('ascii', errors='replace').decode('ascii'), **kwargs)
+
+import config
+
+import pathlib
+
+def ensure_data_dir():
+    """Ensure the data directory exists."""
+    data_dir = pathlib.Path("data/coindesk/indexes")
+    data_dir.mkdir(parents=True, exist_ok=True)
+    return data_dir
+
+def save_json_result(filename: str, data: dict):
+    """Save data as JSON to the data/coindesk/indexes directory."""
+    data_dir = ensure_data_dir()
+    file_path = data_dir / filename
+    with open(file_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+
+class EndpointStatus(Enum):
+    SUPPORTED = "supported"
+    UNSUPPORTED = "unsupported"
+    UNKNOWN
= "unknown" + +@dataclass +class MarketInfo: + """Market information with endpoint compatibility""" + market_id: str + name: str + endpoints: Dict[str, EndpointStatus] + instruments: List[str] + last_checked: datetime + +class IndexClient(BaseClient): + """ + Enhanced Index & Reference Rates endpoints for CoinDesk Data API. + Includes smart market discovery and endpoint compatibility checking. + """ + + def __init__(self): + super().__init__() + self._market_cache = {} + self._endpoint_compatibility = {} + + def list_markets(self) -> dict: + """List all available markets (index families).""" + return self.get("/index/cc/v1/markets") + + def list_markets_instruments(self, + market: str, + instruments: Optional[List[str]] = None, + instrument_status: str = "ACTIVE" + ) -> dict: + """ + List instruments for a given market. If instruments is None, + retrieves *all* mapped instruments from the API. + """ + params = { + "market": market, + "instrument_status": instrument_status + } + if instruments: + params["instruments"] = ",".join(instruments) + return self.get("/index/cc/v1/markets/instruments", params=params) + + def get_latest_tick(self, market: str, instruments: List[str], + apply_mapping: bool = True) -> dict: + """ + Latest OHLCV+ tick data. + + Args: + market: Index family identifier (e.g., 'sda', 'cdifti') + instruments: List of instrument tickers (e.g., ['XBX-USD', 'ETX-USD']) + apply_mapping: Whether to apply instrument mapping + """ + if not instruments: + raise ValueError("The 'instruments' parameter is required") + + params = { + 'market': market, + 'instruments': ','.join(instruments), + 'apply_mapping': str(apply_mapping).lower() + } + + return self.get("/index/cc/v1/latest/tick", params=params) + + def get_historical_days(self, market: str, instrument: str, limit: int = 30, + aggregate: int = 1, fill: bool = True, + apply_mapping: bool = True, response_format: str = "JSON") -> dict: + """ + Historical OHLCV+ by day. + + Args: + market: Index family identifier (e.g., 'sda', 'cdifti') + instrument: Single instrument ticker (e.g., 'XBX-USD') + limit: Number of days to retrieve + aggregate: Aggregation period + fill: Whether to fill missing data + apply_mapping: Whether to apply instrument mapping + response_format: Response format + """ + if not instrument: + raise ValueError("The 'instrument' parameter is required") + + params = { + 'market': market, + 'instrument': instrument, + 'limit': limit, + 'aggregate': aggregate, + 'fill': str(fill).lower(), + 'apply_mapping': str(apply_mapping).lower(), + 'response_format': response_format + } + + return self.get("/index/cc/v1/historical/days", params=params) + + def get_latest_instrument_metadata(self, market: str, instruments: List[str], + apply_mapping: bool = True) -> dict: + """ + Latest instrument metadata. + + Args: + market: Index family identifier (e.g., 'sda', 'cdifti') + instruments: List of instrument tickers (e.g., ['XBX-USD', 'ETX-USD']) + apply_mapping: Whether to apply instrument mapping + """ + if not instruments: + raise ValueError("The 'instruments' parameter is required") + + params = { + 'market': market, + 'instruments': ','.join(instruments), + 'apply_mapping': str(apply_mapping).lower() + } + + return self.get("/index/cc/v1/latest/instrument/metadata", params=params) + + def list_eod_markets_instruments(self, market: str, instruments: List[str] = None, + instrument_status: str = "ACTIVE") -> dict: + """ + List EOD (unmapped) instruments - most reliable for instrument discovery. 
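+
+        Illustrative call (market and instrument mirror the EOD example in d.txt;
+        a valid COINDESK_API_KEY is assumed):
+
+            >>> client = IndexClient()
+            >>> resp = client.list_eod_markets_instruments(market="cdifti", instruments=["BTIUSF-USD"])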
+ + Args: + market: Index family identifier (e.g., 'cdifti') + instruments: Optional list of instruments to filter + instrument_status: Status filter (default: 'ACTIVE') + """ + params = { + 'market': market, + 'instrument_status': instrument_status + } + if instruments: + params['instruments'] = ','.join(instruments) + + return self.get("/index/cc/v1/markets/instruments/unmapped/eod", params=params) + + def get_historical_days_eod(self, market: str, instrument: str, limit: int = 5, + response_format: str = "JSON") -> dict: + """ + EOD historical OHLCV+ by day. + + Args: + market: Index family identifier (e.g., 'cdifti') + instrument: Single instrument ticker + limit: Number of days to retrieve + response_format: Response format + """ + params = { + 'market': market, + 'instrument': instrument, + 'limit': limit, + 'response_format': response_format + } + + return self.get("/index/cc/v1/historical/days/eod", params=params) + + def check_endpoint_compatibility(self, market: str) -> Dict[str, EndpointStatus]: + """ + Check which endpoints are supported for a specific market. + + Args: + market: Market identifier to check + + Returns: + Dictionary mapping endpoint names to their support status + """ + if market in self._endpoint_compatibility: + return self._endpoint_compatibility[market] + + endpoints = {} + test_instruments = ["BTC-USD", "ETH-USD", "XBX-USD"] # Common test instruments + + # Test EOD instruments endpoint + try: + self.list_eod_markets_instruments(market=market) + endpoints["eod_instruments"] = EndpointStatus.SUPPORTED + except APIError as e: + endpoints["eod_instruments"] = EndpointStatus.UNSUPPORTED if e.status_code == 400 else EndpointStatus.UNKNOWN + except Exception: + endpoints["eod_instruments"] = EndpointStatus.UNKNOWN + + # Test mapped instruments endpoint (requires valid instruments) + try: + # First try to get some instruments + instruments = self.discover_instruments_for_market(market, silent=True) + if instruments: + self.list_markets_instruments(market=market, instruments=instruments[:2]) + endpoints["mapped_instruments"] = EndpointStatus.SUPPORTED + else: + endpoints["mapped_instruments"] = EndpointStatus.UNKNOWN + except APIError as e: + endpoints["mapped_instruments"] = EndpointStatus.UNSUPPORTED if e.status_code == 400 else EndpointStatus.UNKNOWN + except Exception: + endpoints["mapped_instruments"] = EndpointStatus.UNKNOWN + + # Test tick data endpoint + try: + instruments = self.discover_instruments_for_market(market, silent=True) + if instruments: + self.get_latest_tick(market=market, instruments=instruments[:2]) + endpoints["tick_data"] = EndpointStatus.SUPPORTED + else: + endpoints["tick_data"] = EndpointStatus.UNKNOWN + except APIError as e: + endpoints["tick_data"] = EndpointStatus.UNSUPPORTED if e.status_code in [400, 404] else EndpointStatus.UNKNOWN + except Exception: + endpoints["tick_data"] = EndpointStatus.UNKNOWN + + # Test historical data endpoint + try: + instruments = self.discover_instruments_for_market(market, silent=True) + if instruments: + self.get_historical_days(market=market, instrument=instruments[0], limit=1) + endpoints["historical_data"] = EndpointStatus.SUPPORTED + else: + endpoints["historical_data"] = EndpointStatus.UNKNOWN + except APIError as e: + endpoints["historical_data"] = EndpointStatus.UNSUPPORTED if e.status_code in [400, 404] else EndpointStatus.UNKNOWN + except Exception: + endpoints["historical_data"] = EndpointStatus.UNKNOWN + + # Test metadata endpoint + try: + instruments = 
self.discover_instruments_for_market(market, silent=True) + if instruments: + self.get_latest_instrument_metadata(market=market, instruments=instruments[:2]) + endpoints["metadata"] = EndpointStatus.SUPPORTED + else: + endpoints["metadata"] = EndpointStatus.UNKNOWN + except APIError as e: + endpoints["metadata"] = EndpointStatus.UNSUPPORTED if e.status_code in [400, 404] else EndpointStatus.UNKNOWN + except Exception: + endpoints["metadata"] = EndpointStatus.UNKNOWN + + self._endpoint_compatibility[market] = endpoints + return endpoints + + def discover_markets_with_compatibility(self) -> List[MarketInfo]: + """ + Discover all markets with their endpoint compatibility and instruments. + + Returns: + List of MarketInfo objects with full compatibility information + """ + safe_print("🔍 Discovering markets with endpoint compatibility...") + + try: + resp = self.list_markets() + raw_markets = resp.get('Data', []) + + if not raw_markets: + safe_print("❌ No markets found in API response") + return [] + + market_infos = [] + + for entry in raw_markets: + if isinstance(entry, dict): + market_id = entry.get('market') + market_name = entry.get('name') + else: + market_id = entry + market_name = None + + if not market_id: + continue + + safe_print(f"\n📊 Analyzing market: {market_id} ({market_name or 'Unknown'})") + + # Check endpoint compatibility + endpoints = self.check_endpoint_compatibility(market_id) + + # Get instruments if possible + instruments = self.discover_instruments_for_market(market_id) + + # Create market info + market_info = MarketInfo( + market_id=market_id, + name=market_name or market_id, + endpoints=endpoints, + instruments=instruments, + last_checked=datetime.now() + ) + + market_infos.append(market_info) + + # Print compatibility summary + supported_count = sum(1 for status in endpoints.values() if status == EndpointStatus.SUPPORTED) + total_count = len(endpoints) + safe_print(f" ✅ Supported endpoints: {supported_count}/{total_count}") + safe_print(f" 🔧 Available instruments: {len(instruments)}") + + return market_infos + + except Exception as e: + safe_print(f"❌ Error discovering markets: {e}") + return [] + + def discover_instruments_for_market(self, market: str, silent: bool = False) -> List[str]: + """ + Discover available instruments for a specific market using multiple approaches. 
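+
+        Illustrative usage (a valid COINDESK_API_KEY is assumed; the market id is
+        one of the index families used elsewhere in this module):
+
+            >>> client = IndexClient()
+            >>> instruments = client.discover_instruments_for_market("cadli", silent=True)
+            >>> isinstance(instruments, list)
+            True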
+ + Args: + market: Market identifier (e.g., 'sda', 'cdifti') + silent: If True, suppress output messages + + Returns: + List of available instrument tickers + """ + if not silent: + safe_print(f"🔍 Discovering instruments for market '{market}'...") + + # 1) EOD endpoint + try: + eod = self.list_eod_markets_instruments(market=market) + data = eod.get("Data", {}).get(market, {}).get("instruments", {}) + if data: + instruments = list(data.keys()) + if not silent: + safe_print(f" ✅ {len(instruments)} via EOD") + return instruments + except Exception as e: + if not silent: + safe_print(f" ⚠️ EOD failed: {e}") + + # 2) Metadata fallback + common = ["BTC-USD", "ETH-USD", "XBX-USD", "ETX-USD"] + try: + meta = self.get_latest_instrument_metadata(market, common) + if meta.get("Data"): + instruments = list(meta["Data"].keys()) + if not silent: + safe_print(f" ✅ {len(instruments)} via metadata") + return instruments + except Exception as e: + if not silent: + safe_print(f" ⚠️ Metadata failed: {e}") + + # 3) General mapped instruments fallback + try: + mapped = self.list_markets_instruments(market=market) + data = mapped.get("Data", {}).get(market, {}) + if data: + instruments = list(data.keys()) + if not silent: + safe_print(f" ✅ {len(instruments)} via general mapped endpoint") + return instruments + except Exception as e: + if not silent: + safe_print(f" ⚠️ General mapped failed: {e}") + + if not silent: + safe_print(f" ❌ No instruments for {market}") + return [] + + def get_market_summary(self, market: str) -> Dict: + """ + Get a comprehensive summary of a market's capabilities. + + Args: + market: Market identifier + + Returns: + Dictionary with market summary information + """ + endpoints = self.check_endpoint_compatibility(market) + instruments = self.discover_instruments_for_market(market, silent=True) + + supported_endpoints = [name for name, status in endpoints.items() if status == EndpointStatus.SUPPORTED] + + return { + "market_id": market, + "total_instruments": len(instruments), + "sample_instruments": instruments[:5], + "supported_endpoints": supported_endpoints, + "endpoint_details": endpoints, + "is_functional": len(supported_endpoints) > 0 and len(instruments) > 0 + } + + +def test_market_comprehensively(client: IndexClient, market: str): + """ + Run comprehensive tests on a market with smart endpoint selection. 
+
+    Args:
+        client: IndexClient instance
+        market: Market identifier to test
+    """
+    safe_print(f"\n{'='*60}")
+    safe_print(f"🧪 COMPREHENSIVE MARKET TEST: {market}")
+    safe_print(f"{'='*60}")
+
+    # Get market summary
+    summary = client.get_market_summary(market)
+
+    safe_print(f"📊 Market Summary:")
+    safe_print(f" Market ID: {summary['market_id']}")
+    safe_print(f" Total Instruments: {summary['total_instruments']}")
+    safe_print(f" Functional: {'✅' if summary['is_functional'] else '❌'}")
+    safe_print(f" Supported Endpoints: {', '.join(summary['supported_endpoints'])}")
+
+    if not summary['is_functional']:
+        safe_print("⚠️ Market is not functional - skipping detailed tests")
+        return
+
+    instruments = summary['sample_instruments'][:3]  # Use first 3 for testing
+    safe_print(f"🔧 Testing with instruments: {instruments}")
+
+    # Test each supported endpoint
+    endpoint_tests = {
+        "eod_instruments": lambda: client.list_eod_markets_instruments(market=market),
+        "mapped_instruments": lambda: client.list_markets_instruments(market=market, instruments=instruments),
+        "tick_data": lambda: client.get_latest_tick(market=market, instruments=instruments),
+        "historical_data": lambda: client.get_historical_days(market=market, instrument=instruments[0], limit=3),
+        "metadata": lambda: client.get_latest_instrument_metadata(market=market, instruments=instruments)
+    }
+
+    results = {}
+
+    for endpoint_name, test_func in endpoint_tests.items():
+        if endpoint_name in summary['supported_endpoints']:
+            safe_print(f"\n🧪 Testing {endpoint_name}...")
+            try:
+                response = test_func()
+                data_count = len(response.get('Data', []))
+                results[endpoint_name] = "✅ SUCCESS"
+                safe_print(f" ✅ SUCCESS - Retrieved {data_count} data points")
+                safe_print(f" 📋 Response keys: {list(response.keys())}")
+            except Exception as e:
+                results[endpoint_name] = f"❌ FAILED: {str(e)[:100]}"
+                safe_print(f" ❌ FAILED: {str(e)[:100]}")
+        else:
+            results[endpoint_name] = "⏭️ SKIPPED (unsupported)"
+            safe_print(f"\n⏭️ Skipping {endpoint_name} (unsupported)")
+
+    # Print test summary
+    safe_print(f"\n📋 Test Results Summary:")
+    for endpoint, result in results.items():
+        safe_print(f" {endpoint}: {result}")
+
+    safe_print(f"\n{'='*60}")
+
+
+
+def fetch_all_functional_markets():
+    """
+    Fetch latest tick and 30-day history for BTC-USD, SOL-USD, ETH-USD
+    across all functional markets.
+    Save results in data/coindesk/indexes.
+ """ + import config + from client import APIError + + if not config.API_KEY: + safe_print("❌ Error: COINDESK_API_KEY not set.") + return + + client = IndexClient() + safe_print("🚀 Fetching data for all functional markets and BTC/SOL/ETH...") + + markets = [ + "cadli", "ccix", "ccxrp", "ccxrpperp", + "cd_mc", "cdi_b", "cdi_mda", "cdor", "sda" + ] + instruments = ["BTC-USD", "SOL-USD", "ETH-USD"] + + for m in markets: + safe_print(f"\n📊 Market: {m}") + market_results = {} + for inst in instruments: + # Latest tick + try: + tick = client.get_latest_tick(market=m, instruments=[inst]) + data = tick.get("Data", {}).get(inst, {}) + safe_print(f" 🔸 {inst} latest price: {data.get('price', 'n/a')}") + market_results[f"{inst}_latest_tick"] = tick + except APIError as e: + safe_print(f" ⚠️ {inst} tick failed (status {e.status_code})") + market_results[f"{inst}_latest_tick"] = {"error": f"APIError {e.status_code}"} + except Exception as e: + safe_print(f" ⚠️ {inst} tick error: {e}") + market_results[f"{inst}_latest_tick"] = {"error": str(e)} + + # 30-day historical + try: + hist = client.get_historical_days( + market=m, + instrument=inst, + limit=30, + aggregate=1, + fill=True + ) + days = hist.get("Data", {}).get("values", []) + safe_print(f" • {len(days)} days of history (first: {days[0] if days else 'n/a'})") + market_results[f"{inst}_history"] = hist + except APIError as e: + safe_print(f" ⚠️ {inst} history failed (status {e.status_code})") + market_results[f"{inst}_history"] = {"error": f"APIError {e.status_code}"} + except Exception as e: + safe_print(f" ⚠️ {inst} history error: {e}") + market_results[f"{inst}_history"] = {"error": str(e)} + + # Save results for this market + save_json_result(f"{m}.json", market_results) + +if __name__ == "__main__": + # main() + # To run the fetch-all-markets script, uncomment below: + fetch_all_functional_markets() \ No newline at end of file diff --git a/src/fetchers/coindesk_client/main.py b/src/fetchers/coindesk_client/main.py new file mode 100644 index 0000000000000000000000000000000000000000..cdeffba68ea29f6538615c84a4c56947ac149044 --- /dev/null +++ b/src/fetchers/coindesk_client/main.py @@ -0,0 +1,360 @@ +""" +main.py – Fetch CoinDesk On-Chain **and** AMM (Uniswap‑style) data +================================================================= +Patched 2025‑07‑13 +------------------ +* **Fixed** positional/keyword mismatch for `get_block`. +* **Flatten + sanitize** CoinDesk AMM responses so Parquet writes succeed. +* **Direct overwrite** for list/dict‑rich endpoints to prevent merge type errors. 
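+
+Typical invocation (flags are defined by the argparse parser at the bottom of this
+module; the values shown are simply the module defaults):
+
+    python main.py --symbols "ETH-0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2" \
+        --days 7 --amm-market uniswapv2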
+""" +from __future__ import annotations + +import sys +import os +import argparse +import logging +import datetime as _dt +import json as _json +from typing import List, Optional, Any, Dict + +from dotenv import load_dotenv +import pandas as pd + +# --------------------------------------------------------------------------- +# Tier-locked endpoint skip flag +# --------------------------------------------------------------------------- +SKIP_TIER_LOCKED = os.getenv("COINDESK_SKIP_TIER_LOCKED", "true").lower() in ("1", "true", "yes") + +# --------------------------------------------------------------------------- +# Path bootstrap – ensure project root is import‑able +# --------------------------------------------------------------------------- +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +PROJECT_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "..", "..")) +if PROJECT_ROOT not in sys.path: + sys.path.insert(0, PROJECT_ROOT) + +# --------------------------------------------------------------------------- +# Local imports (resolved after path bootstrap) +# --------------------------------------------------------------------------- +from onchain import OnChainClient, normalize_data # noqa: E402 +from src.data_cloud.cloud_utils import StorageHandler # noqa: E402 +from src.fetchers.coindesk_client.coindesk_utils import save_and_merge_parquet # noqa: E402 + +# --------------------------------------------------------------------------- +# Logging +# --------------------------------------------------------------------------- +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- +CHAIN_ASSET_MAP: Dict[str, int] = { + "ETH": 2, + "BSC": 8, + "BTC": 1, + "BASE": 2410, + "ARB": 808, +} + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _flatten_records(resp: Any, id_field: str = "id") -> pd.DataFrame: + """Flatten dict‑of‑dict → rows DataFrame; else defer to normalize_data().""" + if isinstance(resp, dict) and all(isinstance(v, dict) for v in resp.values()): + return pd.DataFrame([{id_field: k, **v} for k, v in resp.items()]) + return normalize_data(resp) + + +def _sanitize_for_parquet(df: pd.DataFrame) -> pd.DataFrame: + """Convert any nested dict/list columns to JSON strings for Arrow compatibility.""" + for col in df.columns: + if df[col].dtype == "object": + df[col] = df[col].apply(lambda x: _json.dumps(x) if isinstance(x, (dict, list)) else str(x)) + return df + + +def _save_merge(storage: StorageHandler, filename: str, df: pd.DataFrame, *, date_col: str, days: int): + """Sanitize then merge new df into history via save_and_merge_parquet().""" + if df.empty: + logger.debug("→ %s empty, skip merge", filename) + return + df = _sanitize_for_parquet(df) + save_and_merge_parquet(storage, filename, df, date_col=date_col, days=days) + logger.info("✔ Merged %s (%d rows)", filename, len(df)) + + +def _save_overwrite(storage: StorageHandler, filename: str, df: pd.DataFrame): + """Sanitize then overwrite local Parquet—bypass merge to avoid mixed types.""" + if df.empty: + logger.debug("→ %s empty, skip overwrite", filename) + return + df = _sanitize_for_parquet(df) + local_dir = storage.local_base + path = 
os.path.join(local_dir, filename) + os.makedirs(os.path.dirname(path), exist_ok=True) + df.to_parquet(path, index=False) + logger.info("✔ Overwrote %s (%d rows)", filename, len(df)) + +# --------------------------------------------------------------------------- +# On‑chain batch +# --------------------------------------------------------------------------- + +def fetch_onchain_all( + onchain: OnChainClient, + storage: StorageHandler, + symbols: List[str], + days_old: int, + block_configs: List[dict], +): + # Address metadata – overwrite to prevent nested-list merges + for sym in symbols: + chain_sym, address = sym.split("-", 1) + chain_id = CHAIN_ASSET_MAP.get(chain_sym) + + try: + logger.info("→ Address metadata %s on %s", address, chain_sym) + resp = onchain.get_address_metadata(chain_id, address).get("Data", {}) + df = pd.DataFrame([resp]) + _save_overwrite(storage, f"{sym}_address_metadata.parquet", df) + except Exception: + logger.exception("✗ Address metadata %s", sym) + + # Asset‑by‑address – overwrite for list‑rich fields + try: + logger.info("→ Asset‑by‑address %s on %s", address, chain_sym) + resp = onchain.get_data_by_address( + chain_asset=chain_sym, + address=address, + asset_lookup_priority="SYMBOL", + quote_asset="USD", + ).get("Data", {}) + df = normalize_data(resp) + _save_overwrite(storage, f"{sym}_data_by_address.parquet", df) + except Exception as e: + if getattr(getattr(e, "response", None), "status_code", None) == 404: + logger.warning("→ Asset‑by‑address unsupported for %s", sym) + else: + logger.exception("✗ Asset‑by‑address %s", sym) + + # Historical supply – safe merge + for chain_sym in {s.split("-", 1)[0] for s in symbols}: + # ── Historical supply (premium) ── + if SKIP_TIER_LOCKED: + logger.info("← Skipping historical supply for %s (tier-locked)", chain_sym) + else: + try: + logger.info("→ Supply days %s", chain_sym) + resp = onchain.get_historical_supply_days( + asset=chain_sym, + asset_lookup_priority="SYMBOL", + quote_asset="USD", + ).get("Data", {}) + df = normalize_data(resp) + _save_merge(storage, f"{chain_sym}_historical_supply_days.parquet", df, date_col="timestamp", days=days_old) + except Exception as e: + if getattr(getattr(e, "response", None), "status_code", None) == 401: + logger.warning("→ Supply tier-locked for %s", chain_sym) + else: + logger.exception("✗ Supply days %s", chain_sym) + + # Summary by chain – overwrite nested struct + for chain_sym in {s.split("-", 1)[0] for s in symbols}: + try: + logger.info("→ Chain summary %s", chain_sym) + resp = onchain.get_summary_by_chain( + chain_asset=chain_sym, + asset_lookup_priority="SYMBOL", + ).get("Data", {}) + df = pd.DataFrame([resp]) + _save_overwrite(storage, f"{chain_sym}_chain_summary.parquet", df) + except Exception: + logger.exception("✗ Chain summary %s", chain_sym) + + # Block data – safe merge + for cfg in block_configs: + ca, bn, groups = cfg["chain_asset"], cfg["block_number"], cfg["groups"] + try: + logger.info("→ Block %s:%s", ca, bn) + resp = onchain.get_block(ca, bn, groups=groups).get("Data", {}) + df = pd.DataFrame([resp]) + _save_merge(storage, f"block_{ca}_{bn}.parquet", df, date_col="timestamp", days=days_old) + except Exception: + logger.exception("✗ Block %s:%s", ca, bn) + +# --------------------------------------------------------------------------- +# AMM batch +# --------------------------------------------------------------------------- + +def fetch_amm_all( + onchain: OnChainClient, + storage: StorageHandler, + *, + market: str, + instruments: List[str], + days_old: 
int, + pairs: Optional[List[str]] = None, +): + logger.info("=== AMM %s – %s ===", market, ", ".join(instruments)) + + # Latest tick – safe merge + try: + tick = onchain.get_latest_swap_tick(market=market, instruments=instruments).get("Data", {}) + df = _flatten_records(tick, "instrument") + _save_merge(storage, f"{market}_latest_swap_tick.parquet", df, date_col="timestamp", days=days_old) + except Exception: + logger.exception("✗ Latest tick %s", market) + + # Historical OHLCV – safe merge + for inst in instruments: + try: + hist = onchain.get_historical_swap_days( + market=market, + instrument=inst, + limit=30, + aggregate=1, + fill=True, + ).get("Data", {}) + df = normalize_data(hist) + _save_merge(storage, f"{inst}_historical_swap_days.parquet", df, date_col="timestamp", days=days_old) + except Exception: + logger.exception("✗ OHLCV %s", inst) + + # Hourly messages – safe merge with warning + hour_ts = int(_dt.datetime.utcnow().replace(minute=0, second=0, microsecond=0).timestamp()) + for inst in instruments: + # ── Swap messages (premium) ── + if SKIP_TIER_LOCKED: + logger.info("← Skipping swap-messages for %s (tier-locked)", inst) + else: + try: + swaps = onchain.get_swap_messages_hour(market=market, instrument=inst, hour_ts=hour_ts).get("Data", {}) + df = normalize_data(swaps) + _save_merge(storage, f"{inst}_swap_messages_{hour_ts}.parquet", df, date_col="timestamp", days=days_old) + except Exception as e: + if getattr(getattr(e, "response", None), "status_code", None) == 401: + logger.warning("→ swap-messages tier-locked for %s", inst) + else: + logger.exception("✗ swap messages %s", inst) + try: + liq = onchain.get_liquidity_update_messages_hour(market=market, instrument=inst, hour_ts=hour_ts).get("Data", {}) + df = normalize_data(liq) + _save_merge(storage, f"{inst}_liquidity_updates_{hour_ts}.parquet", df, date_col="timestamp", days=days_old) + except Exception as e: + if SKIP_TIER_LOCKED: + logger.info("← Skipping liquidity-updates for %s (tier-locked)", inst) + elif getattr(getattr(e, "response", None), "status_code", None) == 401: + logger.warning("→ liquidity-updates tier-locked for %s", inst) + else: + logger.exception("✗ liquidity updates %s", inst) + + # Instrument metadata – safe merge + try: + meta = onchain.get_latest_instrument_metadata(market=market, instruments=instruments).get("Data", {}) + df = _flatten_records(meta, "instrument") + _save_merge(storage, f"{market}_instrument_metadata.parquet", df, date_col="timestamp", days=days_old) + except Exception: + logger.exception("✗ Instrument metadata %s", market) + + # Market overview – safe merge + try: + mkts = onchain.get_amm_markets(market=market).get("Data", {}) + df = _flatten_records(mkts, "market") + _save_merge(storage, f"{market}_markets.parquet", df, date_col="timestamp", days=days_old) + except Exception: + logger.exception("✗ Markets %s", market) + + # Optional pairs listing – safe merge + if pairs: + try: + lst = onchain.get_amm_markets_instruments(market=market, instruments=pairs).get("Data", {}) + df = _flatten_records(lst, "pair") + _save_merge(storage, f"{market}_markets_instruments.parquet", df, date_col="timestamp", days=days_old) + except Exception: + logger.exception("✗ Markets+instruments %s", market) + +# --------------------------------------------------------------------------- +# Orchestrator & CLI +# --------------------------------------------------------------------------- + +def fetch_all(config: Dict[str, Any] | None = None): + load_dotenv() + cfg = config or {} + + # Fix: check both 
'api_key' and 'api-key' (CLI uses --api-key), fallback to env + api_key = ( + cfg.get("api_key") + or cfg.get("api-key") + or os.getenv("COINDESK_API_KEY") + ) + print("Using API key:", api_key) + host = cfg.get("host") or os.getenv("COINDESK_API_HOST", "data-api.coindesk.com") + base_url = f"https://{host}/" + + days_old = int(cfg.get("days") or os.getenv("COINDESK_DAYS_OLD", 7)) + + symbols_arg = cfg.get("symbols") or os.getenv("COINDESK_SYMBOLS", "ETH-0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2") + symbols = [s.strip() for s in symbols_arg.split(",") if s.strip()] + + amm_market = cfg.get("amm_market") or os.getenv("COINDESK_AMM_MARKET", "uniswapv2") + amm_instruments_arg = cfg.get("amm_instruments") or os.getenv("COINDESK_AMM_INSTRUMENTS", "0x0d4a11d5eeaac28ec3f61d100daf4d40471f1852_2,0xb4e16d0168e52d35cacd2c6185b44281ec28c9dc_2") + amm_instruments = [s.strip() for s in amm_instruments_arg.split(",") if s.strip()] + + amm_pairs_arg = cfg.get("amm_pairs") or os.getenv("COINDESK_AMM_PAIRS", "WETH-USDC,WETH-USDT") + amm_pairs = [p.strip() for p in amm_pairs_arg.split(",") if p.strip()] + + block_configs = [ + {"chain_asset": 2, "block_number": 19501436, "groups": ["ID", "METADATA", "TRANSACTIONS"]}, + {"chain_asset": 8, "block_number": 33459930, "groups": ["ID", "METADATA", "TRANSACTIONS"]}, + {"chain_asset": 1, "block_number": 840946, "groups": ["ID", "METADATA", "TRANSACTIONS"]}, + {"chain_asset": 2410, "block_number": 17014740, "groups": ["ID", "METADATA", "TRANSACTIONS"]}, + {"chain_asset": 808, "block_number": 284999999,"groups": ["ID", "METADATA", "TRANSACTIONS"]}, + ] + + onchain = OnChainClient(api_key=api_key, base_url=base_url) + storage = StorageHandler( + endpoint_url=None, + access_key=None, + secret_key=None, + bucket_name=None, + local_base="data/coindesk/onchain", + ) + + # ------------------------------------------------------------------ + # Execute batches + # ------------------------------------------------------------------ + logger.info("=== Fetching on-chain data ===") + fetch_onchain_all(onchain, storage, symbols, days_old, block_configs) + + logger.info("=== Fetching AMM (%s) data ===", amm_market) + fetch_amm_all( + onchain, + storage, + market=amm_market, + instruments=amm_instruments, + days_old=days_old, + pairs=amm_pairs, + ) + +# --------------------------------------------------------------------------- +# CLI wrapper +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Fetch CoinDesk On-Chain & AMM data") + parser.add_argument("--symbols", help="comma-separated chain-symbol addresses (e.g. 'ETH-0x...,BTC-...')") + parser.add_argument("--days", type=int, help="merge window in days (default 7)") + parser.add_argument("--api-key", help="CoinDesk API key") + parser.add_argument("--host", help="API host override") + # AMM extras ------------------------------------------------------ + parser.add_argument("--amm-market", help="AMM market (e.g. 
'uniswapv2')") + parser.add_argument("--amm-instruments", help="comma-separated instrument addresses") + parser.add_argument("--amm-pairs", help="comma-separated token pairs for markets+instruments") + + args = parser.parse_args() + cfg = {k: v for k, v in vars(args).items() if v is not None} + + # Fallbacks to env handled inside fetch_all + fetch_all(cfg) diff --git a/src/fetchers/coindesk_client/marketcap.py b/src/fetchers/coindesk_client/marketcap.py new file mode 100644 index 0000000000000000000000000000000000000000..8faaac2c225ed90afd8c40f3b4c507f12c6f99fa --- /dev/null +++ b/src/fetchers/coindesk_client/marketcap.py @@ -0,0 +1,33 @@ +""" +marketcap.py – Market capitalization endpoints for CoinDesk API client. + +- get_latest_marketcap(): Fetch the latest market capitalization snapshot. +- get_historical_marketcap(symbol, start, end=None, limit=None): Retrieve historical market cap data for a given asset. +""" + +from client import BaseClient + +class MarketCapClient(BaseClient): + def get_latest_marketcap(self) -> dict: + """ + GET /overview/v1/latest/marketcap/all/tick + Returns the latest tick-level market-capitalisation snapshot for all assets. + """ + return self._get("overview/v1/latest/marketcap/all/tick") + + def get_historical_marketcap(self, symbol, start, end=None, limit=None): + """ + Fetch historical market capitalization for a specific asset over a timeframe. + + :param symbol: Asset symbol, e.g., "BTC-USD". + :param start: ISO8601 start datetime string. + :param end: ISO8601 end datetime string (optional). + :param limit: Maximum number of records to return (optional). + :return: JSON response with historical market cap data. + """ + params = {"start": start} + if end: + params["end"] = end + if limit is not None: + params["limit"] = limit + return self._get(f"marketcap/{symbol}/history", params=params) diff --git a/src/fetchers/coindesk_client/onchain.py b/src/fetchers/coindesk_client/onchain.py new file mode 100644 index 0000000000000000000000000000000000000000..ddd13221e0383383d94ca139e0bbe2b8c329d739 --- /dev/null +++ b/src/fetchers/coindesk_client/onchain.py @@ -0,0 +1,303 @@ +""" +onchain.py – CoinDesk Data API On-Chain & AMM endpoints. + +This client wraps the publicly-documented CoinDesk **/onchain/** routes, now including +Automated Market Maker (AMM) queries for Uniswap-style DEXs. + +Provided functionality +---------------------- +* Processed block data (multi-chain) +* Address metadata & asset summaries +* Historical supply-day metrics +* **NEW – AMM endpoints** + · Latest swap tick (price/volume snapshot) + · Historical OHLCV+ for swaps (daily aggregation) + · Raw swap messages (per-hour granularity) + · Liquidity-update messages (per-hour granularity) + · Instrument metadata + · Market & instrument discovery + +All helper methods return the raw `requests.Response` JSON. You can pass the output +through `normalize_data()` to obtain a tidy *pandas* `DataFrame`. + +Example +~~~~~~~ +>>> client = OnChainClient(api_key="YOUR_COIN_DESK_KEY") +>>> df = normalize_data( +... client.get_latest_swap_tick( +... market="uniswapv2", +... instruments=[ +... "0x0d4a11d5eeaac28ec3f61d100daf4d40471f1852_2", # WETH/USDT +... "0xb4e16d0168e52d35cacd2c6185b44281ec28c9dc_2", # WETH/USDC +... ] +... ) +... 
) +>>> df.head() + +Dependencies +------------ +* `pandas` (tabular manipulation) +* `requests` (via BaseClient) +""" +from __future__ import annotations + +from typing import Any, List, Optional, Dict +import pandas as pd + +from client import BaseClient # ← must expose ._get() + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def normalize_data(raw: Any) -> pd.DataFrame: + """Best-effort conversion of API *raw* JSON into a :class:`pandas.DataFrame`. Handles + the three typical response shapes returned by CoinDesk: + + * **list[dict]** – directly convertible via :pyclass:`pandas.DataFrame` + * **dict[str, list]** where all lists are equal length – idem + * **dict[str, Any]** heterogeneous – wrapped in a single-row DataFrame + """ + if isinstance(raw, list): + return pd.DataFrame(raw) + + if isinstance(raw, dict): + try: + return pd.DataFrame(raw) + except ValueError: # unequal length sequences → single row + return pd.DataFrame([raw]) + + # Fallback – unknown shape + return pd.DataFrame() + + +# --------------------------------------------------------------------------- +# Main client +# --------------------------------------------------------------------------- + +class OnChainClient(BaseClient): + """Typed thin wrapper around the CoinDesk On-Chain REST API.""" + + # --------------------------------------------------------------------- + # Core (already present) ------------------------------------------------ + # --------------------------------------------------------------------- + + def get_block(self, chain_asset: int, block_number: int, *, groups: List[str]): + """Processed block data for *chain_asset* at *block_number*. + + ``groups`` is a list such as ``["ID", "METADATA", "TRANSACTIONS"]``. + Maps to ``/onchain/v1/block/{chain_asset}``. + """ + return self._get( + f"onchain/v1/block/{chain_asset}", + params={"block_number": block_number, "groups": ",".join(groups)}, + ) + + def get_address_metadata(self, chain_asset: int, address: str): + """Rich metadata for an *address* on *chain_asset*.""" + return self._get( + f"onchain/v1/address/metadata/{chain_asset}", params={"address": address} + ) + + def get_summary_by_chain(self, chain_asset: str, *, asset_lookup_priority: str = "SYMBOL"): + """Summary view of assets for a blockchain network.""" + return self._get( + "onchain/v3/summary/by/chain", + params={ + "chain_asset": chain_asset, + "asset_lookup_priority": asset_lookup_priority, + }, + ) + + def get_data_by_address( + self, + chain_asset: str, + address: str, + *, + asset_lookup_priority: str = "SYMBOL", + quote_asset: str = "USD", + ): + """Look-up asset data (balance, value, etc.) 
by *address*.""" + return self._get( + "onchain/v2/data/by/address", + params={ + "chain_asset": chain_asset, + "address": address, + "asset_lookup_priority": asset_lookup_priority, + "quote_asset": quote_asset, + }, + ) + + def get_historical_supply_days( + self, + asset: str, + *, + asset_lookup_priority: str = "SYMBOL", + quote_asset: Optional[str] = None, + ): + """Daily historical supply for *asset* – available for major networks.""" + params: Dict[str, str] = { + "asset": asset, + "asset_lookup_priority": asset_lookup_priority, + } + if quote_asset: + params["quote_asset"] = quote_asset + return self._get("onchain/v2/historical/supply/days", params=params) + + # --------------------------------------------------------------------- + # AMM (new) ------------------------------------------------------------ + # --------------------------------------------------------------------- + + # Helpers – convert booleans to lower-case strings required by API + _bool = staticmethod(lambda x: str(bool(x)).lower()) + + def get_latest_swap_tick( + self, + *, + market: str, + instruments: List[str], + instrument_lookup_strategy: str = "ALL_OPTIONS", + apply_mapping: bool = True, + ): + """Latest tick (price, volume, liquidity) for one or many *instruments* on an AMM + *market* (e.g. ``"uniswapv2"``). + + **Endpoint** ``/onchain/v1/amm/latest/swap/tick`` + """ + return self._get( + "onchain/v1/amm/latest/swap/tick", + params={ + "market": market, + "instruments": ",".join(instruments), + "instrument_lookup_strategy": instrument_lookup_strategy, + "apply_mapping": self._bool(apply_mapping), + }, + ) + + def get_historical_swap_days( + self, + *, + market: str, + instrument: str, + limit: int = 30, + aggregate: int = 1, + fill: bool = True, + instrument_lookup_strategy: str = "ALL_OPTIONS", + apply_mapping: bool = True, + ): + """Daily OHLCV+ history for a swap *instrument* (e.g. LP address _tokenId). + + **Endpoint** ``/onchain/v1/amm/historical/swap/days`` + """ + return self._get( + "onchain/v1/amm/historical/swap/days", + params={ + "market": market, + "instrument": instrument, + "limit": limit, + "aggregate": aggregate, + "fill": self._bool(fill), + "instrument_lookup_strategy": instrument_lookup_strategy, + "apply_mapping": self._bool(apply_mapping), + }, + ) + + def get_swap_messages_hour( + self, + *, + market: str, + instrument: str, + hour_ts: int, + instrument_lookup_strategy: str = "ALL_OPTIONS", + apply_mapping: bool = True, + ): + """Raw swap messages (mints/burns/swaps) for a given *hour_ts* (UNIX seconds). + + **Endpoint** ``/onchain/v2/amm/historical/swap-messages/hour`` + """ + return self._get( + "onchain/v2/amm/historical/swap-messages/hour", + params={ + "market": market, + "instrument": instrument, + "hour_ts": hour_ts, + "instrument_lookup_strategy": instrument_lookup_strategy, + "apply_mapping": self._bool(apply_mapping), + }, + ) + + def get_liquidity_update_messages_hour( + self, + *, + market: str, + instrument: str, + hour_ts: int, + instrument_lookup_strategy: str = "ALL_OPTIONS", + apply_mapping: bool = True, + ): + """Liquidity add/remove messages for the specified *hour_ts*. 
+ + **Endpoint** ``/onchain/v2/amm/historical/liquidity-update-messages/hour`` + """ + return self._get( + "onchain/v2/amm/historical/liquidity-update-messages/hour", + params={ + "market": market, + "instrument": instrument, + "hour_ts": hour_ts, + "instrument_lookup_strategy": instrument_lookup_strategy, + "apply_mapping": self._bool(apply_mapping), + }, + ) + + def get_latest_instrument_metadata( + self, + *, + market: str, + instruments: List[str], + instrument_lookup_strategy: str = "ALL_OPTIONS", + apply_mapping: bool = True, + ): + """Token-pair metadata (decimals, symbols, etc.) for *instruments*. + + **Endpoint** ``/onchain/v1/amm/latest/instrument/metadata`` + """ + return self._get( + "onchain/v1/amm/latest/instrument/metadata", + params={ + "market": market, + "instruments": ",".join(instruments), + "instrument_lookup_strategy": instrument_lookup_strategy, + "apply_mapping": self._bool(apply_mapping), + }, + ) + + # ------------------------------------------------------------------ + # Market discovery + # ------------------------------------------------------------------ + + def get_amm_markets(self, *, market: str): + """List details about an AMM *market* (e.g. pools count, TVL).""" + return self._get("onchain/v1/amm/markets", params={"market": market}) + + def get_amm_markets_instruments( + self, + *, + market: str, + instruments: List[str], + instrument_status: str = "ACTIVE", + instrument_lookup_strategy: str = "ALL_OPTIONS", + ): + """Enumerate instruments on an AMM *market* filtered by *instrument_status*. + + **Endpoint** ``/onchain/v1/amm/markets/instruments`` + """ + return self._get( + "onchain/v1/amm/markets/instruments", + params={ + "market": market, + "instruments": ",".join(instruments), + "instrument_status": instrument_status, + "instrument_lookup_strategy": instrument_lookup_strategy, + }, + ) diff --git a/src/fetchers/coindesk_client/sentiment.py b/src/fetchers/coindesk_client/sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..7feba589be26f675cfa33953b0197ba98ee4e096 --- /dev/null +++ b/src/fetchers/coindesk_client/sentiment.py @@ -0,0 +1,32 @@ +""" +sentiment.py – Sentiment data endpoints for CoinDesk API client. + +- get_asset_sentiment(symbol): Fetch the latest sentiment score for a given asset. +- get_historical_sentiment(symbol, days, limit=None): Retrieve sentiment history over N days. +""" + +from client import BaseClient + +class SentimentClient(BaseClient): + def get_asset_sentiment(self, symbol): + """ + Fetch the latest sentiment score for the specified symbol. + + :param symbol: Asset symbol, e.g., "BTC-USD". + :return: JSON response with sentiment score. + """ + return self._get(f"sentiment/{symbol}") + + def get_historical_sentiment(self, symbol, days, limit=None): + """ + Fetch sentiment history for a symbol over the past `days` days. + + :param symbol: Asset symbol, e.g., "BTC-USD". + :param days: Number of days of history to retrieve. + :param limit: Maximum number of records to return (optional). + :return: JSON response with historical sentiment data. 
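+
+        Example (an illustrative sketch – the constructor signature and the exact
+        endpoint behaviour are assumptions based on this client's conventions):
+
+            >>> client = SentimentClient(api_key="YOUR_COINDESK_KEY")
+            >>> history = client.get_historical_sentiment("BTC-USD", days=7, limit=100)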
+ """ + params = {"days": days} + if limit is not None: + params["limit"] = limit + return self._get(f"sentiment/{symbol}/historical", params=params) diff --git a/src/fetchers/coindesk_client/spot.py b/src/fetchers/coindesk_client/spot.py new file mode 100644 index 0000000000000000000000000000000000000000..9420cd6867149de809c954e3c33906c2690d75ad --- /dev/null +++ b/src/fetchers/coindesk_client/spot.py @@ -0,0 +1,83 @@ +# spot.py + +import logging +import requests +from typing import Union, List, Dict, Any +from requests import HTTPError, Session +from client import BaseClient + +logger = logging.getLogger(__name__) + + +class SpotClient(BaseClient): + """ + Spot market endpoints for CCData (CryptoCompare / CoinDesk) Data API. + + - list_markets_instruments(market): all supported instrument codes for a spot market. + - list_markets(market, groups): all spot markets, optionally filtered. + - get_latest_tick(market, instruments): latest tick data for one or more instruments. + """ + + def __init__(self, api_key: str, base_url: str = None, timeout: int = 10): + super().__init__(api_key=api_key, base_url=base_url) + self.timeout = timeout + # Use a Session for connection pooling & retries + self.session = Session() + adapter = requests.adapters.HTTPAdapter(max_retries=3) + self.session.mount("https://", adapter) + + def list_markets_instruments(self, market: str) -> Dict[str, Any]: + """ + GET /spot/v1/markets/instruments + :param market: Exchange slug (e.g. "binance") + :returns: {"Data": [ {instrument, ...}, … ]} + """ + params = {"market": market} + return self._get("spot/v1/markets/instruments", params=params, timeout=self.timeout) + + def list_markets(self, market: str = None, groups: str = "BASIC") -> Dict[str, Any]: + """ + GET /spot/v1/markets + :param market: optional exchange slug to filter by + :param groups: filter group name (e.g. "BASIC", "ADVANCED") + :returns: {"Data": [ {market info…}, … ]} + """ + params: Dict[str, Any] = {"groups": groups} + if market: + params["market"] = market + return self._get("spot/v1/markets", params=params, timeout=self.timeout) + + def get_latest_tick( + self, + market: str, + instruments: Union[str, List[str]] + ) -> Dict[str, Any]: + """ + GET /spot/v1/latest/tick + :param market: Exchange slug (e.g. "binance") + :param instruments: Single ID or list (e.g. "BTC-USDT" or ["BTC-USDT","ETH-USDT"]) + :returns: {"Data": [ { ...tick fields... }, … ]} + """ + # allow list-of-str or comma-string + if isinstance(instruments, (list, tuple)): + instr_param = ",".join(instruments) + else: + instr_param = instruments + + params = {"market": market, "instruments": instr_param} + + try: + resp = self._get("spot/v1/latest/tick", params=params, timeout=self.timeout) + except HTTPError as e: + logger.warning( + "Failed to fetch latest tick(%s) on %s: %s", + instr_param, market, e + ) + return {"Data": []} + + data = resp.get("Data", []) + # ensure we always return a list + if isinstance(data, dict): + data = [data] + + return {"Data": data} diff --git a/src/fetchers/coindesk_client/utils.py b/src/fetchers/coindesk_client/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..eb79a3024aad9d8d3fee3606d8f69a61b26a9960 --- /dev/null +++ b/src/fetchers/coindesk_client/utils.py @@ -0,0 +1,33 @@ +""" +utils.py – Common helpers for CoinDesk API client. 
+ +- Parsing and formatting helpers (e.g., date conversion) +- Logging setup +- Retry/backoff utilities (for transient errors) +""" + +import logging +from datetime import datetime +import time + +def parse_date(date_str): + return datetime.fromisoformat(date_str) + +def setup_logger(name): + logger = logging.getLogger(name) + if not logger.handlers: + handler = logging.StreamHandler() + formatter = logging.Formatter('[%(asctime)s] %(levelname)s %(name)s: %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(logging.INFO) + return logger + +def retry(func, retries=3, delay=2): + for attempt in range(retries): + try: + return func() + except Exception as e: + if attempt == retries - 1: + raise + time.sleep(delay) diff --git a/src/fetchers/crawl4ai/crawl_news.py b/src/fetchers/crawl4ai/crawl_news.py new file mode 100644 index 0000000000000000000000000000000000000000..671ce296f599fcbf6937d9316ca05a483c42c734 --- /dev/null +++ b/src/fetchers/crawl4ai/crawl_news.py @@ -0,0 +1,205 @@ +""" +crawl_news.py – Crawls a list of RSS feeds, grabs full-text when needed, +merges with any existing Parquet in Filebase S3 and uploads the fresh file. + +✱ 2025-07-11 – switched backend to Filebase S3 + • Uses boto3 pointed at Filebase's S3-compatible endpoint + • No local caching of seen URLs: state lives in S3 under seen_urls.txt + +Requirements: + • FILEBASE_ENDPOINT env var, e.g. https://s3.filebase.com + • FILEBASE_ACCESS_KEY and FILEBASE_SECRET_KEY env vars + • FILEBASE_BUCKET env var with your bucket name +""" + +import os +import sys +import asyncio +import tempfile +from datetime import datetime +from io import BytesIO +from pathlib import Path + +from dotenv import load_dotenv +import feedparser +import trafilatura +import pandas as pd +import rich.console +from crawl4ai import AsyncWebCrawler + +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) +from data_cloud.cloud_utils import StorageHandler + +# ─── Configuration ──────────────────────────────────────────────────────────── +load_dotenv() + +FEED_URLS = [ + "https://www.marketwatch.com/rss/topstories", + "https://thedefiant.io/feed/", + "https://www.coindesk.com/arc/outboundfeeds/rss/?outputType=xml", + "https://cointelegraph.com/rss", + "https://cryptopotato.com/feed/", + "https://cryptoslate.com/feed/", + "https://cryptonews.com/news/feed/", + "https://smartliquidity.info/feed/", + "https://www.cnbc.com/id/10000664/device/rss/rss.html", + "https://time.com/nextadvisor/feed/", +] +MAX_AGE_DAYS = 1 +MIN_SUMMARY_LEN = 200 +MIN_CRAWL_LEN = 100 +CRAWL_CONCURRENCY = 4 + +S3_NEWS_PATH = "news/crawled_news/news-latest.parquet" +S3_SEEN_PATH = "news/crawled_news/seen_urls.txt" + +# Filebase S3 creds & endpoint ------------------------------------------------ +FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT") +FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY") +FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY") +FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET") + +if not (FILEBASE_ENDPOINT and FILEBASE_ACCESS_KEY and FILEBASE_SECRET_KEY and FILEBASE_BUCKET): + print("[ERROR] FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, and FILEBASE_BUCKET must be set") + sys.exit(1) + +# Silence logs ---------------------------------------------------------------- +rich.console.Console.print = lambda *a, **k: None +os.environ.update({ + "RICH_NO_COLOR": "1", + "RICH_DISABLE": "1", + "CRAWL4AI_LOG_LEVEL": "CRITICAL", +}) + +# ─── Main routine 
───────────────────────────────────────────────────────────── +async def main() -> None: + # Setup storage handler + storage = StorageHandler( + endpoint_url=FILEBASE_ENDPOINT, + access_key=FILEBASE_ACCESS_KEY, + secret_key=FILEBASE_SECRET_KEY, + bucket_name=FILEBASE_BUCKET, + local_base="data" + ) + + # Load seen-URL cache from S3 only, do not fallback to local or create locally + seen_urls: set[str] = set() + try: + seen_data = storage.s3.get_object(Bucket=storage.bucket, Key=S3_SEEN_PATH)['Body'].read() + text = seen_data.decode() + seen_urls = {line.strip() for line in text.splitlines() if line.strip()} + print(f"[INFO] Loaded {len(seen_urls)} seen URLs from S3") + except Exception: + print(f"[INFO] No seen URLs found in S3. Treating as empty.") + seen_urls = set() + + # Fetch & parse RSS feeds ------------------------------------------------- + to_crawl, immediate = [], [] + now_utc = datetime.utcnow() + for url in FEED_URLS: + feed = feedparser.parse(url) + new_count = 0 + for e in feed.entries: + ts = e.get("published_parsed") or e.get("updated_parsed") + if not ts: + continue + link = e.link + if link in seen_urls: + continue + new_count += 1 + + content = e.get("content") + if content: + txt = "".join(p.value for p in content).strip() + if len(txt) >= MIN_CRAWL_LEN: + immediate.append({"url": link, "text": txt, "timestamp": now_utc.isoformat()}) + seen_urls.add(link) + continue + + summ = e.get("summary", "").strip() + if len(summ) >= MIN_SUMMARY_LEN: + immediate.append({"url": link, "text": summ, "timestamp": now_utc.isoformat()}) + seen_urls.add(link) + else: + to_crawl.append(link) + + print(f"• Feed {url} -> {new_count} new items") + + # Selective crawl for short summaries ------------------------------------ + crawled = [] + if to_crawl: + print(f"[INFO] Crawling {len(to_crawl)} pages…") + async with AsyncWebCrawler( + seeds=to_crawl, + max_pages=len(to_crawl), + concurrency=CRAWL_CONCURRENCY, + obey_robots_txt=True, + ) as crawler: + pages = await asyncio.gather(*(crawler.arun(u) for u in to_crawl)) + for sub in pages: + for page in sub: + if page.url not in seen_urls: + txt = trafilatura.extract(page.html, favor_recall=True) + if txt and len(txt.strip()) >= MIN_CRAWL_LEN: + crawled.append({"url": page.url, "text": txt.strip(), "timestamp": now_utc.isoformat()}) + seen_urls.add(page.url) + + # Merge, filter & dedupe -------------------------------------------------- + new_results = immediate + crawled + if not new_results: + print("[WARNING] No new articles to process") + return + + df_new = pd.DataFrame(new_results) + df_new["timestamp"] = pd.to_datetime(df_new["timestamp"], utc=True) + + # Load existing Parquet (cloud or local) + df_old = pd.DataFrame() + try: + parquet_bytes = storage.download(S3_NEWS_PATH) + with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp: + tmp.write(parquet_bytes) + tmp_path = tmp.name + df_old = pd.read_parquet(tmp_path) + os.remove(tmp_path) + print(f"[INFO] Loaded {len(df_old)} existing articles from {storage.get_last_mode()}") + except Exception: + print(f"[INFO] No existing Parquet found in cloud or local storage.") + + df = pd.concat([df_old, df_new], ignore_index=True) + cutoff = pd.Timestamp.utcnow() - pd.Timedelta(days=MAX_AGE_DAYS) + df = df[df.timestamp >= cutoff] + df = df.sort_values("timestamp").drop_duplicates("url", keep="last") + print(f"[DEBUG] old rows: {len(df_old)}, new rows: {len(df_new)}, merged: {len(df)}") + + # Upload updated Parquet to S3 only + parquet_buf = BytesIO() + df.to_parquet(parquet_buf, 
index=False) + data = parquet_buf.getvalue() + if not data: + raise RuntimeError("Refusing to upload empty Parquet") + storage.s3.put_object(Bucket=storage.bucket, Key=S3_NEWS_PATH, Body=data, ContentType="application/octet-stream") + print(f"[OK] Parquet updated: S3:{S3_NEWS_PATH}") + + # Persist seen URLs to S3 only + seen_body = "\n".join(sorted(seen_urls)) + "\n" + storage.s3.put_object(Bucket=storage.bucket, Key=S3_SEEN_PATH, Body=seen_body.encode(), ContentType="text/plain") + print(f"[OK] Seen URLs updated: S3:{S3_SEEN_PATH}") + + # Upload all files in data/crawled-news to S3 under news/ (no local fallback) + local_news_dir = os.path.join("data", "crawled-news") + s3_news_prefix = "news/crawled_news/" + for root, _, files in os.walk(local_news_dir): + for fname in files: + local_path = os.path.join(root, fname) + rel_path = os.path.relpath(local_path, local_news_dir) + s3_key = s3_news_prefix + rel_path.replace("\\", "/") + with open(local_path, "rb") as f: + file_bytes = f.read() + storage.s3.put_object(Bucket=storage.bucket, Key=s3_key, Body=file_bytes, ContentType="application/octet-stream") + print(f"[OK] Uploaded {local_path} -> S3:{s3_key}") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/src/fetchers/crypto_bubbles/fetch_crypto_bubbles.py b/src/fetchers/crypto_bubbles/fetch_crypto_bubbles.py new file mode 100644 index 0000000000000000000000000000000000000000..b29d33ed6ae87833797b62fd132dd0c6d976dd47 --- /dev/null +++ b/src/fetchers/crypto_bubbles/fetch_crypto_bubbles.py @@ -0,0 +1,194 @@ +""" +fetch_crypto_bubbles.py – Fetches CryptoBubbles data, converts to Parquet and JSON report, +then uploads both directly to Filebase S3 instead of local storage. + +✱ 2025-07-11 – switched backend from local filesystem to Filebase S3 + • Uses boto3 against FILEBASE_ENDPOINT + • No local disk writes; everything streams directly to S3 + +Requirements: + • FILEBASE_ENDPOINT env var, e.g. 
https://s3.filebase.com + • FILEBASE_ACCESS_KEY and FILEBASE_SECRET_KEY env vars + • FILEBASE_BUCKET env var with your bucket name + • dotenv for loading env vars from .env (optional) +""" + +import os +import sys +import json +import datetime as _dt +import argparse +from io import BytesIO + +from collections import defaultdict +import numpy as np +import pandas as pd +import requests + + +# Ensure src is in sys.path for direct script execution +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) +from data_cloud.cloud_utils import StorageHandler +from dotenv import load_dotenv + +# ─── Configuration ──────────────────────────────────────────────────────────── +load_dotenv() + +URL = os.getenv("CRYPTOBUBBLES_URL", "https://cryptobubbles.net/backend/data/bubbles1000.usd.json") + +# Filebase S3 credentials +FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT") +FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY") +FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY") +FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET") + +if not all([FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, FILEBASE_BUCKET]): + print("[ERROR] FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, and FILEBASE_BUCKET must be set") + sys.exit(1) + +# boto3 S3 client config +from botocore.config import Config +CFG = Config( + signature_version="s3v4", + s3={"addressing_style": "path"}, +) + + + +# ─── Data fetch & processing ───────────────────────────────────────────────── + +def fetch_json(url: str = URL, timeout: int = 15): + resp = requests.get(url, timeout=timeout) + resp.raise_for_status() + payload = resp.json() + return payload.get("data", payload) if isinstance(payload, dict) else payload + + +def to_dataframe(raw): + return pd.json_normalize(raw) + + +def categorize_columns(df: pd.DataFrame): + groups = defaultdict(list) + for col in df.columns: + if "." 
in col: + prefix, _ = col.split('.', 1) + groups[prefix].append(col) + else: + groups['base'].append(col) + + nice = { + 'base': 'Base Features', + 'symbols': 'Symbols', + 'performance': 'Performance', + 'rankDiffs': 'Rank Differences', + 'exchangePrices': 'Exchange Prices', + 'links': 'Links', + } + + fc = {} + for key, cols in groups.items(): + name = nice.get(key, key.capitalize()) + fc[name] = {'count': len(cols), 'features': cols} + return fc + + +def generate_report(df, configuration): + now = _dt.datetime.utcnow().isoformat() + mem_mb = df.memory_usage(deep=True).sum() / 1024**2 + dataset_info = { + 'shape': [df.shape[0], df.shape[1]], + 'memory_usage_mb': mem_mb, + 'time_range': {'start': None, 'end': None}, + } + + fc = categorize_columns(df) + + missing = df.isna().sum().to_dict() + total_cells = df.shape[0] * df.shape[1] + non_missing = df.count().sum() + completeness = non_missing / total_cells * 100 + col_quals = [(df.shape[0] - m) / df.shape[0] for m in missing.values()] + avg_quality = float(np.mean(col_quals)) + + data_quality = { + 'completeness': completeness, + 'missing_values_by_column': missing, + 'avg_quality_score': avg_quality, + } + + report = { + 'timestamp': now, + 'dataset_info': dataset_info, + 'feature_categories': fc, + 'data_quality': data_quality, + 'feature_importance': {}, + 'configuration': configuration, + } + return report + +# ─── Main ───────────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description='Fetch CryptoBubbles, upload to Filebase') + parser.add_argument('--prefix', default='crypto-bubbles', help='S3 key prefix') + args = parser.parse_args() + + prefix = args.prefix.rstrip('/') + today = _dt.date.today().isoformat() + + raw = fetch_json() + df = to_dataframe(raw) + + # configuration placeholder + configuration = { + 'enable_advanced_indicators': True, + 'enable_feature_selection': True, + 'enable_anomaly_detection': True, + 'max_correlation_threshold': 0.95, + 'min_feature_importance': 0.001, + 'outlier_detection_method': 'iqr', + 'feature_scaling': True, + } + + report = generate_report(df, configuration) + + # prepare Parquet bytes + buf = BytesIO() + df.to_parquet(buf, index=False) + parquet_data = buf.getvalue() + + # prepare JSON report bytes + report_json = json.dumps(report, indent=2).encode() + + + # Use StorageHandler for unified cloud/local upload + storage = StorageHandler( + endpoint_url=None, + access_key=None, + secret_key=None, + bucket_name=None, + local_base="data" + ) + + key_parquet = f"{prefix}/crypto_bubbles_{today}.parquet" + key_report = f"{prefix}/crypto_bubbles_report_{today}.json" + + # Upload Parquet + try: + storage.upload(key_parquet, parquet_data, content_type='application/octet-stream') + print(f"[OK] Uploaded Parquet -> {storage.get_last_mode()}:{key_parquet}") + except Exception as e: + print(f"[ERROR] Failed uploading Parquet: {e}", file=sys.stderr) + + # Upload JSON report + try: + storage.upload(key_report, report_json, content_type='application/json') + print(f"[OK] Uploaded report -> {storage.get_last_mode()}:{key_report}") + except Exception as e: + print(f"[ERROR] Failed uploading report: {e}", file=sys.stderr) + +if __name__ == '__main__': + main() diff --git a/src/fetchers/cryptocompare/client.py b/src/fetchers/cryptocompare/client.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/fetchers/cryptocompare/forum_trending.py 
b/src/fetchers/cryptocompare/forum_trending.py new file mode 100644 index 0000000000000000000000000000000000000000..ff94a9070b0aacfd376461797eb196405b1cdd53 --- /dev/null +++ b/src/fetchers/cryptocompare/forum_trending.py @@ -0,0 +1,26 @@ +""" +forum_trending.py – fetch_forum_trending(extraParams, ...) +""" + +from .client import CryptoCompareClient + + +FORUM_BASE_URL = "https://www.cryptocompare.com/" + +class ForumTrending: + def __init__(self): + self.client = CryptoCompareClient() + + def fetch_forum_trending(self, extraParams=None, **kwargs): + """ + Fetch trending forum topics. + API: https://www.cryptocompare.com/api/forum/get/trending/ + """ + params = {"extraParams": extraParams} if extraParams else {} + params.update(kwargs) + # Use requests directly for this endpoint, as it is not on min-api + import requests + url = FORUM_BASE_URL + "api/forum/get/trending/" + resp = requests.get(url, params=params) + resp.raise_for_status() + return resp.json() diff --git a/src/fetchers/cryptocompare/histohour.py b/src/fetchers/cryptocompare/histohour.py new file mode 100644 index 0000000000000000000000000000000000000000..ad2057abdfd25c8407397b9c68441471e9289891 --- /dev/null +++ b/src/fetchers/cryptocompare/histohour.py @@ -0,0 +1,17 @@ +""" +histohour.py – fetch_histohour(symbol, limit, aggregate, ...) + +API sample: +https://min-api.cryptocompare.com/data/v2/histohour?aggregate=1&e=CCCAGG&extraParams=https:%2F%2Fwww.cryptocompare.com&fsym=BTC&limit=24&tryConversion=false&tsym=USD +""" + +from .client import CryptoCompareClient + +class HistoHour: + def __init__(self): + self.client = CryptoCompareClient() + + def fetch_histohour(self, fsym, tsym, limit=24, aggregate=1, **kwargs): + params = {"fsym": fsym, "tsym": tsym, "limit": limit, "aggregate": aggregate} + params.update(kwargs) + return self.client.get("v2/histohour", params=params) diff --git a/src/fetchers/cryptocompare/recommended.py b/src/fetchers/cryptocompare/recommended.py new file mode 100644 index 0000000000000000000000000000000000000000..7526a18a5a1cce24ac55a5266b6d2b5eb3111d45 --- /dev/null +++ b/src/fetchers/cryptocompare/recommended.py @@ -0,0 +1,17 @@ +""" +recommended.py – fetch_recommended_all(tsym, ...) + +API sample: +http://min-api.cryptocompare.com/data/recommended/all?tsym=USD +""" + +from .client import CryptoCompareClient + +class Recommended: + def __init__(self): + self.client = CryptoCompareClient() + + def fetch_recommended_all(self, tsym, **kwargs): + params = {"tsym": tsym} + params.update(kwargs) + return self.client.get("top/recommended", params=params) diff --git a/src/fetchers/cryptocompare/top_toptier_volume.py b/src/fetchers/cryptocompare/top_toptier_volume.py new file mode 100644 index 0000000000000000000000000000000000000000..97d245dab570ce2f65a5ec9c4b9f77bac67710ee --- /dev/null +++ b/src/fetchers/cryptocompare/top_toptier_volume.py @@ -0,0 +1,17 @@ +""" +top_toptier_volume.py – fetch_top_toptier_volume(assetClass, ...) 
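+
+Example (a sketch only; assumes a working, configured CryptoCompareClient – the
+client.py module in this package is currently empty):
+
+    TopTopTierVolume().fetch_top_toptier_volume("ALL", tsym="USD", limit=100, page=0)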
+ +API sample: +https://min-api.cryptocompare.com/data/top/totaltoptiervol?ascending=true&assetClass=ALL&extraParams=https:%2F%2Fwww.cryptocompare.com&limit=100&page=0&tsym=USD +""" + +from .client import CryptoCompareClient + +class TopTopTierVolume: + def __init__(self): + self.client = CryptoCompareClient() + + def fetch_top_toptier_volume(self, assetClass, **kwargs): + params = {"assetClass": assetClass} + params.update(kwargs) + return self.client.get("top/totaltoptiervolfull", params=params) diff --git a/src/fetchers/cryptocompare/utils.py b/src/fetchers/cryptocompare/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4b19f02f194ff85f718207091b4bc72ca0275a75 --- /dev/null +++ b/src/fetchers/cryptocompare/utils.py @@ -0,0 +1,36 @@ +""" +utils.py – Shared helpers for CryptoCompare API client. +- Timestamp conversion, caching, and rate-limiting +""" + +import time +from functools import wraps + +def timestamp_to_iso(ts): + return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(ts)) + +# Simple in-memory cache (for demonstration) +_cache = {} +def cache_result(func): + @wraps(func) + def wrapper(*args, **kwargs): + key = (func.__name__, args, tuple(sorted(kwargs.items()))) + if key in _cache: + return _cache[key] + result = func(*args, **kwargs) + _cache[key] = result + return result + return wrapper + +# Simple rate limiter (1 call/sec) +def rate_limited(func): + last_called = [0] + @wraps(func) + def wrapper(*args, **kwargs): + elapsed = time.time() - last_called[0] + if elapsed < 1: + time.sleep(1 - elapsed) + result = func(*args, **kwargs) + last_called[0] = time.time() + return result + return wrapper diff --git a/src/fetchers/finnhub/generate_finnhub_features.py b/src/fetchers/finnhub/generate_finnhub_features.py new file mode 100644 index 0000000000000000000000000000000000000000..34f731e294b8a08dfda1192a3fe28f9b1a645b10 --- /dev/null +++ b/src/fetchers/finnhub/generate_finnhub_features.py @@ -0,0 +1,169 @@ +# """generate_finnhub_features.py + +# Automatic feature generator for a local Finnhub data dump (Parquet files). + +# Usage +# ----- +# python generate_finnhub_features.py --data /path/to/finnhub \ +# --out-features features_all.parquet \ +# --out-report feature_report.json + +# The script walks through **all** Parquet files contained in the directory +# structure exported from Finnhub, concatenates them on the **timestamp** index +# (if present) or on the DataFrame index otherwise, prefixes every column with a +# stable identifier built from its file path to guarantee *no data loss*, and +# computes a lightweight metadata report inspired by AdvisorAI's format. + +# Key design principles +# --------------------- +# * No column is dropped – every raw field ends-up in the final output. +# * Column names are namespaced with `__`. +# * When multiple DataFrames contain an explicit timestamp or date column, they +# are converted to pandas `datetime64[ns]` and merged on the outer union of +# timestamps to preserve every record. +# * Numeric features are left untouched; you may append your own engineered +# columns in `extend_features()` without altering the originals. 
+# * The JSON report contains: +# - basic shape / memory stats +# - global time range from the merged index +# - missing-value analysis (per column null counts & completeness %) +# - feature category counts (simple heuristic) +# """ + +# from __future__ import annotations + +# import argparse +# import json +# from pathlib import Path +# from typing import Dict, List, Tuple + +# import pandas as pd +# import numpy as np + + +# TIMESTAMP_CANDIDATES = {"timestamp", "time", "date", "datetime", "t", "ts", "priced_at"} + + +# def find_parquet_files(root: Path) -> List[Path]: +# return [p for p in root.rglob("*.parquet") if p.is_file()] + + +# def build_prefix(file_path: Path, root: Path) -> str: +# rel = file_path.relative_to(root) +# no_ext = rel.as_posix().replace("/", "_").rsplit(".", 1)[0] +# return f"{no_ext}__" + + +# def load_and_prefix(file_path: Path, root: Path) -> Tuple[pd.DataFrame, str]: +# df = pd.read_parquet(file_path) +# prefix = build_prefix(file_path, root) +# df = df.rename(columns={c: f"{prefix}{c}" for c in df.columns}) +# # Identify/standardise timestamp column (if any) +# for c in list(df.columns): +# base = c.split("__")[-1].lower() +# if base in TIMESTAMP_CANDIDATES: +# # convert numeric seconds or string dates to datetime +# df[f"{prefix}__ts"] = pd.to_datetime(df[c], errors="coerce", unit="s") +# df = df.drop(columns=[c]) +# # Do NOT set index, just keep as column +# break +# # Always reset index to avoid merge errors +# df = df.reset_index(drop=True) +# return df, prefix + + +# def merge_frames(frames: List[pd.DataFrame]) -> pd.DataFrame: +# # All frames have RangeIndex, so concat columns +# return pd.concat(frames, axis=1) + + +# def extend_features(df: pd.DataFrame) -> pd.DataFrame: +# """Add engineered features without touching original columns.""" +# numeric_cols = df.select_dtypes(include=[np.number]).columns +# pct_df = df[numeric_cols].pct_change(fill_method=None) +# pct_df.columns = [f"{c}_pct_change1" for c in pct_df.columns] +# return pd.concat([df, pct_df], axis=1) + + +# def feature_category(col: str) -> str: +# c = col.lower() +# if any(k in c for k in ("open", "close", "high", "low", "price", "volume")): +# return "Price / Volume" +# if any(k in c for k in ("pe", "cash", "debt", "income", "margin")): +# return "Fundamentals" +# if any(k in c for k in ("rsi", "macd", "ema", "sma", "bb", "stoch")): +# return "Technical" +# if any(k in c for k in ("news", "sentiment", "social")): +# return "Sentiment" +# return "Other" + + +# def build_report(df: pd.DataFrame) -> Dict: +# # Use merged datetime index for time range +# idx = df.index +# start = str(idx.min()) if not idx.empty else None +# end = str(idx.max()) if not idx.empty else None + +# report: Dict = { +# "timestamp": pd.Timestamp.utcnow().isoformat(), +# "dataset_info": { +# "shape": list(df.shape), +# "memory_usage_mb": round(df.memory_usage(deep=True).sum() / 1024**2, 3), +# "time_range": {"start": start, "end": end}, +# }, +# } + +# # Feature categories +# cats: Dict[str, List[str]] = {} +# for col in df.columns: +# cat = feature_category(col) +# cats.setdefault(cat, []).append(col) +# report["feature_categories"] = {c: {"count": len(v), "features": v[:10]} for c, v in cats.items()} + +# # Data quality +# missing = df.isna().sum().to_dict() +# completeness = 100 - (sum(missing.values()) / df.size * 100) +# report["data_quality"] = {"completeness": completeness, "missing_values_by_column": missing} + +# return report + + +# def save_outputs(df: pd.DataFrame, features_path: Path, report_path: Path): 
+# features_path.parent.mkdir(parents=True, exist_ok=True) +# report_path.parent.mkdir(parents=True, exist_ok=True) +# # Ensure all columns are unique by appending suffixes to duplicates +# cols = pd.Series(df.columns) +# for dup in cols[cols.duplicated()].unique(): +# dups = cols[cols == dup].index.tolist() +# for i, idx in enumerate(dups): +# if i == 0: +# continue +# cols[idx] = f"{dup}_{i}" +# df.columns = cols +# df.to_parquet(features_path) +# with report_path.open("w") as f: +# json.dump(build_report(df), f, indent=2) + + +# def main(): +# # Hardcoded paths for direct script execution +# data_dir = Path("data/finnhub") +# out_features = Path("data/finnhub/merged_features.parquet") +# out_report = Path("data/finnhub/feature_report.json") + +# frames = [] +# for fp in find_parquet_files(data_dir): +# df, _ = load_and_prefix(fp, data_dir) +# frames.append(df) + +# if not frames: +# raise RuntimeError("No Parquet files found in the specified data directory.") + +# merged = merge_frames(frames) +# merged = extend_features(merged) +# save_outputs(merged, out_features, out_report) +# print(f"OK: Features saved to {out_features}, report to {out_report}") + + +# if __name__ == "__main__": +# main() diff --git a/src/fetchers/finnhub/incomp/__init__.py b/src/fetchers/finnhub/incomp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/fetchers/finnhub/incomp/alternative_data/__init__.py b/src/fetchers/finnhub/incomp/alternative_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53571122bcc88ff94431af500db337493749d2a5 --- /dev/null +++ b/src/fetchers/finnhub/incomp/alternative_data/__init__.py @@ -0,0 +1,17 @@ +""" +alternative_data/__init__.py – Package exports for the alternative_data sub-package. +""" + +from .esg_scores import get_esg_scores +from .insider_sentiment import get_insider_sentiment +from .insider_transactions import get_insider_transactions +from .lobbying import get_lobbying +from .social_sentiment import get_social_sentiment + +__all__ = [ + "get_esg_scores", + "get_insider_sentiment", + "get_insider_transactions", + "get_lobbying", + "get_social_sentiment", +] diff --git a/src/fetchers/finnhub/incomp/alternative_data/esg_scores.py b/src/fetchers/finnhub/incomp/alternative_data/esg_scores.py new file mode 100644 index 0000000000000000000000000000000000000000..d4b12a55e505aa16b2ed9a72f94eafd6594d65e1 --- /dev/null +++ b/src/fetchers/finnhub/incomp/alternative_data/esg_scores.py @@ -0,0 +1,25 @@ +""" +alternative_data/esg_scores.py – Retrieve ESG (Environmental, Social, and Governance) scores for a given ticker. +""" + +from typing import Dict +from ..client import FinnhubClient + +def get_esg_scores(client: FinnhubClient, symbol: str) -> Dict: + """ + Fetch ESG scores for the specified symbol. 
+ + :param client: An instance of FinnhubClient + :param symbol: Stock ticker (e.g., "AAPL") + :return: A dict with ESG metrics, for example: + { + 'symbol': 'AAPL', + 'year': 2024, + 'esgScore': 73.5, + 'environmentScore': 68.2, + 'socialScore': 75.1, + 'governanceScore': 78.9 + } + """ + params = {"symbol": symbol} + return client.get("stock/esg", params=params) diff --git a/src/fetchers/finnhub/incomp/alternative_data/insider_sentiment.py b/src/fetchers/finnhub/incomp/alternative_data/insider_sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..2d43e73da86bf869d7d298dd9038f41fc1f2f67e --- /dev/null +++ b/src/fetchers/finnhub/incomp/alternative_data/insider_sentiment.py @@ -0,0 +1,31 @@ +def get_insider_sentiment(client, symbol, from_date=None, to_date=None): + """ + Fetch insider sentiment using the provided FinnhubClient. + :param client: FinnhubClient instance + :param symbol: Stock symbol + :param from_date: Start date (YYYY-MM-DD) + :param to_date: End date (YYYY-MM-DD) + :return: Insider sentiment data + """ + params = {"symbol": symbol} + if from_date: + params["from"] = from_date + if to_date: + params["to"] = to_date + return client.get("stock/insider-sentiment", params=params) +""" +insider_sentiment.py – GET /stock/insider-sentiment +""" +from ..client import FinnhubClient + +class InsiderSentiment: + def __init__(self): + self.client = FinnhubClient() + + def get_insider_sentiment(self, symbol, from_date=None, to_date=None): + params = {"symbol": symbol} + if from_date: + params["from"] = from_date + if to_date: + params["to"] = to_date + return self.client.get("stock/insider-sentiment", params=params) diff --git a/src/fetchers/finnhub/incomp/alternative_data/insider_transactions.py b/src/fetchers/finnhub/incomp/alternative_data/insider_transactions.py new file mode 100644 index 0000000000000000000000000000000000000000..b1bf8b37f1c0c1c089c113e434cbe0727b53b899 --- /dev/null +++ b/src/fetchers/finnhub/incomp/alternative_data/insider_transactions.py @@ -0,0 +1,31 @@ +def get_insider_transactions(client, symbol, from_date=None, to_date=None): + """ + Fetch insider transactions using the provided FinnhubClient. + :param client: FinnhubClient instance + :param symbol: Stock symbol + :param from_date: Start date (YYYY-MM-DD) + :param to_date: End date (YYYY-MM-DD) + :return: Insider transactions data + """ + params = {"symbol": symbol} + if from_date: + params["from"] = from_date + if to_date: + params["to"] = to_date + return client.get("stock/insider-transactions", params=params) +""" +insider_transactions.py – GET /stock/insider-transactions +""" +from ..client import FinnhubClient + +class InsiderTransactions: + def __init__(self): + self.client = FinnhubClient() + + def get_insider_transactions(self, symbol, from_date=None, to_date=None): + params = {"symbol": symbol} + if from_date: + params["from"] = from_date + if to_date: + params["to"] = to_date + return self.client.get("stock/insider-transactions", params=params) diff --git a/src/fetchers/finnhub/incomp/alternative_data/lobbying.py b/src/fetchers/finnhub/incomp/alternative_data/lobbying.py new file mode 100644 index 0000000000000000000000000000000000000000..074e6da66a032f6a95573f716fffc016a5e8ad71 --- /dev/null +++ b/src/fetchers/finnhub/incomp/alternative_data/lobbying.py @@ -0,0 +1,35 @@ +""" +alternative_data/lobbying.py – Retrieve lobbying disclosure data for a given ticker. 
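+
+Illustrative usage (a sketch only; assumes a configured FinnhubClient instance
+from the parent package's client module):
+
+    get_lobbying(client, "AAPL", "2025-01-01", "2025-06-30")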
+""" + +from typing import Dict +from ..client import FinnhubClient + +def get_lobbying(client: FinnhubClient, symbol: str, start_date: str, end_date: str) -> Dict: + """ + Fetch registered lobbying activities for the specified symbol between start_date and end_date. + + :param client: An instance of FinnhubClient + :param symbol: Stock ticker or company symbol (e.g., "AAPL") + :param start_date: Start date in YYYY-MM-DD format + :param end_date: End date in YYYY-MM-DD format + :return: A dict containing lobbying records, typically: + { + 'symbol': 'AAPL', + 'data': [ + { + 'disclosureDate': '2025-05-15', + 'client': 'Big Tech Lobbyists LLC', + 'amount': 250000, + 'subject': 'Regulatory Affairs' + }, + ... + ] + } + """ + params = { + "symbol": symbol, + "from": start_date, + "to": end_date, + } + return client.get("stock/lobbying", params=params) diff --git a/src/fetchers/finnhub/incomp/alternative_data/social_sentiment.py b/src/fetchers/finnhub/incomp/alternative_data/social_sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..ea1141143ffb8412f9f53ba9f04a38cf0753e62d --- /dev/null +++ b/src/fetchers/finnhub/incomp/alternative_data/social_sentiment.py @@ -0,0 +1,40 @@ +""" +alternative_data/social_sentiment.py – Retrieve social media sentiment metrics for a given ticker. +""" + +from typing import Dict +from ..client import FinnhubClient + +def get_social_sentiment( + client: FinnhubClient, + symbol: str, + start_date: str, + end_date: str, + source: str = "reddit" +) -> Dict: + """ + Fetch social sentiment data for the specified symbol between start_date and end_date. + + :param client: An instance of FinnhubClient + :param symbol: Stock ticker (e.g., "AAPL") + :param start_date: Start date in YYYY-MM-DD format + :param end_date: End date in YYYY-MM-DD format + :param source: Sentiment source, either "reddit" or "twitter" (default: "reddit") + :return: A dict containing the social sentiment data, typically: + { + "symbol": "AAPL", + "from": "2025-06-01", + "to": "2025-06-07", + "data": [ + {"date": "2025-06-01", "mention": 123, "sentiment": 0.45}, + ... + ] + } + """ + params = { + "symbol": symbol, + "from": start_date, + "to": end_date, + "source": source + } + return client.get("stock/social-sentiment", params=params) diff --git a/src/fetchers/finnhub/incomp/client.py b/src/fetchers/finnhub/incomp/client.py new file mode 100644 index 0000000000000000000000000000000000000000..3ef6d022ccf3b6e36822da8db22ed2a8866f9846 --- /dev/null +++ b/src/fetchers/finnhub/incomp/client.py @@ -0,0 +1,42 @@ +# """ +# client.py – Manages base URL, API key, and rate-limiting for Finnhub API (60 calls/min). +# """ + +# import os +# from dotenv import load_dotenv +# import requests +# from .utils import rate_limited + +# load_dotenv() + +# # Load your API key and optional base URL from environment +# API_KEY = os.getenv("FINHUB_API_KEY") +# BASE_URL = os.getenv("FINHUB_BASE_URL", "https://finnhub.io/api/v1/") + +# # Default headers for every request +# HEADERS = { +# "X-Finnhub-Token": API_KEY, +# "Content-Type": "application/json" +# } + +# class FinnhubClient: +# def __init__(self): +# """ +# Initialize a session with the default headers. +# """ +# self.session = requests.Session() +# self.session.headers.update(HEADERS) + +# @rate_limited() +# def get(self, endpoint: str, params: dict = None) -> dict: +# """ +# Perform a GET request to the given Finnhub endpoint, respecting rate limits. + +# :param endpoint: API path (e.g. 
"quote", "stock/candle") +# :param params: Query parameters as a dict +# :return: Parsed JSON response as a dict +# """ +# url = BASE_URL.rstrip("/") + "/" + endpoint.lstrip("/") +# response = self.session.get(url, params=params) +# response.raise_for_status() +# return response.json() diff --git a/src/fetchers/finnhub/incomp/economic_data/__init__.py b/src/fetchers/finnhub/incomp/economic_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a249e5e02de8322aea2a50a241088a29f64e13fe --- /dev/null +++ b/src/fetchers/finnhub/incomp/economic_data/__init__.py @@ -0,0 +1,11 @@ +""" +economic_data/__init__.py – Package exports for the economic_data sub-package. +""" + +from .calendar import get_economic_calendar +from .indicators import get_economic_indicators + +__all__ = [ + "get_economic_calendar", + "get_economic_indicators", +] diff --git a/src/fetchers/finnhub/incomp/economic_data/calendar.py b/src/fetchers/finnhub/incomp/economic_data/calendar.py new file mode 100644 index 0000000000000000000000000000000000000000..3d706f368ca3f6f483e900ded89194e3236e5192 --- /dev/null +++ b/src/fetchers/finnhub/incomp/economic_data/calendar.py @@ -0,0 +1,33 @@ +def get_economic_calendar(client, start_date=None, end_date=None, country=None): + """ + Fetch economic calendar events using the provided FinnhubClient. + :param client: FinnhubClient instance + :param start_date: Start date (YYYY-MM-DD) + :param end_date: End date (YYYY-MM-DD) + :param country: Country code (optional) + :return: Economic calendar events + """ + params = {} + if start_date: + params["from"] = start_date + if end_date: + params["to"] = end_date + if country: + params["country"] = country + return client.get("economic/calendar", params=params) +""" +calendar.py – GET /economic/calendar +""" +from ..client import FinnhubClient + +class EconomicCalendar: + def __init__(self): + self.client = FinnhubClient() + + def get_calendar(self, _from=None, to=None): + params = {} + if _from: + params["from"] = _from + if to: + params["to"] = to + return self.client.get("economic/calendar", params=params) diff --git a/src/fetchers/finnhub/incomp/economic_data/indicators.py b/src/fetchers/finnhub/incomp/economic_data/indicators.py new file mode 100644 index 0000000000000000000000000000000000000000..df09dcfc508673eb8fb11000acd3755d77d0824a --- /dev/null +++ b/src/fetchers/finnhub/incomp/economic_data/indicators.py @@ -0,0 +1,37 @@ +""" +economic_data/indicators.py – Fetches time series data for specified macroeconomic indicators via the Finnhub API. +""" + +from typing import Dict, Any +from ..client import FinnhubClient + +def get_economic_indicators( + client: FinnhubClient, + indicator: str, + start_date: str, + end_date: str +) -> Dict[str, Any]: + """ + Retrieve macroeconomic indicator data between start_date and end_date. + + :param client: An instance of FinnhubClient + :param indicator: The code of the macro indicator (e.g., 'US_GDP', 'CPI', 'UnemploymentRate') + :param start_date: Start date in YYYY-MM-DD format + :param end_date: End date in YYYY-MM-DD format + :return: A dict containing the time series for the requested indicator, typically: + { + 'indicator': 'US_GDP', + 'from': '2020-01-01', + 'to': '2025-06-30', + 'data': [ + {'date': '2020-01-01', 'value': 21000.0}, + ... 
+ ] + } + """ + params = { + "indicator": indicator, + "from": start_date, + "to": end_date + } + return client.get("economic/indicator", params=params) diff --git a/src/fetchers/finnhub/incomp/fundamentals/__init__.py b/src/fetchers/finnhub/incomp/fundamentals/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f54e7b6c4c5e170e1d0e3712312310005eab42ed --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/__init__.py @@ -0,0 +1,15 @@ +""" +fundamentals/__init__.py – Package exports for the fundamentals sub-package. +""" + +from .basic_financials import BasicFinancials +from .corporate_actions import CorporateActions +from .earnings import Earnings +from .financials_reported import FinancialsReported + +__all__ = [ + "BasicFinancials", + "CorporateActions", + "Earnings", + "FinancialsReported", +] diff --git a/src/fetchers/finnhub/incomp/fundamentals/basic_financials.py b/src/fetchers/finnhub/incomp/fundamentals/basic_financials.py new file mode 100644 index 0000000000000000000000000000000000000000..7bcd78c5bc8fe5ecba1320b24bb543652453c3e8 --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/basic_financials.py @@ -0,0 +1,23 @@ +""" +basic_financials.py – GET /company-basic-financials +""" + +from typing import Optional, Dict, Any +from ..client import FinnhubClient + +class BasicFinancials: + def __init__(self): + self.client = FinnhubClient() + + def get_basic_financials(self, symbol: str, metric: Optional[str] = None) -> Dict[str, Any]: + """ + Fetch basic financial metrics for the specified symbol. + + :param symbol: Stock ticker (e.g., "AAPL") + :param metric: Specific metric to fetch (e.g., 'pe', 'eps'); if None, returns all available metrics + :return: A dict containing the requested financial metrics. + """ + params = {"symbol": symbol} + if metric: + params["metric"] = metric + return self.client.get("stock/metric", params=params) diff --git a/src/fetchers/finnhub/incomp/fundamentals/corporate_actions.py b/src/fetchers/finnhub/incomp/fundamentals/corporate_actions.py new file mode 100644 index 0000000000000000000000000000000000000000..1852be8c0a952ca653b5f460332039e1975aeef5 --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/corporate_actions.py @@ -0,0 +1,89 @@ +""" +fundamentals/corporate_actions.py – Retrieve corporate actions: dividends, splits, and price targets. +""" + +from typing import Optional, Dict, Any +from ..client import FinnhubClient + +class CorporateActions: + def __init__(self): + """ + Initialize CorporateActions with a FinnhubClient instance. + """ + self.client = FinnhubClient() + + def get_dividends( + self, + symbol: str, + start_date: Optional[str] = None, + end_date: Optional[str] = None + ) -> Dict[str, Any]: + """ + Fetch dividend history for the specified symbol between start_date and end_date. + + :param symbol: Stock ticker (e.g., "AAPL") + :param start_date: Start date in YYYY-MM-DD format + :param end_date: End date in YYYY-MM-DD format + :return: A dict containing dividend data, for example: + { + 'symbol': 'AAPL', + 'data': [ + {'paymentDate': '2025-06-01', 'amount': 0.23, 'recordDate': '2025-05-20'}, + ... 
+ ] + } + """ + params = {"symbol": symbol} + if start_date: + params["from"] = start_date + if end_date: + params["to"] = end_date + return self.client.get("stock/dividend", params=params) + + def get_splits( + self, + symbol: str, + start_date: Optional[str] = None, + end_date: Optional[str] = None + ) -> Dict[str, Any]: + """ + Fetch stock split history for the specified symbol between start_date and end_date. + + :param symbol: Stock ticker (e.g., "AAPL") + :param start_date: Start date in YYYY-MM-DD format + :param end_date: End date in YYYY-MM-DD format + :return: A dict containing split data, for example: + { + 'symbol': 'AAPL', + 'data': [ + {'date': '2020-08-31', 'splitRatio': '4:1'}, + ... + ] + } + """ + params = {"symbol": symbol} + if start_date: + params["from"] = start_date + if end_date: + params["to"] = end_date + return self.client.get("stock/split", params=params) + + def get_price_target(self, symbol: str) -> Dict[str, Any]: + """ + Fetch the price target data for the specified symbol. + + :param symbol: Stock ticker (e.g., "AAPL") + :return: A dict containing price target information, for example: + { + 'symbol': 'AAPL', + 'buy': 165.0, + 'hold': 155.0, + 'sell': 145.0, + 'average': 155.0, + 'high': 170.0, + 'low': 140.0, + 'lastUpdated': '2025-06-15' + } + """ + params = {"symbol": symbol} + return self.client.get("stock/price-target", params=params) diff --git a/src/fetchers/finnhub/incomp/fundamentals/earnings.py b/src/fetchers/finnhub/incomp/fundamentals/earnings.py new file mode 100644 index 0000000000000000000000000000000000000000..8345d111da2c9d3827af1062e65d36e336e130c2 --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/earnings.py @@ -0,0 +1,38 @@ +""" +fundamentals/earnings.py – Fetch historical earnings data for a given ticker. +""" + +from typing import Dict, Any, List, Optional +from ..client import FinnhubClient + +class Earnings: + def __init__(self): + self.client = FinnhubClient() + + def get_earnings( + self, + symbol: str, + start_date: Optional[str] = None, + end_date: Optional[str] = None + ) -> List[Dict[str, Any]]: + """ + Retrieve earnings data for the specified symbol. + + :param symbol: Stock ticker (e.g., "AAPL") + :param start_date: (optional) Start date in YYYY-MM-DD format + :param end_date: (optional) End date in YYYY-MM-DD format + :return: A list of earnings records, each dict typically containing: + { + 'period': '2025-06-30', + 'actual': 1.30, + 'estimate': 1.25, + 'surprise': 0.05, + 'surprisePercent': 4.0 + } + """ + params = {"symbol": symbol} + if start_date: + params["from"] = start_date + if end_date: + params["to"] = end_date + return self.client.get("stock/earnings", params=params) diff --git a/src/fetchers/finnhub/incomp/fundamentals/financials_reported.py b/src/fetchers/finnhub/incomp/fundamentals/financials_reported.py new file mode 100644 index 0000000000000000000000000000000000000000..fa0e573840f1b2c6fc1cf75cfef43047e6ec4c10 --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/financials_reported.py @@ -0,0 +1,46 @@ +""" +fundamentals/financials_reported.py – Retrieve reported financial statements for a given ticker. +""" + +from typing import Dict, Any +from ..client import FinnhubClient + +class FinancialsReported: + def __init__(self): + """ + Initialize FinancialsReported with a FinnhubClient instance. 
+ """ + self.client = FinnhubClient() + + def get_financials_reported( + self, + symbol: str, + freq: str = "annual" + ) -> Dict[str, Any]: + """ + Fetch reported financial statements for the specified symbol. + + :param symbol: Stock ticker (e.g., "AAPL") + :param freq: Frequency of reports: "annual" or "quarter" + :return: A dict containing financial statements, typically: + { + 'symbol': 'AAPL', + 'metric': 'ic', # e.g., 'ic' for income statement, 'bs' for balance sheet, 'cf' for cash flow + 'report': [ + { + 'reportDate': '2025-03-31', + 'ic': { + 'revenue': 89000.0, + 'grossProfit': 38000.0, + ... + } + }, + ... + ] + } + """ + params = { + "symbol": symbol, + "freq": freq + } + return self.client.get("stock/financials-reported", params=params) diff --git a/src/fetchers/finnhub/incomp/fundamentals/key_metrics.py b/src/fetchers/finnhub/incomp/fundamentals/key_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..8b75a294eefbf76d41dfaab483ac378073b93abe --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/key_metrics.py @@ -0,0 +1,11 @@ +""" +key_metrics.py – GET /stock/metric +""" +from ..client import FinnhubClient + +class KeyMetrics: + def __init__(self): + self.client = FinnhubClient() + + def get_key_metrics(self, symbol): + return self.client.get("stock/metric", params={"symbol": symbol}) diff --git a/src/fetchers/finnhub/incomp/fundamentals/ownership.py b/src/fetchers/finnhub/incomp/fundamentals/ownership.py new file mode 100644 index 0000000000000000000000000000000000000000..a44ec196b1c13bedd12adc0bacab7d22918d1d4b --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/ownership.py @@ -0,0 +1,11 @@ +""" +ownership.py – GET /stock/institutional-ownership +""" +from ..client import FinnhubClient + +class Ownership: + def __init__(self): + self.client = FinnhubClient() + + def get_ownership(self, symbol): + return self.client.get("stock/institutional-ownership", params={"symbol": symbol}) diff --git a/src/fetchers/finnhub/incomp/fundamentals/peers.py b/src/fetchers/finnhub/incomp/fundamentals/peers.py new file mode 100644 index 0000000000000000000000000000000000000000..337f459f93df0d816295bfdf1f89f691b26c78d3 --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/peers.py @@ -0,0 +1,11 @@ +""" +peers.py – GET /stock/peers +""" +from ..client import FinnhubClient + +class Peers: + def __init__(self): + self.client = FinnhubClient() + + def get_peers(self, symbol): + return self.client.get("stock/peers", params={"symbol": symbol}) diff --git a/src/fetchers/finnhub/incomp/fundamentals/profile.py b/src/fetchers/finnhub/incomp/fundamentals/profile.py new file mode 100644 index 0000000000000000000000000000000000000000..a0bea41d254a6f356217c92e5bcc7ea12812972b --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/profile.py @@ -0,0 +1,11 @@ +""" +profile.py – GET /stock/profile2 +""" +from ..client import FinnhubClient + +class Profile: + def __init__(self): + self.client = FinnhubClient() + + def get_profile(self, symbol): + return self.client.get("stock/profile2", params={"symbol": symbol}) diff --git a/src/fetchers/finnhub/incomp/market_data/__init__.py b/src/fetchers/finnhub/incomp/market_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..502d1b6e79a517034e8fde1e4754d1cb7a73e418 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/__init__.py @@ -0,0 +1,15 @@ +""" +market_data/__init__.py – Package exports for the market_data sub-package. 
+""" + +from .symbol_listings import get_symbol_list +from .stock_candle import get_stock_candles +from .crypto_candle import get_crypto_candles +from .forex_candle import get_forex_candles + +__all__ = [ + "get_symbol_list", + "get_stock_candles", + "get_crypto_candles", + "get_forex_candles", +] diff --git a/src/fetchers/finnhub/incomp/market_data/crypto_candle.py b/src/fetchers/finnhub/incomp/market_data/crypto_candle.py new file mode 100644 index 0000000000000000000000000000000000000000..b29e2be493ad8620ffc06d2af6650740410f3ce0 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/crypto_candle.py @@ -0,0 +1,40 @@ +""" +market_data/crypto_candle.py – Retrieve historical OHLC (open, high, low, close) and volume data for cryptocurrencies. +""" + +from typing import Dict, Any +from ..client import FinnhubClient + +def get_crypto_candles( + client: FinnhubClient, + symbol: str, + resolution: str, + start_timestamp: int, + end_timestamp: int +) -> Dict[str, Any]: + """ + Fetch candlestick (OHLC + volume) data for a given crypto symbol over a time range. + + :param client: An instance of FinnhubClient + :param symbol: Crypto symbol (e.g., "BINANCE:BTCUSDT") + :param resolution: Time resolution. Supported values: "1", "5", "15", "30", "60", "D", "W", "M" + :param start_timestamp: UNIX timestamp (in seconds) for the start of the range + :param end_timestamp: UNIX timestamp (in seconds) for the end of the range + :return: A dict with keys: + { + 'c': [close prices], + 'h': [high prices], + 'l': [low prices], + 'o': [open prices], + 'v': [volumes], + 't': [timestamps], + 's': 'ok' + } + """ + params = { + "symbol": symbol, + "resolution": resolution, + "from": start_timestamp, + "to": end_timestamp + } + return client.get("crypto/candle", params=params) diff --git a/src/fetchers/finnhub/incomp/market_data/crypto_trades.py b/src/fetchers/finnhub/incomp/market_data/crypto_trades.py new file mode 100644 index 0000000000000000000000000000000000000000..f56bea05d41cf4712edc3401e85d9eafdccd48aa --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/crypto_trades.py @@ -0,0 +1,11 @@ +""" +crypto_trades.py – GET /crypto/trades +""" +from ..client import FinnhubClient + +class CryptoTrades: + def __init__(self): + self.client = FinnhubClient() + + def get_crypto_trades(self, symbol): + return self.client.get("crypto/trades", params={"symbol": symbol}) diff --git a/src/fetchers/finnhub/incomp/market_data/exchange_listings.py b/src/fetchers/finnhub/incomp/market_data/exchange_listings.py new file mode 100644 index 0000000000000000000000000000000000000000..bbf9522adbb285cd71aeef1f5f6dc7ddfb314ea1 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/exchange_listings.py @@ -0,0 +1,14 @@ +""" +exchange_listings.py – GET /stock/exchange, GET /crypto/exchange +""" +from ..client import FinnhubClient + +class ExchangeListings: + def __init__(self): + self.client = FinnhubClient() + + def get_stock_exchanges(self): + return self.client.get("stock/exchange") + + def get_crypto_exchanges(self): + return self.client.get("crypto/exchange") diff --git a/src/fetchers/finnhub/incomp/market_data/forex_candle.py b/src/fetchers/finnhub/incomp/market_data/forex_candle.py new file mode 100644 index 0000000000000000000000000000000000000000..bccbb83d19b9b139f8659f298e7a7691db3b4635 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/forex_candle.py @@ -0,0 +1,40 @@ +""" +market_data/forex_candle.py – Retrieve historical OHLC (open, high, low, close) and volume data for forex pairs. 
+""" + +from typing import Dict, Any +from ..client import FinnhubClient + +def get_forex_candles( + client: FinnhubClient, + symbol: str, + resolution: str, + start_timestamp: int, + end_timestamp: int +) -> Dict[str, Any]: + """ + Fetch candlestick (OHLC + volume) data for a given forex symbol over a time range. + + :param client: An instance of FinnhubClient + :param symbol: Forex pair symbol (e.g., "OANDA:EUR_USD") + :param resolution: Time resolution. Supported values: "1", "5", "15", "30", "60", "D", "W", "M" + :param start_timestamp: UNIX timestamp (in seconds) for the start of the range + :param end_timestamp: UNIX timestamp (in seconds) for the end of the range + :return: A dict with keys: + { + 'c': [close prices], + 'h': [high prices], + 'l': [low prices], + 'o': [open prices], + 'v': [volumes], + 't': [timestamps], + 's': 'ok' + } + """ + params = { + "symbol": symbol, + "resolution": resolution, + "from": start_timestamp, + "to": end_timestamp + } + return client.get("forex/candle", params=params) diff --git a/src/fetchers/finnhub/incomp/market_data/forex_rates.py b/src/fetchers/finnhub/incomp/market_data/forex_rates.py new file mode 100644 index 0000000000000000000000000000000000000000..0334c2bd832eb1d6ebbb923185e77c4d8a9d3db3 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/forex_rates.py @@ -0,0 +1,11 @@ +""" +forex_rates.py – GET /forex/rates +""" +from ..client import FinnhubClient + +class ForexRates: + def __init__(self): + self.client = FinnhubClient() + + def get_forex_rates(self): + return self.client.get("forex/rates") diff --git a/src/fetchers/finnhub/incomp/market_data/quote.py b/src/fetchers/finnhub/incomp/market_data/quote.py new file mode 100644 index 0000000000000000000000000000000000000000..77dc29c46c44cd9fe00505965f0d2263d59e45d8 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/quote.py @@ -0,0 +1,11 @@ +""" +quote.py – GET /quote +""" +from ..client import FinnhubClient + +class Quote: + def __init__(self): + self.client = FinnhubClient() + + def get_quote(self, symbol): + return self.client.get("quote", params={"symbol": symbol}) diff --git a/src/fetchers/finnhub/incomp/market_data/stock_candle.py b/src/fetchers/finnhub/incomp/market_data/stock_candle.py new file mode 100644 index 0000000000000000000000000000000000000000..124da28b0e657ae97798c93d59b516d9fe236cc2 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/stock_candle.py @@ -0,0 +1,40 @@ +""" +market_data/stock_candle.py – Retrieve historical OHLC (open, high, low, close) and volume data for stocks. +""" + +from typing import Dict, Any, Optional +from ..client import FinnhubClient + +def get_stock_candles( + client: FinnhubClient, + symbol: str, + resolution: str, + start_timestamp: int, + end_timestamp: int +) -> Dict[str, Any]: + """ + Fetch candlestick (OHLC + volume) data for a given stock symbol over a time range. + + :param client: An instance of FinnhubClient + :param symbol: Stock ticker (e.g., "AAPL") + :param resolution: Time resolution. 
Supported values: "1", "5", "15", "30", "60", "D", "W", "M" + :param start_timestamp: UNIX timestamp (in seconds) for the start of the range + :param end_timestamp: UNIX timestamp (in seconds) for the end of the range + :return: A dict with keys: + { + 'c': [close prices], + 'h': [high prices], + 'l': [low prices], + 'o': [open prices], + 'v': [volumes], + 't': [timestamps], + 's': 'ok' + } + """ + params = { + "symbol": symbol, + "resolution": resolution, + "from": start_timestamp, + "to": end_timestamp + } + return client.get("stock/candle", params=params) diff --git a/src/fetchers/finnhub/incomp/market_data/symbol_listings.py b/src/fetchers/finnhub/incomp/market_data/symbol_listings.py new file mode 100644 index 0000000000000000000000000000000000000000..4f00ddbf874c13ffc366918ce594c99ad444a2e8 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/symbol_listings.py @@ -0,0 +1,19 @@ +""" +market_data/symbol_listings.py – Fetches the list of tradable symbols from Finnhub. +""" + +from typing import List, Dict +from ..client import FinnhubClient + +def get_symbol_list(client: FinnhubClient, exchange: str = "US") -> List[Dict]: + """ + Retrieve all symbols available on a given exchange. + + :param client: An instance of FinnhubClient + :param exchange: Exchange code (e.g., "US", "NASDAQ", "BINANCE") + :return: A list of symbol metadata dicts, each containing keys like + 'symbol', 'description', 'type', etc. + """ + params = {"exchange": exchange} + symbols = client.get("stock/symbol", params=params) + return symbols diff --git a/src/fetchers/finnhub/incomp/news/__init__.py b/src/fetchers/finnhub/incomp/news/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3fdf81efef21177694a6ff3d4c0b833fff0de84e --- /dev/null +++ b/src/fetchers/finnhub/incomp/news/__init__.py @@ -0,0 +1,11 @@ +""" +news/__init__.py – Package exports for the news sub-package. +""" + +from .company_news import CompanyNews +from .general_news import GeneralNews + +__all__ = [ + "CompanyNews", + "GeneralNews", +] diff --git a/src/fetchers/finnhub/incomp/news/company_news.py b/src/fetchers/finnhub/incomp/news/company_news.py new file mode 100644 index 0000000000000000000000000000000000000000..11a5719ddf1f7292be87588591d000b8ed2a36be --- /dev/null +++ b/src/fetchers/finnhub/incomp/news/company_news.py @@ -0,0 +1,45 @@ +""" +news/company_news.py – Fetch company-specific news articles from Finnhub. +""" + +from typing import List, Dict, Optional +from ..client import FinnhubClient + +class CompanyNews: + def __init__(self): + """ + Initialize CompanyNews with a FinnhubClient instance. + """ + self.client = FinnhubClient() + + def get_company_news( + self, + symbol: str, + start_date: Optional[str] = None, + end_date: Optional[str] = None + ) -> List[Dict]: + """ + Retrieve news articles for a specific company. + + :param symbol: Stock ticker (e.g., "AAPL") + :param start_date: Filter news from this date (YYYY-MM-DD) + :param end_date: Filter news up to this date (YYYY-MM-DD) + :return: A list of news article dicts, each containing keys such as: + { + 'category': 'general', + 'datetime': 1623777600, + 'headline': 'Apple releases new product...', + 'id': 12345, + 'image': 'https://...', + 'related': 'AAPL', + 'source': 'CNBC', + 'summary': 'Apple announced...', + 'url': 'https://...' 
+ } + """ + params: Dict[str, str] = {"symbol": symbol} + if start_date: + params["from"] = start_date + if end_date: + params["to"] = end_date + return self.client.get("company-news", params=params) diff --git a/src/fetchers/finnhub/incomp/news/general_news.py b/src/fetchers/finnhub/incomp/news/general_news.py new file mode 100644 index 0000000000000000000000000000000000000000..d911db49cd8a03170a7e09bbfac93382938f74f7 --- /dev/null +++ b/src/fetchers/finnhub/incomp/news/general_news.py @@ -0,0 +1,44 @@ +""" +news/general_news.py – Fetch general market news articles from Finnhub. +""" + +from typing import List, Dict, Optional +from ..client import FinnhubClient + +class GeneralNews: + def __init__(self): + """ + Initialize GeneralNews with a FinnhubClient instance. + """ + self.client = FinnhubClient() + + def get_general_news( + self, + category: Optional[str] = None, + min_id: Optional[int] = None + ) -> List[Dict]: + """ + Retrieve general news articles. + + :param category: (optional) News category filter, one of: + 'general', 'forex', 'crypto', 'merger', 'sentiment', + 'ipo', 'private equity', 'public offerings', etc. + :param min_id: (optional) Return articles with ID greater than this value + :return: A list of news article dicts, each containing keys such as: + { + 'category': 'general', + 'datetime': 1623777600, + 'headline': 'Market rallies on positive earnings...', + 'id': 12345, + 'image': 'https://...', + 'source': 'Reuters', + 'summary': 'Stocks rallied today...', + 'url': 'https://...' + } + """ + params: Dict[str, str] = {} + if category: + params["category"] = category + if min_id is not None: + params["minId"] = str(min_id) + return self.client.get("news", params=params) diff --git a/src/fetchers/finnhub/incomp/utils.py b/src/fetchers/finnhub/incomp/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ea194e45f20f066ed71e978334775594e79f3a6a --- /dev/null +++ b/src/fetchers/finnhub/incomp/utils.py @@ -0,0 +1,30 @@ +""" +utils.py – Utility functions, including rate limiting decorator. +""" + +import threading +import time +from functools import wraps + +def rate_limited(max_per_minute: int = 60): + """ + Decorator that limits a function to at most `max_per_minute` calls per minute. + Uses a simple token bucket algorithm. 
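# Illustrative sketch (editor annotation, not patch content): applying the decorator
# defined here to a fetch helper. Despite the docstring's "token bucket" wording, the
# implementation simply enforces a minimum gap of 60 / max_per_minute seconds between
# calls (serialized by the lock), which is usually what free-tier API limits need.
@rate_limited(max_per_minute=30)
def fetch_quote(client, symbol):
    # Callers arriving too quickly sleep inside the wrapper until their slot opens.
    return client.get("quote", params={"symbol": symbol})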
+ """ + interval = 60.0 / max_per_minute + lock = threading.Lock() + last_time = {"t": 0.0} + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + with lock: + elapsed = time.time() - last_time["t"] + wait = interval - elapsed + if wait > 0: + time.sleep(wait) + result = func(*args, **kwargs) + last_time["t"] = time.time() + return result + return wrapper + return decorator diff --git a/src/fetchers/finnhub/main.py b/src/fetchers/finnhub/main.py new file mode 100644 index 0000000000000000000000000000000000000000..c9766a5cf9816798947e678cf7ce1e744464c098 --- /dev/null +++ b/src/fetchers/finnhub/main.py @@ -0,0 +1,476 @@ +import finnhub +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +import os +import json +import time +from datetime import datetime, timedelta +from dotenv import load_dotenv +from typing import List, Dict, Any, Optional +# import logging + +# Load environment variables +load_dotenv() + +class FinnhubDataFetcher: + + def __init__(self): + # Parse multiple API keys from environment variable + api_keys_string = os.getenv('FINNHUB_API_KEY') + if not api_keys_string: + raise ValueError("FINNHUB_API_KEY not found in environment variables") + + # Support both comma-separated and single API key formats + self.api_keys = [key.strip() for key in api_keys_string.split(',') if key.strip()] + if not self.api_keys: + raise ValueError("No valid API keys found in FINNHUB_API_KEY") + + print(f"[INFO] Loaded {len(self.api_keys)} Finnhub API key(s)") + + self.current_key_index = 0 + self.rate_limit_switches = 0 + + self.stock_symbols = os.getenv('STOCK_SYMBOLS', 'AAPL,NVDA,TSLA,GOOGL,MSFT,AMZN').split(',') + self.stock_symbols = [symbol.strip() for symbol in self.stock_symbols] + + self.crypto_symbols = os.getenv('CRYPTO_SYMBOLS', 'BTC,ETH,ADA,DOT,LINK,UNI,AVAX,MATIC').split(',') + self.crypto_symbols = [symbol.strip() for symbol in self.crypto_symbols] + + self.crypto_pairs = os.getenv('CRYPTO_PAIRS', 'BINANCE:BTCUSDT,BINANCE:ETHUSDT,BINANCE:ADAUSDT,COINBASE:BTC-USD,COINBASE:ETH-USD').split(',') + self.crypto_pairs = [pair.strip() for pair in self.crypto_pairs] + + self.forex_pairs = os.getenv('FOREX_PAIRS', 'OANDA:EUR_USD,OANDA:GBP_USD,OANDA:USD_JPY,OANDA:AUD_USD,OANDA:USD_CAD').split(',') + self.forex_pairs = [pair.strip() for pair in self.forex_pairs] + + # Initialize client with first API key + from finnhub import Client + self.client = self._create_client() + self.base_dir = "data/finnhub" + + # Create base directory structure + self.create_directory_structure() + + # Date ranges for historical data + self.end_date = int(datetime.now().timestamp()) + self.start_date = int((datetime.now() - timedelta(days=365)).timestamp()) + self.date_str_from = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d') + self.date_str_to = datetime.now().strftime('%Y-%m-%d') + + def _create_client(self): + """Create a Finnhub client with the current API key""" + from finnhub import Client + current_key = self.api_keys[self.current_key_index] + print(f"[INFO] Using API key #{self.current_key_index + 1}: {current_key[:8]}...") + return Client(api_key=current_key) + + def _switch_api_key(self): + """Switch to the next available API key""" + if len(self.api_keys) == 1: + print("[WARNING] Only one API key available, cannot switch") + return False + + old_index = self.current_key_index + self.current_key_index = (self.current_key_index + 1) % len(self.api_keys) + self.rate_limit_switches += 1 + + print(f"[SWITCH] Switching from API key #{old_index + 1} to 
#{self.current_key_index + 1} (switch #{self.rate_limit_switches})") + + # Create new client with the new API key + self.client = self._create_client() + + # Add a longer delay after switching keys + time.sleep(2.0) + return True + + def _is_rate_limit_error(self, error_message): + """Check if the error indicates a rate limit issue""" + rate_limit_indicators = [ + "429", + "rate limit", + "too many requests", + "api limit", + "quota exceeded", + "limit exceeded" + ] + error_str = str(error_message).lower() + return any(indicator in error_str for indicator in rate_limit_indicators) + + + def create_directory_structure(self): + """Create directory structure for organized data storage""" + subdirs = [ + 'stock_data', 'company_info', 'financials', 'earnings', 'news', + 'crypto', 'forex', 'market_data', + 'ownership', 'ratings', 'regulatory' + ] + for subdir in subdirs: + dir_path = os.path.join(self.base_dir, subdir) + os.makedirs(dir_path, exist_ok=True) + + def save_data(self, data: Any, filename: str, subdir: str, symbol: str = None): + """ + Robust Parquet writer: + • DataFrame → direct to_parquet + • dict → one-row DataFrame → parquet + • list-of-dicts → DataFrame → parquet + • other lists → single-column table + • other objects → single-row, single-column table + """ + if symbol: + base = os.path.join(self.base_dir, subdir, f"{symbol}_{filename}") + else: + base = os.path.join(self.base_dir, subdir, filename) + path = f"{base}.parquet" + + try: + if isinstance(data, pd.DataFrame): + data = data.replace({'N/A': None, '-': None}) + for col in data.columns: + if data[col].dtype == object: + data[col] = pd.to_numeric(data[col], errors='coerce') + data.to_parquet(path, index=False, engine="pyarrow") + return + + # ———————————————— + # Special-case: any dict whose “data” is a list-of-lists → tabular form + if isinstance(data, dict) and 'data' in data and isinstance(data['data'], list): + cols = None + # Finnhub sometimes calls labels 'metricType' or 'metric' + if 'metricType' in data: + cols = data['metricType'] + elif 'metric' in data: + cols = data['metric'] + # only proceed if we have column names + if cols and all(isinstance(row, (list, tuple)) for row in data['data']): + df = pd.DataFrame(data['data'], columns=cols) + # sanitize placeholders + df = df.replace({'N/A': None, '-': None}) + # coerce object cols to numeric where possible + for c in df.columns: + if df[c].dtype == object: + df[c] = pd.to_numeric(df[c], errors='coerce') + df.to_parquet(path, index=False, engine="pyarrow") + return + # ———————————————— + + if isinstance(data, dict): + df = pd.DataFrame([{k: (None if v == "N/A" else v) for k, v in data.items()}]) + df.to_parquet(path, index=False, engine="pyarrow") + return + + if isinstance(data, list): + if all(isinstance(item, dict) for item in data): + df = pd.DataFrame([{k: (None if v == "N/A" else v) for k, v in item.items()} for item in data]) + df.to_parquet(path, index=False, engine="pyarrow") + else: + tbl = pa.Table.from_pydict({"value": [None if v == "N/A" else v for v in data]}) + pq.write_table(tbl, path) + return + + tbl = pa.Table.from_pydict({"value": [str(data) if data != "N/A" else None]}) + pq.write_table(tbl, path) + + except Exception as e: + print(f"Error saving {filename}: {e}") + + def rate_limit_delay(self, delay: float = 0.1): + """Add delay to respect rate limits""" + time.sleep(delay) + + def safe_api_call(self, func, *args, **kwargs): + """Safely call Finnhub API with retries and rate limit handling""" + max_retries = 3 + base_delay = 1.2 + keys_tried 
= set() + + for attempt in range(max_retries): + try: + # If we've tried all keys, wait longer and reset + if len(keys_tried) >= len(self.api_keys): + print(f"[WARNING] All {len(self.api_keys)} API keys exhausted, waiting 30 seconds...") + time.sleep(30) + keys_tried.clear() + self.current_key_index = 0 + self.client = self._create_client() + + result = func(*args, **kwargs) + + # If successful, apply rate limit delay and return + if result: + if attempt > 0: + print(f"[SUCCESS] API call succeeded on attempt {attempt + 1}") + self.rate_limit_delay() + return result + else: + print("[WARNING] API returned empty result") + + except Exception as e: + error_msg = str(e) + keys_tried.add(self.current_key_index) + + # Check if it's a rate limit error + if self._is_rate_limit_error(error_msg): + print(f"[RATE_LIMIT] API key #{self.current_key_index + 1} hit rate limit: {error_msg}") + + # Try to switch to next API key + if self._switch_api_key(): + continue # Retry with new API key + else: + print("[ERROR] No more API keys available for switching") + + print(f"[ERROR] API call attempt {attempt + 1}/{max_retries} failed: {error_msg}") + + if attempt < max_retries - 1: + delay = base_delay * (2 ** attempt) + print(f"[RETRY] Waiting {delay:.1f} seconds before retry...") + time.sleep(delay) + else: + print(f"[FAILED] All {max_retries} attempts failed") + + return None + + def get_api_key_status(self): + """Get status information about API key usage""" + return { + "total_keys": len(self.api_keys), + "current_key": self.current_key_index + 1, + "rate_limit_switches": self.rate_limit_switches, + "current_key_preview": self.api_keys[self.current_key_index][:8] + "..." + } + + def print_api_key_status(self): + """Print API key usage status""" + status = self.get_api_key_status() + print(f"\n[API_STATUS] Using {status['total_keys']} API keys") + print(f"[API_STATUS] Current: Key #{status['current_key']} ({status['current_key_preview']})") + print(f"[API_STATUS] Rate limit switches: {status['rate_limit_switches']}") + if status['rate_limit_switches'] > 0: + print(f"[API_STATUS] Effective rate limit handling active") + print() + + def fetch_stock_data(self): + """Fetch stock-related data""" + for symbol in self.stock_symbols: + quote = self.safe_api_call(self.client.quote, symbol) + if quote: + self.save_data(quote, "current_quote", "stock_data", symbol) + + def fetch_company_info(self): + """Fetch company information""" + for symbol in self.stock_symbols: + profile = self.safe_api_call(self.client.company_profile2, symbol=symbol) + if profile: + self.save_data(profile, "company_profile", "company_info", symbol) + peers = self.safe_api_call(self.client.company_peers, symbol) + if peers: + self.save_data(peers, "company_peers", "company_info", symbol) + + def fetch_financials(self): + """Fetch financial data""" + for symbol in self.stock_symbols: + basic_financials = self.safe_api_call( + self.client.company_basic_financials, symbol, 'all' + ) + if basic_financials: + self.save_data(basic_financials, "basic_financials", "financials", symbol) + reported_financials = self.safe_api_call( + self.client.financials_reported, symbol=symbol, freq='annual' + ) + if reported_financials: + self.save_data(reported_financials, "reported_financials", "financials", symbol) + + def fetch_earnings_data(self): + """Fetch earnings-related data""" + for symbol in self.stock_symbols: + earnings = self.safe_api_call(self.client.company_earnings, symbol, limit=10) + if earnings: + self.save_data(earnings, "earnings_surprises", 
"earnings", symbol) + + def fetch_news_data(self): + """Fetch news and sentiment data""" + for symbol in self.stock_symbols: + company_news = self.safe_api_call( + self.client.company_news, symbol, + _from=self.date_str_from, to=self.date_str_to + ) + if company_news: + self.save_data(company_news, "company_news", "news", symbol) + + def fetch_ownership_data(self): + """Fetch ownership data""" + for symbol in self.stock_symbols: + insider_transactions = self.safe_api_call( + self.client.stock_insider_transactions, symbol, + self.date_str_from, self.date_str_to + ) + if insider_transactions: + self.save_data(insider_transactions, "insider_transactions", "ownership", symbol) + insider_sentiment = self.safe_api_call( + self.client.stock_insider_sentiment, symbol, + self.date_str_from, self.date_str_to + ) + if insider_sentiment: + self.save_data(insider_sentiment, "insider_sentiment", "ownership", symbol) + + def fetch_ratings_data(self): + """Fetch analyst ratings and recommendations""" + for symbol in self.stock_symbols: + recommendations = self.safe_api_call(self.client.recommendation_trends, symbol) + if recommendations: + self.save_data(recommendations, "recommendation_trends", "ratings", symbol) + + def fetch_regulatory_data(self): + """Fetch regulatory and compliance data""" + for symbol in self.stock_symbols: + filings = self.safe_api_call( + self.client.filings, symbol=symbol, + _from=self.date_str_from, to=self.date_str_to + ) + if filings: + self.save_data(filings, "sec_filings", "regulatory", symbol) + # patents = self.safe_api_call( + # self.client.stock_uspto_patent, symbol, + # self.date_str_from, self.date_str_to + # ) + # if patents: + # self.save_data(patents, "uspto_patents", "regulatory", symbol) + visa_apps = self.safe_api_call( + self.client.stock_visa_application, symbol, + self.date_str_from, self.date_str_to + ) + if visa_apps: + self.save_data(visa_apps, "visa_applications", "regulatory", symbol) + lobbying = self.safe_api_call( + self.client.stock_lobbying, symbol, + self.date_str_from, self.date_str_to + ) + if lobbying: + self.save_data(lobbying, "lobbying_data", "regulatory", symbol) + usa_spending = self.safe_api_call( + self.client.stock_usa_spending, symbol, + self.date_str_from, self.date_str_to + ) + if usa_spending: + self.save_data(usa_spending, "usa_spending", "regulatory", symbol) + + def fetch_market_data(self): + """Fetch general market data""" + stock_symbols = self.safe_api_call(self.client.stock_symbols, 'US') + if stock_symbols: + self.save_data(stock_symbols, "stock_symbols_us", "market_data") + ipo_calendar = self.safe_api_call( + self.client.ipo_calendar, _from=self.date_str_from, to=self.date_str_to + ) + if ipo_calendar: + self.save_data(ipo_calendar, "ipo_calendar", "market_data") + market_status = self.safe_api_call(self.client.market_status, exchange='US') + if market_status: + self.save_data(market_status, "market_status", "market_data") + market_holidays = self.safe_api_call(self.client.market_holiday, exchange='US') + if market_holidays: + self.save_data(market_holidays, "market_holidays", "market_data") + general_news = self.safe_api_call(self.client.general_news, 'general', min_id=0) + if general_news: + self.save_data(general_news, "general_news", "market_data") + covid_data = self.safe_api_call(self.client.covid19) + if covid_data: + self.save_data(covid_data, "covid19_data", "market_data") + fda_calendar = self.safe_api_call(self.client.fda_calendar) + if fda_calendar: + self.save_data(fda_calendar, "fda_calendar", "market_data") + 
+ def fetch_crypto_data(self): + """Fetch cryptocurrency data""" + crypto_exchanges = self.safe_api_call(self.client.crypto_exchanges) + if crypto_exchanges: + self.save_data(crypto_exchanges, "crypto_exchanges", "crypto") + exchanges = ['BINANCE', 'COINBASE'] + for exchange in exchanges: + symbols = self.safe_api_call(self.client.crypto_symbols, exchange) + if symbols: + self.save_data(symbols, f"crypto_symbols_{exchange.lower()}", "crypto") + + def fetch_forex_data(self): + """Fetch forex data""" + forex_exchanges = self.safe_api_call(self.client.forex_exchanges) + if forex_exchanges: + self.save_data(forex_exchanges, "forex_exchanges", "forex") + forex_symbols = self.safe_api_call(self.client.forex_symbols, 'OANDA') + if forex_symbols: + self.save_data(forex_symbols, "forex_symbols_oanda", "forex") + + def run_full_fetch(self): + """Run complete data fetch for all APIs""" + try: + self.fetch_stock_data() + self.fetch_company_info() + self.fetch_financials() + self.fetch_earnings_data() + self.fetch_news_data() + self.fetch_ownership_data() + self.fetch_ratings_data() + self.fetch_regulatory_data() + self.fetch_market_data() + self.fetch_crypto_data() + self.fetch_forex_data() + except Exception as e: + raise + +def main(): + import sys + import os + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) + """Main function to run the data fetcher""" + try: + fetcher = FinnhubDataFetcher() + + # Print API key status + fetcher.print_api_key_status() + + fetcher.run_full_fetch() + print("Data fetching completed successfully!") + + # Print final API key status + print("\n[FINAL_STATUS] Finnhub API Key Usage Summary:") + fetcher.print_api_key_status() + + # Upload all files in data/finnhub/news to S3 under news/finnhub_news/ + from data_cloud.cloud_utils import StorageHandler + from dotenv import load_dotenv + load_dotenv() + import os + from pathlib import Path + + FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT") + FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY") + FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY") + FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET") + + storage = StorageHandler( + endpoint_url=FILEBASE_ENDPOINT, + access_key=FILEBASE_ACCESS_KEY, + secret_key=FILEBASE_SECRET_KEY, + bucket_name=FILEBASE_BUCKET, + local_base="data" + ) + + local_news_dir = os.path.join("data", "finnhub", "news") + s3_news_prefix = "news/finnhub_news/" + for root, _, files in os.walk(local_news_dir): + for fname in files: + local_path = os.path.join(root, fname) + rel_path = os.path.relpath(local_path, local_news_dir) + s3_key = s3_news_prefix + rel_path.replace("\\", "/") + with open(local_path, "rb") as f: + file_bytes = f.read() + storage.upload(s3_key, file_bytes, content_type="application/octet-stream") + print(f"[OK] Uploaded {local_path} -> S3:{s3_key}") + + except Exception as e: + print(f"Error: {str(e)}") + return 1 + return 0 + +if __name__ == "__main__": + exit(main()) \ No newline at end of file diff --git a/src/fetchers/finnhub/report.py b/src/fetchers/finnhub/report.py new file mode 100644 index 0000000000000000000000000000000000000000..f95e4dd2f8f2af92d1922bd0c9b9d8491bafe442 --- /dev/null +++ b/src/fetchers/finnhub/report.py @@ -0,0 +1,116 @@ +# import os +# import glob +# import json +# import pandas as pd + +# # —— CONFIG —__ +# BASE_DIR = "data/finnhub" +# OUTPUT_FILE = "data/finnhub/finnhub_feature_report.json" + +# # Define your feature‐category buckets by column‐name patterns: +# FEATURE_CATEGORIES = { +# "Price Quotes": [r"^c$", 
r"^o$", r"^h$", r"^l$", r"^pc$", r"^d$", r"^dp$"], +# "Company Profile": [r"^logo$", r"^name$", r"^country$", r"^ticker$"], +# "Financial Metrics": [r"^metric", r"^value$", r"^data$"], +# "Earnings": [r"^surprise", r"^actual", r"^estimate"], +# # add more as needed… +# } + +# def load_all_parquets(base_dir): +# """Read every .parquet under subdirs into one wide DataFrame, then drop all‐NA columns.""" +# dfs = [] +# for sub in os.listdir(base_dir): +# path = os.path.join(base_dir, sub) +# if not os.path.isdir(path): +# continue +# for fn in glob.glob(os.path.join(path, "*.parquet")): +# df = pd.read_parquet(fn) +# dfs.append(df) +# if not dfs: +# return pd.DataFrame() +# full = pd.concat(dfs, axis=1) +# # --- NEW: remove any column that’s entirely NA --- +# full = full.dropna(axis=1, how='all') +# return full + +# def bucket_features(cols, buckets): +# """Assign each column to the first matching bucket.""" +# import re +# result = {k: [] for k in buckets} +# leftovers = [] +# for col in cols: +# placed = False +# for name, patterns in buckets.items(): +# if any(re.match(pat, col) for pat in patterns): +# result[name].append(col) +# placed = True +# break +# if not placed: +# leftovers.append(col) +# if leftovers: +# result["Other"] = leftovers +# return result + +# def generate_report(df): +# # --- NEW: again ensure we dropped fully‐missing columns in case of downstream mutations --- +# df = df.dropna(axis=1, how='all') + +# now = pd.Timestamp.utcnow().isoformat() +# rows, cols = df.shape +# mem_mb = df.memory_usage(deep=True).sum() / 1e6 + +# # time range +# if "interval_timestamp" in df.columns: +# ts = pd.to_datetime(df["interval_timestamp"]) +# time_range = {"start": str(ts.min()), "end": str(ts.max())} +# else: +# time_range = {} + +# # feature categories +# feat_cats = bucket_features(df.columns.tolist(), FEATURE_CATEGORIES) + +# # --- NEW: de-dupe each feature list while preserving order --- +# for cat, flist in feat_cats.items(): +# feat_cats[cat] = list(dict.fromkeys(flist)) + +# feat_summary = { +# cat: {"count": len(cols), "features": cols} +# for cat, cols in feat_cats.items() +# } + +# # data quality +# missing = df.isna().sum().to_dict() +# total_cells = rows * cols +# total_missing = sum(missing.values()) +# completeness = 1 - total_missing / total_cells + +# report = { +# "timestamp": now, +# "dataset_info": { +# "shape": [rows, cols], +# "memory_usage_mb": mem_mb, +# "time_range": time_range +# }, +# "feature_categories": feat_summary, +# "data_quality": { +# "completeness": completeness, +# "avg_quality_score": completeness +# }, +# "feature_importance": {}, # fill in if you run a model +# "configuration": { +# "source": "Finnhub Free Plan", +# "generated_by": "generate_finnhub_report.py" +# } +# } +# return report + +# def main(): +# df = load_all_parquets(BASE_DIR) +# report = generate_report(df) +# with open(OUTPUT_FILE, "w") as f: +# json.dump(report, f, indent=2) +# print(f"Wrote report to {OUTPUT_FILE}") + +# if __name__ == "__main__": +# main() +# main() diff --git a/src/fetchers/finviz_sentiment/app.py b/src/fetchers/finviz_sentiment/app.py new file mode 100644 index 0000000000000000000000000000000000000000..9faf5ae65ddb8d5edeafa59d552f6422561def83 --- /dev/null +++ b/src/fetchers/finviz_sentiment/app.py @@ -0,0 +1,136 @@ +from urllib.request import urlopen, Request +from urllib.error import HTTPError +from bs4 import BeautifulSoup +import pandas as pd +import datetime +from dateutil import parser +from pathlib import Path +import sys +import os +import nltk + +# 
Ensure VADER lexicon is available in a writable location +try: + from src import config as app_config + _nltk_dir = os.path.join(app_config.DATA_DIR, 'nltk_data') +except Exception: + _nltk_dir = os.path.join(os.environ.get('DATA_DIR', '/data'), 'nltk_data') +os.makedirs(_nltk_dir, exist_ok=True) +if _nltk_dir not in nltk.data.path: + nltk.data.path.insert(0, _nltk_dir) +try: + nltk.data.find('vader_lexicon') +except LookupError: + nltk.download('vader_lexicon', download_dir=_nltk_dir) +from nltk.sentiment.vader import SentimentIntensityAnalyzer + +class StockSentimentAnalyzer: + def __init__(self): + self.stock_url = 'https://finviz.com/quote.ashx?t=' + self.crypto_url = 'https://finviz.com/crypto_charts.ashx?t=' + self.headers = { + 'User-Agent': ( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/115.0.0.0 Safari/537.36' + ) + } + self.vader = SentimentIntensityAnalyzer() + + def get_news(self, ticker): + """Fetch the Finviz news table for a ticker, falling back to crypto endpoint.""" + ticker = ticker.upper() + # Try stock quotes endpoint first + try: + req = Request(self.stock_url + ticker, headers=self.headers) + resp = urlopen(req) + except HTTPError as e: + # On 404 (no stock page), retry crypto endpoint with USD suffix + if e.code == 404: + if not ticker.endswith('USD'): + ticker += 'USD' + req = Request(self.crypto_url + ticker, headers=self.headers) + resp = urlopen(req) + else: + raise + html = resp.read() + soup = BeautifulSoup(html, 'lxml') + return soup.find(id='news-table') + + def parse_news(self, news_table): + """Parse rows into DataFrame of date, time, headline.""" + today = datetime.datetime.today().strftime('%b-%d-%y') + rows = [] + for tr in news_table.find_all('tr'): + try: + text = tr.a.get_text() + parts = tr.td.text.split() + if len(parts) == 1: + date_str, time_str = today, parts[0] + else: + date_str, time_str = parts + if date_str.lower() == 'today': + date_str = today + rows.append([date_str, time_str, text]) + except: + continue + df = pd.DataFrame(rows, columns=['date','time','headline']) + if not df.empty: + df['datetime'] = df.apply( + lambda r: self._parse_datetime(r['date'], r['time']), axis=1 + ) + df = df.dropna(subset=['datetime']) + return df + + def _parse_datetime(self, date_str, time_str): + try: + return parser.parse(f"{date_str} {time_str}") + except: + return None + + def score_news(self, df): + """Attach VADER sentiment_score to each headline.""" + if df.empty: + return df + scores = df['headline'].apply(self.vader.polarity_scores).tolist() + scores_df = pd.DataFrame(scores) + out = df.join(scores_df).set_index('datetime') + return out.drop(['date','time'], axis=1).rename(columns={'compound':'sentiment_score'}) + + def get_sentiment_data(self, ticker): + try: + table = self.get_news(ticker) + if table is None: + return None, f"No news table for '{ticker}'" + parsed = self.parse_news(table) + if parsed.empty: + return None, f"No articles for '{ticker}'" + scored = self.score_news(parsed) + if scored.empty: + return None, f"Sentiment scoring failed for '{ticker}'" + return scored, "Success" + except Exception as e: + return None, f"Error occurred: {e}" + +def main(): + tickers = sys.argv[1:] or ["AAPL","TSLA","GOOGL","NVDA","MSFT","BTC","SOL","XRP","ETH","ADA", "COIN"] + analyzer = StockSentimentAnalyzer() + + # Get project root directory (3 levels up from this file) + project_root = Path(__file__).parent.parent.parent.parent + out_dir = project_root / "data" / "finviz" / "sentiment" + 
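# Editor annotation (not patch content): each <TICKER>_sentiment.parquet written below
# contains the parsed 'datetime' and 'headline' columns plus VADER's 'neg', 'neu' and
# 'pos' scores, with 'compound' renamed to 'sentiment_score' (see score_news above).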
out_dir.mkdir(parents=True, exist_ok=True) + + for t in tickers: + df, status = analyzer.get_sentiment_data(t) + if df is not None: + path = out_dir / f"{t.upper()}_sentiment.parquet" + # Ensure 'datetime' is a column before saving + df_reset = df.reset_index() if df.index.name == 'datetime' else df + df_reset.to_parquet(path) + print(f"Saved sentiment data for {t} to {path}") + else: + print(f"Error for {t}: {status}") + +if __name__ == "__main__": + main() diff --git a/src/fetchers/main.py b/src/fetchers/main.py new file mode 100644 index 0000000000000000000000000000000000000000..41af443d666b6d7870bd40a0578d38e1c55c1417 --- /dev/null +++ b/src/fetchers/main.py @@ -0,0 +1,159 @@ +# Entrypoint for all fetchers + +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) # project root +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) # src +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "fetchers"))) # fetchers +import asyncio + +def run_advisorai_data(): + # print("[DEBUG] sys.path:") + # for p in sys.path: + # print(" ", p) + try: + from advisorai_data.advisorai_data_fetcher import main as advisorai_data_main + advisorai_data_main() + except ModuleNotFoundError as e: + print("[WARN] advisorai_data import failed, trying importlib fallback...") + import importlib.util + fetcher_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "advisorai_data", "advisorai_data_fetcher.py")) + spec = importlib.util.spec_from_file_location("advisorai_data_fetcher", fetcher_path) + advisorai_data_fetcher = importlib.util.module_from_spec(spec) + spec.loader.exec_module(advisorai_data_fetcher) + advisorai_data_fetcher.main() + +# def run_crawl_news(): +# import importlib.util +# import os +# import asyncio +# crawl_news_path = os.path.join(os.path.dirname(__file__), "crawl4ai", "crawl_news.py") +# spec = importlib.util.spec_from_file_location("crawl_news", crawl_news_path) +# crawl_news = importlib.util.module_from_spec(spec) +# spec.loader.exec_module(crawl_news) +# asyncio.run(crawl_news.main()) + +def run_crypto_bubbles(): + from crypto_bubbles.fetch_crypto_bubbles import main as crypto_bubbles_main + crypto_bubbles_main() + +def run_finnhub(): + # Use the installed finnhub package, not the local finnhub module + import importlib.util + import os + finnhub_main_path = os.path.join(os.path.dirname(__file__), "finnhub", "main.py") + spec = importlib.util.spec_from_file_location("finnhub_main", finnhub_main_path) + finnhub_main = importlib.util.module_from_spec(spec) + spec.loader.exec_module(finnhub_main) + finnhub_main.main() + +def run_alpaca_features(): + import importlib + import os + alpaca_path = os.path.join(os.path.dirname(__file__), "alpaca_api", "main.py") + spec = importlib.util.spec_from_file_location("alpaca_features", alpaca_path) + alpaca_features = importlib.util.module_from_spec(spec) + spec.loader.exec_module(alpaca_features) + alpaca_features.main() + +def run_marketaux_news(): + import importlib.util + import os + marketaux_news_path = os.path.join(os.path.dirname(__file__), "marketaux", "news.py") + spec = importlib.util.spec_from_file_location("marketaux_news", marketaux_news_path) + marketaux_news = importlib.util.module_from_spec(spec) + spec.loader.exec_module(marketaux_news) + marketaux_news.main() + +def run_finviz_sentiment(): + import importlib.util + finviz_path = os.path.join(os.path.dirname(__file__), "finviz_sentiment", "app.py") + spec = 
importlib.util.spec_from_file_location("finviz_sentiment_app", finviz_path) + finviz_sentiment_app = importlib.util.module_from_spec(spec) + spec.loader.exec_module(finviz_sentiment_app) + finviz_sentiment_app.main() + +def run_santiment(): + """Run Santiment with frequency control to preserve API limits""" + # Import frequency controller + import sys + import os + controller_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "santiment_frequency_controller.py")) + sys.path.insert(0, os.path.dirname(controller_path)) + + try: + from santiment_frequency_controller import SantimentFrequencyController + + # Check if Santiment should run + controller = SantimentFrequencyController() + + if not controller.should_run_santiment(max_runs_per_day=2): + print("[SANTIMENT] Skipping run due to frequency control") + status = controller.get_status() + print(f"[SANTIMENT] Runs today: {status['runs_today']}/2") + print(f"[SANTIMENT] Last run: {status['last_run']}") + return + + print("[SANTIMENT] Frequency control allows run - proceeding...") + + # Run Santiment + import importlib.util + santiment_path = os.path.join(os.path.dirname(__file__), "santiment", "main.py") + spec = importlib.util.spec_from_file_location("santiment_main", santiment_path) + santiment_main = importlib.util.module_from_spec(spec) + spec.loader.exec_module(santiment_main) + santiment_main.main() + + # Record the run + controller.record_run() + print("[SANTIMENT] Run completed and recorded") + + except Exception as e: + print(f"[SANTIMENT] Error in frequency control: {e}") + print("[SANTIMENT] Falling back to direct run...") + # Fallback to direct run if frequency control fails + import importlib.util + santiment_path = os.path.join(os.path.dirname(__file__), "santiment", "main.py") + spec = importlib.util.spec_from_file_location("santiment_main", santiment_path) + santiment_main = importlib.util.module_from_spec(spec) + spec.loader.exec_module(santiment_main) + santiment_main.main() + +def run_all(): + run_advisorai_data() + # run_crawl_news() + run_crypto_bubbles() + run_finnhub() + run_alpaca_features() + run_marketaux_news() + run_finviz_sentiment() + run_santiment() + print("[OK] All fetchers completed successfully.") + + +def main(): + # Simple CLI: python main.py [advisorai|crawl_news|crypto_bubbles|finnhub|alpaca_features|marketaux_news|finviz_sentiment|santiment|all] [TICKERS...] 
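# Editor annotation (not patch content): illustrative invocations, assuming the
# project root as the working directory and dependencies installed:
#
#     python src/fetchers/main.py               # no argument or "all": run every fetcher
#     python src/fetchers/main.py finnhub       # run a single fetcher
#     python src/fetchers/main.py santiment     # frequency-controlled Santiment run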
+ if len(sys.argv) < 2 or sys.argv[1] == "all": + run_all() + elif sys.argv[1] == "advisorai": + run_advisorai_data() + # elif sys.argv[1] == "crawl_news": + # run_crawl_news() + elif sys.argv[1] == "crypto_bubbles": + run_crypto_bubbles() + elif sys.argv[1] == "finnhub": + run_finnhub() + elif sys.argv[1] == "alpaca_features": + run_alpaca_features() + elif sys.argv[1] == "marketaux_news": + run_marketaux_news() + elif sys.argv[1] == "finviz_sentiment": + run_finviz_sentiment() + elif sys.argv[1] == "santiment": + run_santiment() + else: + print("Usage: python main.py [advisorai|crawl_news|crypto_bubbles|finnhub|alpaca_features|marketaux_news|finviz_sentiment|santiment|all] [TICKERS...]") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/src/fetchers/marketaux/news.py b/src/fetchers/marketaux/news.py new file mode 100644 index 0000000000000000000000000000000000000000..5507bc2ef75ea6a3a3c244c7da73a59b4e25a3c3 --- /dev/null +++ b/src/fetchers/marketaux/news.py @@ -0,0 +1,386 @@ +import os +import time +import logging +import random +from typing import Any, Dict, List, Optional + +import requests +from dotenv import load_dotenv + + +class MarketauxEndpointRestricted(RuntimeError): + """Raised when the requested API endpoint is not available for the current subscription.""" + + +class MarketauxClient: + """ + Client for interacting with the Marketaux API. + + Key features + ------------ + • Graceful handling of rate limits (HTTP 429) with exponential back‑off. + • API key switching for better rate limit management + • Friendly error when an endpoint is restricted by the subscription plan (HTTP 403 + with `endpoint_access_restricted` code). + • Transparent pagination through `_fetch_all`. + """ + + BASE_URL = "https://api.marketaux.com/v1/" + MAX_RETRIES = 3 + MAX_RETRIES_EXHAUSTED = 1 # Lower retry count when all keys are exhausted + BACKOFF_FACTOR = 2 + + def __init__(self, api_token: Optional[str] = None, logger: Optional[logging.Logger] = None): + load_dotenv() # load MARKETAUX_API_TOKEN from .env if present + + # Set up API key switching + self.api_keys = self._load_api_keys(api_token) + self.current_key_index = 0 + self.exhausted_keys = set() + + if not self.api_keys: + raise ValueError("Marketaux API token(s) must be provided or set as MARKETAUX_API_TOKEN") + + self.logger = logger or logging.getLogger(self.__class__.__name__) + self.logger.info(f"Initialized MarketauxClient with {len(self.api_keys)} API key(s)") + + # Validate API key diversity + self._validate_api_key_diversity() + + def _load_api_keys(self, provided_token: Optional[str] = None) -> List[str]: + """Load and validate API keys from environment or provided token.""" + api_keys = [] + + if provided_token: + api_keys.append(provided_token) + + # Try to load from environment variables + env_token = os.getenv("MARKETAUX_API_TOKEN") + if env_token: + # Support comma-separated multiple keys + env_keys = [key.strip() for key in env_token.split(',') if key.strip()] + api_keys.extend(env_keys) + + # Try numbered environment variables + for i in range(1, 10): # Support up to 9 keys + key = os.getenv(f"MARKETAUX_API_TOKEN_{i}") + if key: + api_keys.append(key.strip()) + + # Remove duplicates while preserving order + seen = set() + unique_keys = [] + for key in api_keys: + if key not in seen: + seen.add(key) + unique_keys.append(key) + + return unique_keys + + def _get_current_api_key(self) -> str: + """Get the current API key, switching if necessary.""" + if self.current_key_index >= len(self.api_keys): + 
self.current_key_index = 0 + + return self.api_keys[self.current_key_index] + + def _are_all_keys_exhausted(self) -> bool: + """Check if all API keys have been exhausted.""" + return len(self.exhausted_keys) >= len(self.api_keys) + + def _switch_api_key(self) -> bool: + """Switch to the next available API key.""" + current_key = self._get_current_api_key() + self.exhausted_keys.add(current_key) + + # Find next non-exhausted key + for i in range(len(self.api_keys)): + next_index = (self.current_key_index + 1 + i) % len(self.api_keys) + next_key = self.api_keys[next_index] + + if next_key not in self.exhausted_keys: + self.current_key_index = next_index + self.logger.info(f"Switched to API key #{next_index + 1}") + return True + + # All keys are exhausted + self.logger.error("All API keys have been exhausted") + return False + + def _validate_api_key_diversity(self) -> bool: + """Validate that we have diverse API keys from different accounts.""" + if len(self.api_keys) < 2: + return True # Single key is always valid + + # Check for diversity by comparing key prefixes/suffixes + prefixes = set() + suffixes = set() + + for key in self.api_keys: + if len(key) >= 8: + prefixes.add(key[:4]) + suffixes.add(key[-4:]) + + diversity_score = len(prefixes) + len(suffixes) + total_possible = len(self.api_keys) * 2 + + if diversity_score < total_possible * 0.6: # Less than 60% diversity + self.logger.warning("API keys may be from the same account - limited rate limit benefits") + + return True + + # ------------------------------------------------------------ + # Low‑level HTTP request + # ------------------------------------------------------------ + def _request(self, endpoint: str, params: Dict[str, Any]) -> Dict[str, Any]: + url = f"{self.BASE_URL}{endpoint}" + + # Add current API key to params + params = params.copy() + params["api_token"] = self._get_current_api_key() + + # Determine retry limit based on whether keys are exhausted + max_retries = self.MAX_RETRIES_EXHAUSTED if self._are_all_keys_exhausted() else self.MAX_RETRIES + + for attempt in range(1, max_retries + 1): + response = requests.get(url, params=params, timeout=30) + + # 429 – Rate limit or 402 – Usage limit reached + if response.status_code in [429, 402]: + # Try switching API key first + if self._switch_api_key(): + params["api_token"] = self._get_current_api_key() + error_type = "Rate limit" if response.status_code == 429 else "Usage limit" + self.logger.warning(f"{error_type} hit, switched API key and retrying immediately (attempt {attempt}/{max_retries})") + continue + + # If no more keys available, handle gracefully + error_type = "Rate limit" if response.status_code == 429 else "Usage limit" + if attempt >= max_retries: + # On final attempt, log and skip gracefully instead of crashing + self.logger.warning(f"{error_type} hit, all keys exhausted. Skipping request to avoid crash.") + return {"data": []} # Return empty result instead of crashing + + # Wait before retry + reset_header = response.headers.get("X-Api-Ratelimit-Reset") + wait = 60 + if reset_header and reset_header.isdigit(): + wait = max(5, int(reset_header) - int(time.time())) + self.logger.warning(f"{error_type} hit, all keys exhausted. 
Waiting %s s before retrying (%s/%s)…", wait, attempt, max_retries) + time.sleep(wait * self.BACKOFF_FACTOR ** (attempt - 1)) + continue + + # 403 – Endpoint not available on plan or potentially invalid key + if response.status_code == 403: + try: + payload = response.json() + error_code = payload.get("error", {}).get("code") + + if error_code == "endpoint_access_restricted": + raise MarketauxEndpointRestricted(payload["error"]["message"]) + elif error_code in ["invalid_api_token", "unauthorized"]: + # Try switching API key + if self._switch_api_key(): + params["api_token"] = self._get_current_api_key() + self.logger.warning(f"Invalid API key detected, switched and retrying (attempt {attempt}/{self.MAX_RETRIES})") + continue + else: + raise RuntimeError("All API keys are invalid or exhausted") + except ValueError: + pass # fall through to generic error handler + + # Other errors + if not response.ok: + raise RuntimeError(f"Marketaux API error: {response.status_code} — {response.text}") + + # Success + return response.json() + + # If we exhausted retries, return empty result instead of crashing + if self._are_all_keys_exhausted(): + self.logger.warning("All API keys exhausted, returning empty result to prevent pipeline crash") + return {"data": []} + else: + raise RuntimeError("Exceeded maximum retries for Marketaux API") + + # ------------------------------------------------------------ + # Pagination helper + # ------------------------------------------------------------ + def _fetch_all(self, endpoint: str, params: Optional[Dict[str, Any]] = None, *, paginate: bool = True) -> List[Dict[str, Any]]: + params = params.copy() if params else {} + # Don't add api_token here - it's added in _request method + + all_data: List[Dict[str, Any]] = [] + page = 1 + + while True: + params["page"] = page + result = self._request(endpoint, params) + all_data.extend(result.get("data", [])) + + if not paginate: + break + + meta = result.get("meta", {}) + returned = meta.get("returned", 0) + limit = meta.get("limit", 0) + found = meta.get("found", 0) + + if returned < limit or len(all_data) >= found: + break + page += 1 + + return all_data + + # ------------------------------------------------------------ + # High‑level convenience methods + # ------------------------------------------------------------ + def fetch_news_all( + self, + *, + symbols: Optional[List[str]] = None, + limit: int = 20, + must_have_entities: bool = True, + published_after: Optional[str] = None, + published_before: Optional[str] = None, + language: Optional[str] = None, + sort: str = "published_at.desc", + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch news articles matching the supplied filters.""" + params: Dict[str, Any] = {"limit": limit, "sort": sort} + if symbols: + params["symbols"] = ",".join(symbols) + if must_have_entities: + params["must_have_entities"] = "true" + if published_after: + params["published_after"] = published_after + if published_before: + params["published_before"] = published_before + if language: + params["language"] = language + return self._fetch_all("news/all", params, paginate=paginate) + + def fetch_entity_stats_aggregation( + self, + *, + symbols: Optional[List[str]] = None, + interval: str = "day", + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch aggregated entity stats (daily/hourly).""" + params: Dict[str, Any] = {"interval": interval, "limit": limit} + if symbols: + 
params["symbols"] = ",".join(symbols) + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("entity/stats/aggregation", params, paginate=paginate) + + def fetch_entity_stats_intraday( + self, + *, + symbols: Optional[List[str]] = None, + interval: str = "minute", + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch intraday entity stats (minute/5‑minute granularity).""" + params: Dict[str, Any] = {"interval": interval, "limit": limit} + if symbols: + params["symbols"] = ",".join(symbols) + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("entity/stats/intraday", params, paginate=paginate) + + def fetch_trending_aggregation( + self, + *, + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch aggregated trending data.""" + params: Dict[str, Any] = {"limit": limit} + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("trending/aggregation", params, paginate=paginate) + + def fetch_trending_intraday( + self, + *, + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch intraday trending data.""" + params: Dict[str, Any] = {"limit": limit} + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("trending/intraday", params, paginate=paginate) + +def main(): + import sys + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) + import pprint + + client = MarketauxClient() + pp = pprint.PrettyPrinter(depth=2) + + news = client.fetch_news_all(symbols=["AAPL", "MSFT","NVDA", "GOOGL", "TSLA"], limit=20, paginate=False) + print(f"Fetched {len(news)} news articles from Marketaux.") + + # Save to Parquet file + import pandas as pd + out_dir = os.path.join("data", "marketaux", "news") + os.makedirs(out_dir, exist_ok=True) + out_path = os.path.join(out_dir, "news_latest.parquet") + df = pd.DataFrame(news) + df.to_parquet(out_path, index=False) + print(f"Saved news articles to {out_path}") + + # Upload all files in data/marketaux/news to S3 under news/marketaux_news/ + from data_cloud.cloud_utils import StorageHandler + from dotenv import load_dotenv + load_dotenv() + FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT") + FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY") + FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY") + FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET") + + storage = StorageHandler( + endpoint_url=FILEBASE_ENDPOINT, + access_key=FILEBASE_ACCESS_KEY, + secret_key=FILEBASE_SECRET_KEY, + bucket_name=FILEBASE_BUCKET, + local_base="data" + ) + + local_news_dir = os.path.join("data", "marketaux", "news") + s3_news_prefix = "news/marketaux_news/" + for root, _, files in os.walk(local_news_dir): + for fname in files: + local_path = os.path.join(root, fname) + rel_path = os.path.relpath(local_path, local_news_dir) + s3_key = s3_news_prefix + rel_path.replace("\\", "/") + with open(local_path, "rb") as f: + file_bytes = f.read() + storage.upload(s3_key, file_bytes, content_type="application/octet-stream") + print(f"[OK] Uploaded {local_path} -> S3:{s3_key}") + +if __name__ == "__main__": + main() diff --git 
a/src/fetchers/marketaux/news_original.py b/src/fetchers/marketaux/news_original.py new file mode 100644 index 0000000000000000000000000000000000000000..5b0f48f912c0eaea0d213ca484f860fe7800b17c --- /dev/null +++ b/src/fetchers/marketaux/news_original.py @@ -0,0 +1,253 @@ +import os +import time +import logging +from typing import Any, Dict, List, Optional + +import requests +from dotenv import load_dotenv + + +class MarketauxEndpointRestricted(RuntimeError): + """Raised when the requested API endpoint is not available for the current subscription.""" + + +class MarketauxClient: + """ + Client for interacting with the Marketaux API. + + Key features + ------------ + • Graceful handling of rate limits (HTTP 429) with exponential back‑off. + • Friendly error when an endpoint is restricted by the subscription plan (HTTP 403 + with `endpoint_access_restricted` code). + • Transparent pagination through `_fetch_all`. + """ + + BASE_URL = "https://api.marketaux.com/v1/" + MAX_RETRIES = 3 + BACKOFF_FACTOR = 2 + + def __init__(self, api_token: Optional[str] = None, logger: Optional[logging.Logger] = None): + load_dotenv() # load MARKETAUX_API_TOKEN from .env if present + self.api_token = api_token or os.getenv("MARKETAUX_API_TOKEN") + if not self.api_token: + raise ValueError("Marketaux API token must be provided or set as MARKETAUX_API_TOKEN") + self.logger = logger or logging.getLogger(self.__class__.__name__) + + # ------------------------------------------------------------ + # Low‑level HTTP request + # ------------------------------------------------------------ + def _request(self, endpoint: str, params: Dict[str, Any]) -> Dict[str, Any]: + url = f"{self.BASE_URL}{endpoint}" + for attempt in range(1, self.MAX_RETRIES + 1): + response = requests.get(url, params=params, timeout=30) + + # 429 – Rate limit + if response.status_code == 429: + reset_header = response.headers.get("X-Api-Ratelimit-Reset") + wait = 60 + if reset_header and reset_header.isdigit(): + wait = max(5, int(reset_header) - int(time.time())) + self.logger.warning("Rate‑limit hit. 
Waiting %s s before retrying (%s/%s)…", wait, attempt, self.MAX_RETRIES) + time.sleep(wait * self.BACKOFF_FACTOR ** (attempt - 1)) + continue + + # 403 – Endpoint not available on plan + if response.status_code == 403: + try: + payload = response.json() + if payload.get("error", {}).get("code") == "endpoint_access_restricted": + raise MarketauxEndpointRestricted(payload["error"]["message"]) + except ValueError: + pass # fall through to generic error handler + + # Other errors + if not response.ok: + raise RuntimeError(f"Marketaux API error: {response.status_code} — {response.text}") + + # Success + return response.json() + + # If we exhausted retries + raise RuntimeError("Exceeded maximum retries for Marketaux API") + + # ------------------------------------------------------------ + # Pagination helper + # ------------------------------------------------------------ + def _fetch_all(self, endpoint: str, params: Optional[Dict[str, Any]] = None, *, paginate: bool = True) -> List[Dict[str, Any]]: + params = params.copy() if params else {} + params["api_token"] = self.api_token + + all_data: List[Dict[str, Any]] = [] + page = 1 + + while True: + params["page"] = page + result = self._request(endpoint, params) + all_data.extend(result.get("data", [])) + + if not paginate: + break + + meta = result.get("meta", {}) + returned = meta.get("returned", 0) + limit = meta.get("limit", 0) + found = meta.get("found", 0) + + if returned < limit or len(all_data) >= found: + break + page += 1 + + return all_data + + # ------------------------------------------------------------ + # High‑level convenience methods + # ------------------------------------------------------------ + def fetch_news_all( + self, + *, + symbols: Optional[List[str]] = None, + limit: int = 20, + must_have_entities: bool = True, + published_after: Optional[str] = None, + published_before: Optional[str] = None, + language: Optional[str] = None, + sort: str = "published_at.desc", + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch news articles matching the supplied filters.""" + params: Dict[str, Any] = {"limit": limit, "sort": sort} + if symbols: + params["symbols"] = ",".join(symbols) + if must_have_entities: + params["must_have_entities"] = "true" + if published_after: + params["published_after"] = published_after + if published_before: + params["published_before"] = published_before + if language: + params["language"] = language + return self._fetch_all("news/all", params, paginate=paginate) + + def fetch_entity_stats_aggregation( + self, + *, + symbols: Optional[List[str]] = None, + interval: str = "day", + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch aggregated entity stats (daily/hourly).""" + params: Dict[str, Any] = {"interval": interval, "limit": limit} + if symbols: + params["symbols"] = ",".join(symbols) + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("entity/stats/aggregation", params, paginate=paginate) + + def fetch_entity_stats_intraday( + self, + *, + symbols: Optional[List[str]] = None, + interval: str = "minute", + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch intraday entity stats (minute/5‑minute granularity).""" + params: Dict[str, Any] = {"interval": interval, "limit": limit} + if symbols: + params["symbols"] = 
",".join(symbols) + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("entity/stats/intraday", params, paginate=paginate) + + def fetch_trending_aggregation( + self, + *, + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch aggregated trending data.""" + params: Dict[str, Any] = {"limit": limit} + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("trending/aggregation", params, paginate=paginate) + + def fetch_trending_intraday( + self, + *, + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch intraday trending data.""" + params: Dict[str, Any] = {"limit": limit} + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("trending/intraday", params, paginate=paginate) + +def main(): + import sys + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) + import pprint + + client = MarketauxClient() + pp = pprint.PrettyPrinter(depth=2) + + news = client.fetch_news_all(symbols=["AAPL", "MSFT","NVDA", "GOOGL", "TSLA"], limit=20, paginate=False) + print(f"Fetched {len(news)} news articles from Marketaux.") + + # Save to Parquet file + import pandas as pd + out_dir = os.path.join("data", "marketaux", "news") + os.makedirs(out_dir, exist_ok=True) + out_path = os.path.join(out_dir, "news_latest.parquet") + df = pd.DataFrame(news) + df.to_parquet(out_path, index=False) + print(f"Saved news articles to {out_path}") + + # Upload all files in data/marketaux/news to S3 under news/marketaux_news/ + from data_cloud.cloud_utils import StorageHandler + from dotenv import load_dotenv + load_dotenv() + FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT") + FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY") + FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY") + FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET") + + storage = StorageHandler( + endpoint_url=FILEBASE_ENDPOINT, + access_key=FILEBASE_ACCESS_KEY, + secret_key=FILEBASE_SECRET_KEY, + bucket_name=FILEBASE_BUCKET, + local_base="data" + ) + + local_news_dir = os.path.join("data", "marketaux", "news") + s3_news_prefix = "news/marketaux_news/" + for root, _, files in os.walk(local_news_dir): + for fname in files: + local_path = os.path.join(root, fname) + rel_path = os.path.relpath(local_path, local_news_dir) + s3_key = s3_news_prefix + rel_path.replace("\\", "/") + with open(local_path, "rb") as f: + file_bytes = f.read() + storage.upload(s3_key, file_bytes, content_type="application/octet-stream") + print(f"[OK] Uploaded {local_path} -> S3:{s3_key}") +if __name__ == "__main__": + main() diff --git a/src/fetchers/santiment/main.py b/src/fetchers/santiment/main.py new file mode 100644 index 0000000000000000000000000000000000000000..12bb0f1e1075029185e3eb15464b7a88336e9d7f --- /dev/null +++ b/src/fetchers/santiment/main.py @@ -0,0 +1,1871 @@ +""" +Comprehensive Santiment Data Fetcher +==================================== + +This module provides a complete data fetcher for the Santiment API using the sanpy library. +It maximizes data retrieval by organizing metrics into categories and providing batch operations. 
+ +Features: +- Fetches all available metrics organized by category +- Supports batch operations for efficient API usage +- Handles rate limiting and error management +- Provides data export capabilities +- Supports both single asset and multi-asset queries +- Includes SQL query execution for custom data needs + +Author: AI Assistant +Version: 1.0.0 +""" + +import san +import pandas as pd +import numpy as np +import time +import logging +from datetime import datetime, timedelta +from typing import List, Dict, Optional, Union, Any +import json +import os +from dataclasses import dataclass, field +from concurrent.futures import ThreadPoolExecutor, as_completed + +# Load environment variables +try: + from dotenv import load_dotenv + load_dotenv() +except ImportError: + pass # dotenv not available, continue without it +import warnings + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Resolve data directory base +try: + from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR + except Exception: + CFG_DATA_DIR = "/data" + +from pathlib import Path + +def _resolve_under_data(path_like: str | os.PathLike) -> str: + p = Path(path_like) + if p.is_absolute(): + return str(p) + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return str(Path(CFG_DATA_DIR) / rel) + +@dataclass +class FetchConfig: + """Configuration class for data fetching parameters - OPTIMIZED FOR API CONSERVATION""" + from_date: str = "2024-01-01" # Reduced from 2020 to save API calls + to_date: str = "utc_now" + interval: str = "1d" + include_incomplete_data: bool = False + batch_size: int = 25 # Reduced from 50 to save API calls + max_workers: int = 5 # Reduced from 10 to save API calls + rate_limit_delay: int = 60 + export_format: str = "parquet" # csv, json, parquet + export_directory: str = "data/santiment" + +class SantimentDataFetcher: + """ + Comprehensive Santiment Data Fetcher + + This class provides methods to fetch maximum possible data from Santiment API + using the sanpy library with efficient batch operations and error handling. 
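+
+    Illustrative usage (a minimal sketch, assuming a valid SANTIMENT_API_KEY in the
+    environment; the dates and slugs below are placeholders):
+
+        config = FetchConfig(from_date="2024-06-01", interval="1d")
+        fetcher = SantimentDataFetcher(config=config)
+        data = fetcher.fetch_comprehensive_data(slugs=["bitcoin", "ethereum"])
+        exported = fetcher.export_data(combine_categories=True)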
+ """ + + def __init__(self, api_key: Optional[str] = None, config: Optional[FetchConfig] = None): + """ + Initialize the Santiment Data Fetcher + + Args: + api_key: Santiment API key(s) for accessing restricted data (comma-separated for multiple keys) + config: FetchConfig object with fetching parameters + """ + self.config = config or FetchConfig() + self._normalize_dates() + + # Set up multiple API keys + self._setup_api_keys(api_key) + + # Resolve export directory under DATA_DIR, create and clean up existing files + self.config.export_directory = _resolve_under_data(self.config.export_directory) + os.makedirs(self.config.export_directory, exist_ok=True) + self._cleanup_existing_files() + + # Initialize data storage + self.fetched_data: Dict[str, pd.DataFrame] = {} + self.failed_queries: List[Dict] = [] + + # Define comprehensive metric categories + self.metric_categories = self._define_metric_categories() + + # Get available metrics and projects + self._initialize_metadata() + + # Initialize symbol normalization + self.symbol_normalizer = self._setup_symbol_normalizer() + + def _setup_symbol_normalizer(self): + """ + Set up symbol normalization mapping for consistent asset identification + + Returns: + Dictionary mapping various symbol formats to canonical slugs + """ + # Canonical mapping for major crypto assets + # Maps various symbols/names to the official Santiment slug + symbol_mapping = { + # Bitcoin variants + 'bitcoin': 'bitcoin', + 'btc': 'bitcoin', + 'Bitcoin': 'bitcoin', + 'BTC': 'bitcoin', + + # Ethereum variants + 'ethereum': 'ethereum', + 'eth': 'ethereum', + 'Ethereum': 'ethereum', + 'ETH': 'ethereum', + + # Ripple/XRP variants + 'ripple': 'ripple', + 'xrp': 'ripple', + 'Ripple': 'ripple', + 'XRP': 'ripple', + + # Solana variants + 'solana': 'solana', + 'sol': 'solana', + 'Solana': 'solana', + 'SOL': 'solana', + + # Cardano variants + 'cardano': 'cardano', + 'ada': 'cardano', + 'Cardano': 'cardano', + 'ADA': 'cardano', + + # Polkadot variants + 'polkadot': 'polkadot', + 'dot': 'polkadot', + 'Polkadot': 'polkadot', + 'DOT': 'polkadot', + + # Chainlink variants + 'chainlink': 'chainlink', + 'link': 'chainlink', + 'Chainlink': 'chainlink', + 'LINK': 'chainlink', + + # Litecoin variants + 'litecoin': 'litecoin', + 'ltc': 'litecoin', + 'Litecoin': 'litecoin', + 'LTC': 'litecoin', + + # Bitcoin Cash variants + 'bitcoin-cash': 'bitcoin-cash', + 'bch': 'bitcoin-cash', + 'Bitcoin Cash': 'bitcoin-cash', + 'BCH': 'bitcoin-cash', + + # Stellar variants + 'stellar': 'stellar', + 'xlm': 'stellar', + 'Stellar': 'stellar', + 'XLM': 'stellar', + + # Ethereum Classic variants + 'ethereum-classic': 'ethereum-classic', + 'etc': 'ethereum-classic', + 'Ethereum Classic': 'ethereum-classic', + 'ETC': 'ethereum-classic', + + # EOS variants + 'eos': 'eos', + 'EOS': 'eos', + } + + logger.info(f"Initialized symbol normalizer with {len(symbol_mapping)} mappings") + return symbol_mapping + + def normalize_symbol(self, symbol: str) -> str: + """ + Normalize a symbol to its canonical Santiment slug + + Args: + symbol: Symbol to normalize + + Returns: + Canonical slug + """ + if symbol in self.symbol_normalizer: + canonical = self.symbol_normalizer[symbol] + if symbol != canonical: + logger.debug(f"Normalized '{symbol}' -> '{canonical}'") + return canonical + + # If not found in mapping, return as-is but log warning + logger.warning(f"Unknown symbol '{symbol}' not found in normalization mapping") + return symbol.lower() + + def get_symbol_alternatives(self, symbol: str) -> List[str]: + """ + Get all 
alternative symbols for a given symbol (both directions) + + Args: + symbol: Symbol to find alternatives for + + Returns: + List of alternative symbols including the original + """ + alternatives = [symbol] + + # Create reverse mapping to find alternatives + reverse_mapping = {} + for variant, canonical in self.symbol_normalizer.items(): + if canonical not in reverse_mapping: + reverse_mapping[canonical] = [] + reverse_mapping[canonical].append(variant) + + # If symbol is a canonical, get all its variants + if symbol in reverse_mapping: + alternatives.extend(reverse_mapping[symbol]) + + # If symbol is a variant, get the canonical and other variants + canonical = self.normalize_symbol(symbol) + if canonical in reverse_mapping: + alternatives.extend(reverse_mapping[canonical]) + + # Remove duplicates and return + return list(set(alternatives)) + + def fetch_single_metric_with_alternatives(self, metric: str, slug: str, **kwargs) -> Optional[pd.DataFrame]: + """ + Fetch a single metric for a single asset, trying alternative symbols if the primary fails + + Args: + metric: The metric name + slug: The asset slug (will try alternatives if this fails) + **kwargs: Additional parameters for the API call + + Returns: + DataFrame with the metric data or None if failed + """ + # Get all alternative symbols to try + alternatives = self.get_symbol_alternatives(slug) + logger.debug(f"Trying alternatives for {slug}: {alternatives}") + + # Try each alternative in order (start with the normalized canonical form) + canonical = self.normalize_symbol(slug) + if canonical != slug: + alternatives = [canonical] + [alt for alt in alternatives if alt != canonical] + + for i, alt_slug in enumerate(alternatives): + try: + data = self.fetch_single_metric(metric, alt_slug, **kwargs) + if data is not None and not data.empty: + if i > 0 or alt_slug != slug: # Successfully fetched with alternative + logger.info(f"[ALT_SUCCESS] {metric} for {slug} succeeded using alternative '{alt_slug}'") + # Update slug column to reflect the original requested slug for consistency + data['slug'] = slug + data['alternative_slug_used'] = alt_slug + return data + except Exception as e: + error_msg = str(e) + # Check if this is a metric-level error that won't be fixed by trying other slugs + if any(skip_phrase in error_msg.lower() for skip_phrase in [ + 'not supported for', + 'not implemented for', + 'outside the allowed interval', + 'upgrade to a higher tier' + ]): + logger.warning(f"[METRIC_SKIP] {metric} has fundamental issues, skipping all alternatives: {error_msg}") + break # Don't try other alternatives for this metric + + # If it's just a slug issue, continue trying alternatives + if 'is not an existing slug' in error_msg.lower(): + logger.debug(f"Alternative {alt_slug} failed for {metric}: {e}") + continue + else: + logger.debug(f"Alternative {alt_slug} failed for {metric}: {e}") + continue + + logger.warning(f"[ALT_FAILED] All alternatives failed for {metric} with slug {slug}") + return None + + def normalize_slug_list(self, slugs: List[str]) -> List[str]: + """ + Normalize a list of slugs and remove duplicates + + Args: + slugs: List of slugs to normalize + + Returns: + List of normalized, deduplicated slugs + """ + normalized = [] + seen = set() + + for slug in slugs: + canonical = self.normalize_symbol(slug) + if canonical not in seen: + normalized.append(canonical) + seen.add(canonical) + else: + logger.debug(f"Removed duplicate slug: {slug} (canonical: {canonical})") + + logger.info(f"Normalized {len(slugs)} slugs to 
{len(normalized)} unique canonical slugs") + return normalized + + def _normalize_dates(self): + """ + Convert relative date strings in self.config.from_date / to_date + into absolute YYYY-MM-DD dates that Sanpy can parse. + Supports: + - "ND" (e.g. "30d") → today minus N days + - "utc_now" → today + """ + now = datetime.utcnow() + # from_date: e.g. "30d" + fd = self.config.from_date.strip().lower() + if fd.endswith('d') and fd[:-1].isdigit(): + days = int(fd[:-1]) + from_dt = now - timedelta(days=days) + # Sanpy expects "YYYY-MM-DD" + self.config.from_date = from_dt.strftime('%Y-%m-%d') + + # to_date: sometimes set to "utc_now" + td = self.config.to_date.strip().lower() + if td == 'utc_now': + self.config.to_date = now.strftime('%Y-%m-%d') + + def _setup_api_keys(self, api_key: Optional[str] = None): + """ + Set up multiple API keys for rate limit handling + + Args: + api_key: API key(s) - can be comma-separated for multiple keys + """ + # Parse API keys from parameter or environment + api_key_string = api_key or os.getenv('SANTIMENT_API_KEY') + + if api_key_string: + # Support comma-separated API keys + self.api_keys = [key.strip() for key in api_key_string.split(',') if key.strip()] + logger.info(f"Santiment fetcher initialized with {len(self.api_keys)} API key(s)") + + # Check if all keys are from the same account + if len(self.api_keys) > 1: + logger.info("Multiple API keys detected. Testing key diversity...") + self._validate_api_key_diversity() + else: + self.api_keys = [] + logger.warning("No API key provided - limited to free tier data") + + # Initialize API key management + self.current_key_index = 0 + self.rate_limit_switches = 0 + + # Set initial API key + if self.api_keys: + self._set_current_api_key() + + def _validate_api_key_diversity(self): + """ + Validate that API keys are from different accounts for effective rate limit handling + """ + try: + user_ids = set() + functional_keys = 0 + rate_limited_keys = 0 + + for i, key in enumerate(self.api_keys[:3]): # Test only first 3 to avoid exhausting quota + # Temporarily set this key + san.ApiConfig.api_key = key + + try: + # Make a simple query to get user info + result = san.execute_sql(query="SELECT 1", set_index=None) + + # If successful, key is functional but we can't determine user ID without error + functional_keys += 1 + logger.info(f"API Key #{i+1}: {key[:8]}... appears functional") + + except Exception as e: + error_str = str(e) + if 'user with id' in error_str: + # Extract user ID from error message + import re + match = re.search(r'user with id (\d+)', error_str) + if match: + user_id = match.group(1) + user_ids.add(user_id) + rate_limited_keys += 1 + logger.info(f"API Key #{i+1}: {key[:8]}... belongs to user ID {user_id} (rate limited)") + else: + logger.debug(f"API Key #{i+1}: {key[:8]}... 
- {error_str}") + + # Reset to first key + self.current_key_index = 0 + self._set_current_api_key() + + # Analyze results + if rate_limited_keys > 0 and len(user_ids) == 1: + if functional_keys > 0: + logger.warning("⚠️ WARNING: Cannot determine if all API keys are from different accounts!") + logger.warning(f"⚠️ {rate_limited_keys} key(s) belong to user ID {list(user_ids)[0]}, {functional_keys} key(s) appear functional") + logger.warning("⚠️ If functional keys are from the same account, rate limit switching won't work.") + logger.warning("⚠️ For guaranteed effective rate limiting, use API keys from different Santiment accounts.") + logger.warning("⚠️ Create additional accounts at https://app.santiment.net/") + else: + logger.warning("⚠️ WARNING: All tested API keys belong to the same Santiment account!") + logger.warning("⚠️ Rate limits are applied per account, not per key.") + logger.warning("⚠️ API key switching will not be effective with same-account keys.") + logger.warning("⚠️ Create additional accounts at https://app.santiment.net/") + elif len(user_ids) > 1: + logger.info(f"✅ Good! API keys are from {len(user_ids)} different accounts.") + logger.info("✅ This will provide effective rate limit distribution.") + elif functional_keys == len(self.api_keys): + logger.info("✅ All API keys appear functional.") + logger.info("ℹ️ Cannot determine account diversity without rate limit errors.") + logger.info("ℹ️ Monitor rate limit switches during operation to verify effectiveness.") + + except Exception as e: + logger.debug(f"Could not validate API key diversity: {e}") + logger.info("API key diversity validation skipped - continuing with provided keys") + + def _set_current_api_key(self): + """Set the current API key in san.ApiConfig""" + if self.api_keys: + current_key = self.api_keys[self.current_key_index] + san.ApiConfig.api_key = current_key + logger.info(f"Using API key #{self.current_key_index + 1}: {current_key[:8]}...") + else: + san.ApiConfig.api_key = None + + def _switch_api_key(self): + """Switch to the next available API key""" + if len(self.api_keys) <= 1: + logger.warning("Only one or no API keys available, cannot switch") + return False + + old_index = self.current_key_index + self.current_key_index = (self.current_key_index + 1) % len(self.api_keys) + self.rate_limit_switches += 1 + + logger.info(f"[SWITCH] Switching from API key #{old_index + 1} to #{self.current_key_index + 1} (switch #{self.rate_limit_switches})") + + # Warn if switching too frequently (indicates same account issue) + if self.rate_limit_switches > len(self.api_keys) * 2: + logger.warning("⚠️ High number of API key switches detected!") + logger.warning("⚠️ This suggests all keys may be from the same account.") + logger.warning("⚠️ Consider using API keys from different Santiment accounts.") + + # Set new API key + self._set_current_api_key() + + # Add a delay after switching keys + time.sleep(2.0) + return True + + def _is_rate_limit_error(self, error_message): + """Check if the error indicates a rate limit issue""" + rate_limit_indicators = [ + "429", + "rate limit", + "too many requests", + "api limit", + "quota exceeded", + "limit exceeded", + "rate_limit_exception", + "API Rate Limit Reached", + "rate limit reached" + ] + error_str = str(error_message).lower() + return any(indicator in error_str for indicator in rate_limit_indicators) + + def _cleanup_existing_files(self): + """ + Clean up all existing files in the export directory before starting a new fetch. 
+ This prevents accumulation of old data files from previous runs. + """ + import glob + import shutil + + if not os.path.exists(self.config.export_directory): + return + + try: + # Get all files in the export directory + all_files = glob.glob(os.path.join(self.config.export_directory, "*")) + + if all_files: + logger.info(f"Cleaning up {len(all_files)} existing files in {self.config.export_directory}") + + for file_path in all_files: + try: + if os.path.isfile(file_path): + os.remove(file_path) + logger.debug(f"Removed file: {os.path.basename(file_path)}") + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + logger.debug(f"Removed directory: {os.path.basename(file_path)}") + except Exception as e: + logger.warning(f"Failed to remove {file_path}: {e}") + + logger.info(f"Successfully cleaned up export directory: {self.config.export_directory}") + else: + logger.info(f"Export directory is already clean: {self.config.export_directory}") + + except Exception as e: + logger.error(f"Failed to cleanup export directory {self.config.export_directory}: {e}") + # Don't raise the exception - just log it and continue + + def _define_metric_categories(self) -> Dict[str, List[str]]: + """Define REDUCED categories of Santiment metrics for API conservation.""" + return { + # Essential Financial Metrics Only + 'financial': [ + 'price_usd', 'marketcap_usd', 'volume_usd' + # Reduced from 12 to 3 most important metrics + ], + + # Core Network Activity + 'network_activity': [ + 'daily_active_addresses', 'new_addresses' + # Reduced from 9 to 2 most important metrics + ], + + # Basic Transaction Metrics + 'transactions': [ + 'transaction_count', 'transaction_volume_usd' + # Reduced from 8 to 2 most important metrics + ], + + # Essential Exchange Metrics + 'exchange': [ + 'exchange_inflow', 'exchange_outflow' + # Reduced from 8 to 2 most important metrics + ] + + # Removed: supply, development, social, derivatives, whales + # This reduces API calls by ~70% while keeping core metrics + } + + def _initialize_metadata(self): + """Initialize metadata about available metrics and projects""" + try: + logger.info("Fetching available metrics...") + self.available_metrics = san.available_metrics() + logger.info(f"Found {len(self.available_metrics)} available metrics") + + logger.info("Fetching available projects...") + self.projects_df = san.get("projects/all") + self.available_slugs = self.projects_df['slug'].tolist() + logger.info(f"Found {len(self.available_slugs)} available projects") + + except Exception as e: + logger.error(f"Failed to initialize metadata: {e}") + self.available_metrics = [] + self.available_slugs = [] + + def get_metric_metadata(self, metric: str) -> Dict[str, Any]: + """ + Get metadata for a specific metric + + Args: + metric: The metric name + + Returns: + Dictionary containing metric metadata + """ + try: + metadata = san.metadata( + metric, + arr=["availableSlugs", "defaultAggregation", "humanReadableName", + "isAccessible", "isRestricted", "restrictedFrom", "restrictedTo"] + ) + return metadata + except Exception as e: + logger.warning(f"Failed to get metadata for {metric}: {e}") + return {} + + def fetch_single_metric(self, metric: str, slug: str, **kwargs) -> Optional[pd.DataFrame]: + """ + Fetch a single metric for a single asset + + Args: + metric: The metric name + slug: The asset slug + **kwargs: Additional parameters for the API call + + Returns: + DataFrame with the metric data or None if failed + """ + max_retries = len(self.api_keys) if self.api_keys else 1 + keys_tried = 
set() + + for attempt in range(max_retries): + try: + # If we've tried all keys, reset and wait + if len(keys_tried) >= len(self.api_keys) and self.api_keys: + logger.warning(f"All {len(self.api_keys)} API keys exhausted for {metric}, waiting 30 seconds...") + time.sleep(30) + keys_tried.clear() + self.current_key_index = 0 + self._set_current_api_key() + + params = { + 'slug': slug, + 'from_date': kwargs.get('from_date', self.config.from_date), + 'to_date': kwargs.get('to_date', self.config.to_date), + 'interval': kwargs.get('interval', self.config.interval), + 'include_incomplete_data': kwargs.get('include_incomplete_data', self.config.include_incomplete_data) + } + + # Add any additional selector parameters + if 'selector' in kwargs: + params['selector'] = kwargs['selector'] + + data = san.get(metric, **params) + + if data is not None and not data.empty: + # Add metadata columns + data['metric'] = metric + data['slug'] = slug + if attempt > 0: + logger.info(f"[SUCCESS] {metric} for {slug} succeeded on attempt {attempt + 1}") + return data + + except Exception as e: + error_msg = str(e) + keys_tried.add(self.current_key_index) + + # Check if it's a rate limit error + if self._is_rate_limit_error(error_msg) and self.api_keys: + logger.warning(f"[RATE_LIMIT] API key #{self.current_key_index + 1} hit rate limit for {metric}: {error_msg}") + + # Check if we've tried all keys + if len(keys_tried) >= len(self.api_keys): + logger.error(f"All {len(self.api_keys)} API keys exhausted for {metric}. Skipping.") + break # Exit retry loop since all keys are exhausted + + # Try to switch to next API key + if self._switch_api_key(): + continue # Retry with new API key + else: + logger.error("No more API keys available for switching") + + # Handle rate limit with san library specific check + if hasattr(san, 'is_rate_limit_exception') and san.is_rate_limit_exception(e): + if hasattr(san, 'rate_limit_time_left'): + rate_limit_seconds = san.rate_limit_time_left(e) + logger.warning(f"Santiment rate limit hit. 
Sleeping for {rate_limit_seconds} seconds") + time.sleep(rate_limit_seconds) + else: + # Try switching API key if available + if self.api_keys and self._switch_api_key(): + continue + else: + time.sleep(60) # Default wait + else: + # Check for specific error types that mean we should skip this metric entirely + if any(skip_phrase in error_msg.lower() for skip_phrase in [ + 'not supported for', + 'is not an existing slug', + 'not implemented for', + 'missing_contract', + 'outside the allowed interval', + 'upgrade to a higher tier' + ]): + logger.warning(f"[SKIP] {metric} for {slug} - {error_msg}") + return None # Skip this metric/slug combination entirely + + logger.error(f"Failed to fetch {metric} for {slug}: {error_msg}") + + error_info = { + 'metric': metric, + 'slug': slug, + 'error': error_msg, + 'timestamp': datetime.now().isoformat(), + 'api_key_index': self.current_key_index + } + self.failed_queries.append(error_info) + + return None + + def fetch_multi_asset_metric(self, metric: str, slugs: List[str], **kwargs) -> Optional[pd.DataFrame]: + """ + Fetch a single metric for multiple assets using get_many + + Args: + metric: The metric name + slugs: List of asset slugs + **kwargs: Additional parameters for the API call + + Returns: + DataFrame with the metric data or None if failed + """ + max_retries = len(self.api_keys) if self.api_keys else 1 + keys_tried = set() + + for attempt in range(max_retries): + try: + # If we've tried all keys, reset and wait + if len(keys_tried) >= len(self.api_keys) and self.api_keys: + logger.warning(f"All {len(self.api_keys)} API keys exhausted for {metric}, waiting 30 seconds...") + time.sleep(30) + keys_tried.clear() + self.current_key_index = 0 + self._set_current_api_key() + + params = { + 'slugs': slugs, + 'from_date': kwargs.get('from_date', self.config.from_date), + 'to_date': kwargs.get('to_date', self.config.to_date), + 'interval': kwargs.get('interval', self.config.interval), + 'include_incomplete_data': kwargs.get('include_incomplete_data', self.config.include_incomplete_data) + } + + data = san.get_many(metric, **params) + + if data is not None and not data.empty: + # Reshape data for consistent format + data_melted = data.reset_index().melt( + id_vars=['datetime'], + var_name='slug', + value_name='value' + ) + data_melted['metric'] = metric + data_melted.set_index('datetime', inplace=True) + if attempt > 0: + logger.info(f"[SUCCESS] {metric} for multiple assets succeeded on attempt {attempt + 1}") + return data_melted + + except Exception as e: + error_msg = str(e) + keys_tried.add(self.current_key_index) + + # Check if it's a rate limit error + if self._is_rate_limit_error(error_msg) and self.api_keys: + logger.warning(f"[RATE_LIMIT] API key #{self.current_key_index + 1} hit rate limit for {metric}: {error_msg}") + + # Check if we've tried all keys + if len(keys_tried) >= len(self.api_keys): + logger.error(f"All {len(self.api_keys)} API keys exhausted for {metric}. Skipping.") + break # Exit retry loop since all keys are exhausted + + # Try to switch to next API key + if self._switch_api_key(): + continue # Retry with new API key + else: + logger.error("No more API keys available for switching") + + # Handle rate limit with san library specific check + if hasattr(san, 'is_rate_limit_exception') and san.is_rate_limit_exception(e): + if hasattr(san, 'rate_limit_time_left'): + rate_limit_seconds = san.rate_limit_time_left(e) + logger.warning(f"Santiment rate limit hit. 
Sleeping for {rate_limit_seconds} seconds") + time.sleep(rate_limit_seconds) + else: + # Try switching API key if available + if self.api_keys and self._switch_api_key(): + continue + else: + time.sleep(60) # Default wait + else: + logger.error(f"Failed to fetch {metric} for multiple assets: {error_msg}") + + error_info = { + 'metric': metric, + 'slugs': slugs, + 'error': error_msg, + 'timestamp': datetime.now().isoformat(), + 'api_key_index': self.current_key_index + } + self.failed_queries.append(error_info) + + return None + + def fetch_category_batch(self, category: str, slugs: List[str], use_async_batch: bool = True) -> Dict[str, pd.DataFrame]: + """ + Fetch all metrics in a category using batch operations with symbol alternatives fallback + + Args: + category: The metric category name + slugs: List of asset slugs to fetch for + use_async_batch: Whether to use AsyncBatch (recommended) or Batch + + Returns: + Dictionary mapping metric names to DataFrames + """ + if category not in self.metric_categories: + logger.error(f"Unknown category: {category}") + return {} + + metrics = self.metric_categories[category] + category_data = {} + + # Filter metrics that are actually available + available_metrics_in_category = [m for m in metrics if m in self.available_metrics] + + if not available_metrics_in_category: + logger.warning(f"No available metrics found for category: {category}") + return {} + + logger.info(f"Fetching {len(available_metrics_in_category)} metrics for category: {category}") + + # First try batch operation with normalized slugs + normalized_slugs = self.normalize_slug_list(slugs) + batch_success = self._try_batch_fetch(category, available_metrics_in_category, normalized_slugs, use_async_batch) + category_data.update(batch_success) + + # For failed metrics, try individual fetches with alternatives + failed_metrics = [m for m in available_metrics_in_category if m not in batch_success] + if failed_metrics: + logger.info(f"Retrying {len(failed_metrics)} failed metrics with alternatives") + individual_results = self._fetch_failed_metrics_with_alternatives(failed_metrics, slugs) + category_data.update(individual_results) + + return category_data + + def _try_batch_fetch(self, category: str, metrics: List[str], slugs: List[str], use_async_batch: bool) -> Dict[str, pd.DataFrame]: + """Try batch fetch operation""" + category_data = {} + + try: + if use_async_batch: + batch = san.AsyncBatch() + else: + batch = san.Batch() + + # Add queries to batch + for metric in metrics: + try: + if len(slugs) == 1: + batch.get( + metric, + slug=slugs[0], + from_date=self.config.from_date, + to_date=self.config.to_date, + interval=self.config.interval, + include_incomplete_data=self.config.include_incomplete_data + ) + else: + batch.get_many( + metric, + slugs=slugs, + from_date=self.config.from_date, + to_date=self.config.to_date, + interval=self.config.interval, + include_incomplete_data=self.config.include_incomplete_data + ) + except Exception as e: + logger.warning(f"Failed to add {metric} to batch: {e}") + + # Execute batch + if use_async_batch: + results = batch.execute(max_workers=self.config.max_workers) + else: + results = batch.execute() + + # Process results + for i, (metric, result) in enumerate(zip(metrics, results)): + if result is not None and not result.empty: + if len(slugs) > 1: + # Reshape multi-asset data + result_melted = result.reset_index().melt( + id_vars=['datetime'], + var_name='slug', + value_name='value' + ) + result_melted['metric'] = metric + 
result_melted.set_index('datetime', inplace=True) + category_data[metric] = result_melted + else: + result['metric'] = metric + result['slug'] = slugs[0] + category_data[metric] = result + else: + logger.debug(f"No data received for metric: {metric} in batch") + + except Exception as e: + logger.error(f"Batch execution failed for category {category}: {e}") + + return category_data + + def _fetch_failed_metrics_with_alternatives(self, metrics: List[str], original_slugs: List[str]) -> Dict[str, pd.DataFrame]: + """Fetch failed metrics individually using symbol alternatives""" + individual_data = {} + + for metric in metrics: + logger.info(f"Retrying {metric} with symbol alternatives...") + + if len(original_slugs) == 1: + # Single asset - use alternatives + result = self.fetch_single_metric_with_alternatives(metric, original_slugs[0]) + if result is not None: + individual_data[metric] = result + else: + # Multiple assets - try each with alternatives and combine + all_results = [] + for slug in original_slugs: + result = self.fetch_single_metric_with_alternatives(metric, slug) + if result is not None: + all_results.append(result) + + if all_results: + # Concatenate results - they already have datetime as index + combined_result = pd.concat(all_results, ignore_index=False, sort=False) + # Ensure datetime index is properly set + if not isinstance(combined_result.index, pd.DatetimeIndex): + if 'datetime' in combined_result.columns: + combined_result.set_index('datetime', inplace=True) + individual_data[metric] = combined_result + + return individual_data + + def fetch_special_metrics(self, slugs: List[str]) -> Dict[str, pd.DataFrame]: + """ + Fetch special metrics that have different API signatures + + Args: + slugs: List of asset slugs + + Returns: + Dictionary mapping metric names to DataFrames + """ + special_data = {} + + for slug in slugs: + max_retries = len(self.api_keys) if self.api_keys else 1 + keys_tried = set() + + for attempt in range(max_retries): + try: + # If we've tried all keys, reset and wait + if len(keys_tried) >= len(self.api_keys) and self.api_keys: + logger.warning(f"All {len(self.api_keys)} API keys exhausted for special metrics on {slug}, waiting 30 seconds...") + time.sleep(30) + keys_tried.clear() + self.current_key_index = 0 + self._set_current_api_key() + + # OHLCV data + logger.info(f"Fetching OHLCV data for {slug}") + ohlcv = san.get( + f"ohlcv/{slug}", + from_date=self.config.from_date, + to_date=self.config.to_date, + interval=self.config.interval + ) + if ohlcv is not None and not ohlcv.empty: + ohlcv['metric'] = 'ohlcv' + ohlcv['slug'] = slug + special_data[f'ohlcv_{slug}'] = ohlcv + + # Prices with OHLC format + logger.info(f"Fetching detailed prices for {slug}") + prices = san.get( + "prices", + slug=slug, + from_date=self.config.from_date, + to_date=self.config.to_date, + interval=self.config.interval + ) + if prices is not None and not prices.empty: + prices['metric'] = 'prices_detailed' + prices['slug'] = slug + special_data[f'prices_{slug}'] = prices + + # If we get here, the attempt was successful + break + + except Exception as e: + error_msg = str(e) + keys_tried.add(self.current_key_index) + + # Check if it's a rate limit error + if self._is_rate_limit_error(error_msg) and self.api_keys: + logger.warning(f"[RATE_LIMIT] API key #{self.current_key_index + 1} hit rate limit for special metrics on {slug}: {error_msg}") + + # Check if we've tried all keys + if len(keys_tried) >= len(self.api_keys): + logger.error(f"All {len(self.api_keys)} API keys 
exhausted for special metrics on {slug}. Skipping.") + break # Exit retry loop since all keys are exhausted + + # Try to switch to next API key + if self._switch_api_key(): + continue # Retry with new API key + else: + logger.error("No more API keys available for switching") + + logger.error(f"Failed to fetch special metrics for {slug}: {e}") + break # Exit retry loop for this slug + + return special_data + + def fetch_blockchain_address_data(self, addresses: List[str], slugs: List[str]) -> Dict[str, pd.DataFrame]: + """ + Fetch blockchain address-related data + + Args: + addresses: List of blockchain addresses + slugs: List of asset slugs for context + + Returns: + Dictionary mapping data types to DataFrames + """ + address_data = {} + + for slug in slugs: + for address in addresses: + try: + # Historical balance + balance = san.get( + "historical_balance", + slug=slug, + address=address, + from_date=self.config.from_date, + to_date=self.config.to_date, + interval=self.config.interval + ) + if balance is not None and not balance.empty: + balance['address'] = address + balance['slug'] = slug + address_data[f'historical_balance_{slug}_{address[:8]}'] = balance + + # Top transactions + top_txs = san.get( + "eth_top_transactions", + slug=slug, + from_date=self.config.from_date, + to_date=self.config.to_date, + limit=100, + transaction_type="ALL" + ) + if top_txs is not None and not top_txs.empty: + top_txs['slug'] = slug + address_data[f'eth_top_transactions_{slug}'] = top_txs + + except Exception as e: + logger.error(f"Failed to fetch address data for {address} on {slug}: {e}") + + return address_data + + def execute_custom_sql_queries(self) -> Dict[str, pd.DataFrame]: + """ + Execute custom SQL queries for additional data insights, using dictGetString for asset metadata. 
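+
+        Each bundled query is dispatched through san.execute_sql; an illustrative
+        sketch of that call (the SQL text here is a simplified placeholder, not one
+        of the bundled queries):
+
+            df = san.execute_sql(
+                query="SELECT dt, value FROM daily_metrics_v2 LIMIT 10",
+                set_index="dt",
+            )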
+ + Returns: + Dictionary mapping query names to DataFrames + """ + sql_data = {} + custom_queries = { + 'top_assets_by_volume': """ + SELECT + dictGetString('default.asset_metadata_dict', 'name', asset_id) as asset_name, + dictGetString('default.asset_metadata_dict', 'slug', asset_id) as slug, + SUM(value) as total_volume + FROM daily_metrics_v2 + WHERE metric_id = get_metric_id('volume_usd') + AND dt >= now() - INTERVAL 30 DAY + GROUP BY asset_id + ORDER BY total_volume DESC + LIMIT 50 + """, + 'recent_high_activity_addresses': """ + SELECT + dictGetString('default.asset_metadata_dict', 'name', asset_id) as asset_name, + get_metric_name(metric_id) as metric_name, + dt, + value + FROM daily_metrics_v2 + WHERE metric_id = get_metric_id('daily_active_addresses') + AND dt >= now() - INTERVAL 7 DAY + AND value > 1000 + ORDER BY dt DESC, value DESC + LIMIT 100 + """, + 'exchange_flow_summary': """ + SELECT + dictGetString('default.asset_metadata_dict', 'name', asset_id) as asset_name, + dt, + SUM(CASE WHEN metric_id = get_metric_id('exchange_inflow') THEN value ELSE 0 END) as inflow, + SUM(CASE WHEN metric_id = get_metric_id('exchange_outflow') THEN value ELSE 0 END) as outflow + FROM daily_metrics_v2 + WHERE metric_id IN (get_metric_id('exchange_inflow'), get_metric_id('exchange_outflow')) + AND dt >= now() - INTERVAL 30 DAY + GROUP BY asset_id, dt + ORDER BY dt DESC + LIMIT 1000 + """ + } + for query_name, query in custom_queries.items(): + try: + logger.info(f"Executing SQL query: {query_name}") + result = san.execute_sql(query=query, set_index="dt" if "dt" in query else None) + if result is not None and not result.empty: + sql_data[query_name] = result + logger.info(f"SQL query {query_name} returned {len(result)} rows") + except Exception as e: + logger.error(f"Failed to execute SQL query {query_name}: {e}") + return sql_data + + def fetch_comprehensive_data(self, + slugs: List[str] = None, + categories: List[str] = None, + include_special_metrics: bool = True, + include_sql_queries: bool = True, + addresses: List[str] = None) -> Dict[str, Any]: + """ + Fetch comprehensive data across all categories and metrics + + Args: + slugs: List of asset slugs (if None, uses top assets) + categories: List of categories to fetch (if None, fetches all) + include_special_metrics: Whether to include special format metrics + include_sql_queries: Whether to execute custom SQL queries + addresses: List of blockchain addresses for address-specific data + + Returns: + Dictionary containing all fetched data organized by category + """ + # Set defaults + if slugs is None: + slugs = ['bitcoin', 'ethereum', 'cardano', 'polkadot', 'chainlink', + 'litecoin', 'bitcoin-cash', 'stellar', 'ethereum-classic', 'eos'] + + # Normalize and deduplicate slugs + slugs = self.normalize_slug_list(slugs) + + if categories is None: + categories = list(self.metric_categories.keys()) + + # Limit slugs for free tier + if not san.ApiConfig.api_key: + slugs = slugs[:3] # Limit to 3 assets for free tier + logger.warning("No API key detected. Limiting to 3 assets to avoid rate limits.") + + all_data = {} + start_time = datetime.now() + + logger.info(f"Starting comprehensive data fetch for {len(slugs)} assets across {len(categories)} categories") + + # Check if all API keys are exhausted early + all_keys_exhausted = False + if self.api_keys and self.rate_limit_switches > len(self.api_keys) * 3: + logger.warning("⚠️ All API keys appear to be rate-limited. 
Attempting reduced fetch...") + all_keys_exhausted = True + + # Fetch data by category + for category in categories: + if all_keys_exhausted: + logger.info(f"Skipping category {category} due to API exhaustion") + continue + + logger.info(f"Fetching category: {category}") + category_data = self.fetch_category_batch(category, slugs, use_async_batch=True) + + if category_data: + all_data[category] = category_data + # Store individual DataFrames for later use + for metric_name, df in category_data.items(): + self.fetched_data[f"{category}_{metric_name}"] = df + + # Check if we should stop due to rate limits + if self.rate_limit_switches > len(self.api_keys) * 5: + logger.warning("⚠️ Excessive rate limit switches detected. Stopping data fetch to avoid further exhaustion.") + all_keys_exhausted = True + break + + # Fetch special metrics (only if not exhausted) + if include_special_metrics and not all_keys_exhausted: + logger.info("Fetching special metrics...") + special_data = self.fetch_special_metrics(slugs) + if special_data: + all_data['special_metrics'] = special_data + self.fetched_data.update(special_data) + elif all_keys_exhausted: + logger.info("Skipping special metrics due to API exhaustion") + + # Fetch blockchain address data + if addresses and not all_keys_exhausted: + logger.info("Fetching blockchain address data...") + address_data = self.fetch_blockchain_address_data(addresses, slugs) + if address_data: + all_data['address_data'] = address_data + self.fetched_data.update(address_data) + elif addresses and all_keys_exhausted: + logger.info("Skipping blockchain address data due to API exhaustion") + + # Execute SQL queries (only if not exhausted) + if include_sql_queries and san.ApiConfig.api_key and not all_keys_exhausted: + logger.info("Executing custom SQL queries...") + sql_data = self.execute_custom_sql_queries() + if sql_data: + all_data['sql_queries'] = sql_data + self.fetched_data.update(sql_data) + elif all_keys_exhausted: + logger.info("Skipping SQL queries due to API exhaustion") + + end_time = datetime.now() + duration = end_time - start_time + + logger.info(f"Comprehensive data fetch completed in {duration}") + logger.info(f"Successfully fetched {len(self.fetched_data)} datasets") + logger.info(f"Failed queries: {len(self.failed_queries)}") + + # Add exhaustion notice to summary + if all_keys_exhausted: + logger.warning("⚠️ Data fetch completed with API rate limit exhaustion - some data may be missing") + + # Generate summary + summary = self._generate_fetch_summary(all_data, duration) + summary['all_keys_exhausted'] = all_keys_exhausted + summary['rate_limit_switches'] = self.rate_limit_switches + all_data['fetch_summary'] = summary + + return all_data + + def _generate_fetch_summary(self, data: Dict[str, Any], duration: timedelta) -> Dict[str, Any]: + """Generate a summary of the data fetching operation""" + summary = { + 'fetch_duration': str(duration), + 'total_datasets': len(self.fetched_data), + 'failed_queries': len(self.failed_queries), + 'categories_fetched': list(data.keys()), + 'data_points_by_category': {}, + 'date_range': f"{self.config.from_date} to {self.config.to_date}", + 'interval': self.config.interval, + 'timestamp': datetime.now().isoformat() + } + + # Count data points by category + for category, category_data in data.items(): + if isinstance(category_data, dict): + total_points = sum(len(df) for df in category_data.values() if isinstance(df, pd.DataFrame)) + summary['data_points_by_category'][category] = total_points + + return summary + + def 
export_data(self, + export_format: str = None, + combine_categories: bool = False, + include_metadata: bool = True) -> Dict[str, str]: + """ + Export fetched data to files + + Args: + export_format: Export format ('csv', 'json', 'parquet') + combine_categories: Whether to combine all data into single files + include_metadata: Whether to include metadata files + + Returns: + Dictionary mapping data names to file paths + """ + export_format = export_format or self.config.export_format + exported_files = {} + + if not self.fetched_data: + logger.warning("No data to export") + return exported_files + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + if combine_categories: + # Combine all DataFrames + all_dfs = [] + for name, df in self.fetched_data.items(): + if isinstance(df, pd.DataFrame) and not df.empty: + df_copy = df.copy() + df_copy['dataset_name'] = name + all_dfs.append(df_copy) + + if all_dfs: + combined_df = pd.concat(all_dfs, ignore_index=True, sort=False) + filename = f"santiment_comprehensive_data_{timestamp}.{export_format}" + filepath = os.path.join(self.config.export_directory, filename) + + self._export_dataframe(combined_df, filepath, export_format) + exported_files['combined_data'] = filepath + else: + # Export individual datasets + for name, df in self.fetched_data.items(): + if isinstance(df, pd.DataFrame) and not df.empty: + filename = f"santiment_{name}_{timestamp}.{export_format}" + filepath = os.path.join(self.config.export_directory, filename) + + self._export_dataframe(df, filepath, export_format) + exported_files[name] = filepath + + # Export metadata and summary + if include_metadata: + metadata = { + 'failed_queries': self.failed_queries, + 'available_metrics': self.available_metrics, + 'config': { + 'from_date': self.config.from_date, + 'to_date': self.config.to_date, + 'interval': self.config.interval, + 'batch_size': self.config.batch_size + }, + 'export_timestamp': datetime.now().isoformat() + } + + metadata_file = os.path.join(self.config.export_directory, f"santiment_metadata_{timestamp}.json") + with open(metadata_file, 'w') as f: + json.dump(metadata, f, indent=2) + exported_files['metadata'] = metadata_file + + logger.info(f"Exported {len(exported_files)} files to {self.config.export_directory}") + return exported_files + + def _export_dataframe(self, df: pd.DataFrame, filepath: str, format_type: str): + """Export a DataFrame to the specified format""" + try: + if format_type == 'csv': + df.to_csv(filepath) + elif format_type == 'json': + df.to_json(filepath, date_format='iso', orient='records') + elif format_type == 'parquet': + df.to_parquet(filepath) + else: + logger.error(f"Unsupported export format: {format_type}") + return + + logger.info(f"Exported DataFrame to {filepath}") + + except Exception as e: + logger.error(f"Failed to export DataFrame to {filepath}: {e}") + + def get_api_usage_stats(self) -> Dict[str, Any]: + """Get API usage statistics""" + try: + stats = { + 'calls_made': san.api_calls_made(), + 'calls_remaining': san.api_calls_remaining(), + 'failed_queries': len(self.failed_queries), + 'successful_datasets': len(self.fetched_data) + } + return stats + except Exception as e: + logger.error(f"Failed to get API usage stats: {e}") + return {} + + def print_summary(self): + """Print a comprehensive summary of the fetching operation""" + print("\n" + "="*60) + print("SANTIMENT DATA FETCHER SUMMARY") + print("="*60) + + # Basic stats + print(f"Total datasets fetched: {len(self.fetched_data)}") + print(f"Failed queries: 
{len(self.failed_queries)}") + + # Configuration info + print(f"\nConfiguration:") + print(f" Date range: {self.config.from_date} to {self.config.to_date}") + print(f" Interval: {self.config.interval}") + print(f" Export directory: {self.config.export_directory}") + + # Categories summary + if self.fetched_data: + print(f"\nData by category:") + category_counts = {} + for key in self.fetched_data.keys(): + if '_' in key: + category = key.split('_')[0] + category_counts[category] = category_counts.get(category, 0) + 1 + + for category, count in sorted(category_counts.items()): + print(f" {category}: {count} datasets") + + # Sample data info + if self.fetched_data: + print(f"\nSample datasets:") + for i, (name, df) in enumerate(list(self.fetched_data.items())[:5]): + if isinstance(df, pd.DataFrame): + print(f" {name}: {len(df)} rows, {len(df.columns)} columns") + if not df.empty: + date_range = f"{df.index.min()} to {df.index.max()}" if hasattr(df.index, 'min') else "N/A" + print(f" Date range: {date_range}") + + # Failed queries summary + if self.failed_queries: + print(f"\nFailed queries summary:") + error_types = {} + for failed in self.failed_queries: + error_msg = str(failed.get('error', 'Unknown error')) + error_type = error_msg.split(':')[0] if ':' in error_msg else error_msg + error_types[error_type] = error_types.get(error_type, 0) + 1 + + for error_type, count in sorted(error_types.items()): + print(f" {error_type}: {count} occurrences") + + # API usage stats + try: + api_stats = self.get_api_usage_stats() + if api_stats: + print(f"\nAPI Usage:") + print(f" Calls made: {api_stats.get('calls_made', 'N/A')}") + print(f" Calls remaining: {api_stats.get('calls_remaining', 'N/A')}") + except: + pass + + print("="*60) + + def analyze_data_quality(self) -> Dict[str, Any]: + """Analyze the quality of fetched data""" + quality_report = { + 'total_datasets': len(self.fetched_data), + 'empty_datasets': 0, + 'datasets_with_nulls': 0, + 'date_coverage': {}, + 'data_completeness': {}, + 'outliers_detected': {} + } + + for name, df in self.fetched_data.items(): + if isinstance(df, pd.DataFrame): + # Check if dataset is empty + if df.empty: + quality_report['empty_datasets'] += 1 + continue + + # Check for null values + if df.isnull().any().any(): + quality_report['datasets_with_nulls'] += 1 + null_percentage = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100 + quality_report['data_completeness'][name] = f"{100 - null_percentage:.2f}%" + + # Analyze date coverage + if hasattr(df.index, 'min') and hasattr(df.index, 'max'): + try: + date_range = { + 'start': str(df.index.min()), + 'end': str(df.index.max()), + 'days': (df.index.max() - df.index.min()).days if hasattr(df.index.max() - df.index.min(), 'days') else 'N/A' + } + quality_report['date_coverage'][name] = date_range + except: + quality_report['date_coverage'][name] = 'Unable to determine' + + # Simple outlier detection for numeric columns + numeric_cols = df.select_dtypes(include=[np.number]).columns + outlier_info = {} + for col in numeric_cols: + if col not in ['metric', 'slug']: # Skip metadata columns + try: + q1 = df[col].quantile(0.25) + q3 = df[col].quantile(0.75) + iqr = q3 - q1 + lower_bound = q1 - 1.5 * iqr + upper_bound = q3 + 1.5 * iqr + outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)] + if len(outliers) > 0: + outlier_info[col] = len(outliers) + except: + continue + + if outlier_info: + quality_report['outliers_detected'][name] = outlier_info + + return quality_report + + def 
create_data_dashboard(self) -> str:
+        """Create a simple HTML dashboard summarizing the fetched data"""
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        total_datasets = len(self.fetched_data)
+        date_range = f"{self.config.from_date} to {self.config.to_date}"
+
+        html_content = f"""<!DOCTYPE html>
+<html>
+<head>
+    <title>Santiment Data Dashboard</title>
+</head>
+<body>
+    <h1>Santiment Data Dashboard</h1>
+    <p>Generated on: {timestamp}</p>
+    <p>Total Datasets: {total_datasets}</p>
+    <p>Date Range: {date_range}</p>
+"""
+
+        # Add category summary
+        if self.fetched_data:
+            category_counts = {}
+            for key in self.fetched_data.keys():
+                if '_' in key:
+                    category = key.split('_')[0]
+                    category_counts[category] = category_counts.get(category, 0) + 1
+
+            html_content += """
+    <div class="section">
+        <h2>Categories Overview</h2>
+"""
+            for category, count in sorted(category_counts.items()):
+                html_content += f'        <div class="category"><b>{category}</b>: {count} datasets</div>\n'
+            html_content += "    </div>\n"
+
+        # Add failed queries section
+        if self.failed_queries:
+            html_content += """
+    <div class="section">
+        <h2>Failed Queries</h2>
+        <table border="1">
+            <tr><th>Metric</th><th>Slug</th><th>Error</th></tr>
+"""
+            for failed in self.failed_queries[:10]:  # Show first 10
+                metric = failed.get('metric', 'N/A')
+                slug = failed.get('slug', failed.get('slugs', 'N/A'))
+                error = str(failed.get('error', 'Unknown'))[:100] + '...' if len(str(failed.get('error', ''))) > 100 else failed.get('error', 'Unknown')
+                html_content += f"            <tr><td>{metric}</td><td>{slug}</td><td>{error}</td></tr>\n"
+            html_content += "        </table>\n    </div>\n"
+
+        html_content += "</body>\n</html>"
+
+        # Save dashboard
+        dashboard_path = os.path.join(
+            self.config.export_directory,
+            f"santiment_dashboard_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
+        )
+        with open(dashboard_path, 'w') as f:
+            f.write(html_content)
+
+        logger.info(f"Dashboard created at {dashboard_path}")
+        return dashboard_path
+ + def get_top_performing_assets(self, metric: str = 'price_usd', days: int = 30) -> pd.DataFrame: + """ + Analyze top performing assets based on a specific metric + + Args: + metric: The metric to analyze performance on + days: Number of days to look back for performance calculation + + Returns: + DataFrame with performance analysis + """ + performance_data = [] + + for name, df in self.fetched_data.items(): + if isinstance(df, pd.DataFrame) and metric in str(name) and not df.empty: + try: + if 'slug' in df.columns: + # Group by slug and calculate performance + for slug in df['slug'].unique(): + slug_data = df[df['slug'] == slug].copy() + if len(slug_data) >= 2: + slug_data = slug_data.sort_index() + + # Calculate performance over the specified period + if len(slug_data) > days: + recent_data = slug_data.tail(days) + else: + recent_data = slug_data + + if 'value' in recent_data.columns and not recent_data['value'].empty: + start_value = recent_data['value'].iloc[0] + end_value = recent_data['value'].iloc[-1] + + if start_value and start_value != 0: + performance = ((end_value - start_value) / start_value) * 100 + + performance_data.append({ + 'slug': slug, + 'metric': metric, + 'start_value': start_value, + 'end_value': end_value, + 'performance_pct': performance, + 'data_points': len(recent_data), + 'period_days': days + }) + except Exception as e: + logger.warning(f"Failed to analyze performance for {name}: {e}") + + if performance_data: + performance_df = pd.DataFrame(performance_data) + return performance_df.sort_values('performance_pct', ascending=False) + else: + return pd.DataFrame() + + def cleanup_export_directory(self) -> bool: + """ + Manually clean up the export directory. + + Returns: + bool: True if cleanup was successful, False otherwise + """ + try: + self._cleanup_existing_files() + return True + except Exception as e: + logger.error(f"Manual cleanup failed: {e}") + return False + + def get_api_key_status(self): + """Get status information about API key usage""" + if not self.api_keys: + return { + "total_keys": 0, + "current_key": "None", + "rate_limit_switches": self.rate_limit_switches, + "current_key_preview": "No API key" + } + + return { + "total_keys": len(self.api_keys), + "current_key": self.current_key_index + 1, + "rate_limit_switches": self.rate_limit_switches, + "current_key_preview": self.api_keys[self.current_key_index][:8] + "..."
+ } + + def print_api_key_status(self): + """Print API key usage status""" + status = self.get_api_key_status() + print(f"\n[API_STATUS] Using {status['total_keys']} API key(s)") + if status['total_keys'] > 0: + print(f"[API_STATUS] Current: Key #{status['current_key']} ({status['current_key_preview']})") + print(f"[API_STATUS] Rate limit switches: {status['rate_limit_switches']}") + if status['rate_limit_switches'] > 0: + print(f"[API_STATUS] Effective rate limit handling active") + else: + print(f"[API_STATUS] No API keys configured - using free tier") + print() + + def save_configuration(self, config_path: str = None) -> str: + """Save current configuration to a JSON file""" + if config_path is None: + config_path = os.path.join(self.config.export_directory, "santiment_config.json") + + config_dict = { + 'from_date': self.config.from_date, + 'to_date': self.config.to_date, + 'interval': self.config.interval, + 'include_incomplete_data': self.config.include_incomplete_data, + 'batch_size': self.config.batch_size, + 'max_workers': self.config.max_workers, + 'rate_limit_delay': self.config.rate_limit_delay, + 'export_format': self.config.export_format, + 'export_directory': self.config.export_directory, + 'saved_at': datetime.now().isoformat() + } + + with open(config_path, 'w') as f: + json.dump(config_dict, f, indent=2) + + logger.info(f"Configuration saved to {config_path}") + return config_path + + @classmethod + def load_configuration(cls, config_path: str) -> 'SantimentDataFetcher': + """Load configuration from a JSON file and create a fetcher instance""" + with open(config_path, 'r') as f: + config_dict = json.load(f) + + # Remove metadata fields + config_dict.pop('saved_at', None) + + config = FetchConfig(**config_dict) + return cls(config=config) + + +# Utility functions for easy usage +def cleanup_santiment_directory(directory_path: str = "data/santiment") -> bool: + """ + Utility function to clean up a Santiment data directory without creating a fetcher instance. 
+ + Args: + directory_path: Path to the directory to clean up + + Returns: + bool: True if cleanup was successful, False otherwise + """ + import glob + import shutil + + try: + if not os.path.exists(directory_path): + logger.info(f"Directory does not exist: {directory_path}") + return True + + # Get all files in the directory + all_files = glob.glob(os.path.join(directory_path, "*")) + + if all_files: + logger.info(f"Cleaning up {len(all_files)} existing files in {directory_path}") + + for file_path in all_files: + try: + if os.path.isfile(file_path): + os.remove(file_path) + logger.debug(f"Removed file: {os.path.basename(file_path)}") + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + logger.debug(f"Removed directory: {os.path.basename(file_path)}") + except Exception as e: + logger.warning(f"Failed to remove {file_path}: {e}") + + logger.info(f"Successfully cleaned up directory: {directory_path}") + else: + logger.info(f"Directory is already clean: {directory_path}") + + return True + + except Exception as e: + logger.error(f"Failed to cleanup directory {directory_path}: {e}") + return False + +def fetch_quick_crypto_overview(assets: List[str] = None, api_key: str = None) -> Dict[str, pd.DataFrame]: + """ + Quick function to fetch essential crypto data for analysis + + Args: + assets: List of asset slugs (defaults to top 10 cryptos) + api_key: Santiment API key + + Returns: + Dictionary with essential data + """ + if assets is None: + assets = ['bitcoin', 'ethereum', 'solana', 'ripple', 'cardano'] + + config = FetchConfig( + from_date="2025-07-01", # Changed to be within free tier allowed range + to_date="2025-07-06", # Use last valid date for free tier + interval="30m", + export_format="parquet" + ) + + fetcher = SantimentDataFetcher(api_key=api_key, config=config) + + # Fetch essential categories + essential_categories = ['financial', 'network_activity', 'exchange'] + + data = fetcher.fetch_comprehensive_data( + slugs=assets, + categories=essential_categories, + include_special_metrics=True, + include_sql_queries=False + ) + + return data + +def create_crypto_report(assets: List[str], output_dir: str = "./crypto_report", api_key: str = None): + """ + Create a comprehensive crypto analysis report + + Args: + assets: List of asset slugs to analyze + output_dir: Directory to save the report + api_key: Santiment API key(s) - can be comma-separated for multiple keys + """ + config = FetchConfig( + from_date="2025-07-01", # Changed to be within free tier allowed range + to_date="2025-07-06", # Use last valid date for free tier + interval="30m", + export_directory=output_dir, + export_format="parquet" # Use Parquet for output + ) + + fetcher = SantimentDataFetcher(api_key=api_key, config=config) + + # Print API key status + fetcher.print_api_key_status() + + # Fetch comprehensive data + logger.info("Fetching comprehensive cryptocurrency data...") + data = fetcher.fetch_comprehensive_data( + slugs=assets, + include_special_metrics=True, + include_sql_queries=True + ) + + # Export data + logger.info("Exporting data to files...") + exported_files = fetcher.export_data(combine_categories=False, include_metadata=True) + + # Create dashboard + logger.info("Creating data dashboard...") + dashboard_path = fetcher.create_data_dashboard() + + # Analyze data quality + logger.info("Analyzing data quality...") + quality_report = fetcher.analyze_data_quality() + + # Save quality report + quality_path = os.path.join(output_dir, "data_quality_report.json") + with open(quality_path, 'w') as f: + 
json.dump(quality_report, f, indent=2, default=str) + + # Print summary + fetcher.print_summary() + + print(f"\nReport generated successfully!") + print(f"Dashboard: {dashboard_path}") + print(f"Data files: {len(exported_files)} files in {output_dir}") + print(f"Quality report: {quality_path}") + + # Print final API key status + print("\n[FINAL_STATUS] Santiment API Key Usage Summary:") + fetcher.print_api_key_status() + +# Example usage +def main(): + # Get API key from environment (already loaded at module top) + santiment_api_key = os.getenv("SANTIMENT_API_KEY") + + # Create fetcher instance + fetcher = SantimentDataFetcher(api_key=santiment_api_key) + + # Print API key status + fetcher.print_api_key_status() + + # DISABLED: Do not cleanup Santiment directory to preserve data + # cleanup_santiment_directory("./data/santiment") + print("[SANTIMENT] Data preservation mode - keeping existing data") + + # Reduced scope for API conservation - only top 2 crypto assets + print("Fetching reduced crypto overview (API conservation mode)...") + # Note: Reduced from 5 to 2 assets to conserve API calls + overview_data = fetch_quick_crypto_overview(['bitcoin', 'ethereum'], api_key=santiment_api_key) + + # Comprehensive analysis - reduced scope + print("\nCreating conservative crypto report...") + # Note: Reduced scope - only Bitcoin and Ethereum to preserve API limits + create_crypto_report( + assets=['bitcoin', 'ethereum'], # Reduced from 5 to 2 assets + output_dir="./data/santiment", + api_key=santiment_api_key + ) + + # Print final API key status + print("\n[FINAL_STATUS] Santiment API Key Usage Summary:") + fetcher.print_api_key_status() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/fetchers/santiment/test_api_switching.py b/src/fetchers/santiment/test_api_switching.py new file mode 100644 index 0000000000000000000000000000000000000000..0569070fb82f875624aced03d14f3f0d881a9ca2 --- /dev/null +++ b/src/fetchers/santiment/test_api_switching.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +""" +Test script to verify if API key switching is effective +""" + +import os +import sys +from datetime import datetime +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Add the current directory to Python path for imports +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +from main import SantimentDataFetcher + +def test_api_key_switching(): + """ + Test if API key switching actually works by attempting multiple requests + """ + print("🔧 Testing API Key Switching Effectiveness") + print("=" * 60) + + # Get API keys + api_keys = os.getenv('SANTIMENT_API_KEY') + if not api_keys: + print("❌ No SANTIMENT_API_KEY found in environment") + return + + key_list = [key.strip() for key in api_keys.split(',')] + print(f"📊 Testing with {len(key_list)} API keys") + + # Create fetcher + fetcher = SantimentDataFetcher() + + # Track switches + initial_switches = fetcher.rate_limit_switches + + print(f"\n🚀 Starting test at {datetime.now().strftime('%H:%M:%S')}") + print(f"Initial API key switches: {initial_switches}") + + # Attempt to fetch a simple metric multiple times + test_slug = 'bitcoin' + test_metric = 'price_usd' + + success_count = 0 + attempt_count = 5 # Try 5 requests + + for i in range(attempt_count): + print(f"\n--- Attempt {i+1}/{attempt_count} ---") + print(f"Current API key: #{fetcher.current_key_index + 1}") + + try: + result = fetcher.fetch_single_metric(test_metric, test_slug) + + if result is not None and not result.empty: + 
success_count += 1 + print(f"✅ Success! Got {len(result)} data points") + else: + print("⚠️ No data returned") + + except Exception as e: + print(f"❌ Error: {e}") + + print(f"API key switches so far: {fetcher.rate_limit_switches}") + + # Small delay between requests + import time + time.sleep(1) + + # Final report + print(f"\n📈 FINAL RESULTS") + print("=" * 40) + print(f"Successful requests: {success_count}/{attempt_count}") + print(f"Total API key switches: {fetcher.rate_limit_switches}") + print(f"Final API key: #{fetcher.current_key_index + 1}") + + # Interpret results + if fetcher.rate_limit_switches > 0: + print("\n✅ API key switching IS working!") + print("✅ Your keys appear to be from different accounts.") + elif success_count == attempt_count: + print("\n✅ All requests successful without switching!") + print("ℹ️ Either keys are from different accounts OR current key still has quota.") + else: + print("\n❌ No switching occurred and some requests failed.") + print("⚠️ All keys might be from the same exhausted account.") + + return fetcher.rate_limit_switches > 0 + +if __name__ == "__main__": + test_api_key_switching() diff --git a/src/fetchers/stocktwits/ticker_stream.py b/src/fetchers/stocktwits/ticker_stream.py new file mode 100644 index 0000000000000000000000000000000000000000..76bc5ea1b0a5e870548170377759e56366082559 --- /dev/null +++ b/src/fetchers/stocktwits/ticker_stream.py @@ -0,0 +1,56 @@ +# """ +# ticker_stream.py – StockTwits Ticker Streams (Raw Messages) + +# Fetches real-time “cashtag” message streams for any US ticker (e.g., $AAPL). +# No API key required. You can apply your own NLP/sentiment models client-side. + +# Endpoint: +# https://api.stocktwits.com/api/2/streams/symbol/{symbol}.json +# """ + + +# import os +# import requests + + +# class StockTwitsTickerStream: +# BASE_URL = "https://api.stocktwits.com/api/2/streams/symbol/" + +# def fetch_stream(self, symbol: str, access_token: str, **kwargs): +# """ +# Fetch raw message stream for a given ticker symbol (e.g., 'AAPL'). +# Requires OAuth access_token. +# Returns JSON with messages and metadata. +# """ +# url = f"{self.BASE_URL}{symbol}.json" +# params = {"access_token": access_token} +# params.update(kwargs) +# headers = { +# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)" +# } +# resp = requests.get(url, params=params, headers=headers) +# resp.raise_for_status() +# return resp.json() + + +# def main(): +# """ +# Example usage: Fetch and print StockTwits stream for a sample ticker. 
+# """ +# stream_client = StockTwitsTickerStream() +# symbol = "AAPL" # Example ticker +# access_token = os.getenv("STOCKTWITS_ACCESS_TOKEN") +# if not access_token: +# raise RuntimeError("STOCKTWITS_ACCESS_TOKEN environment variable not set.") +# try: +# data = stream_client.fetch_stream(symbol, access_token) +# print(f"Fetched {len(data.get('messages', []))} messages for ${symbol}.") +# # Print first message text as a sample +# if data.get('messages'): +# print("Sample message:", data['messages'][0].get('body', '')) +# except Exception as e: +# print(f"Error fetching stream for ${symbol}: {e}") + + +# if __name__ == "__main__": +# main() diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000000000000000000000000000000000000..f9971c0fb19483766e654718b42c94d01287ef76 --- /dev/null +++ b/src/main.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +""" +Memory-Optimized Main Pipeline for AdvisorAI Data Enhanced +Addresses critical memory issues causing instance failures (512MB limit) +""" + +import sys +import os +import gc +import psutil +from datetime import datetime +from contextlib import contextmanager + +# Add paths +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "."))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "fetchers"))) + +class MemoryMonitor: + """Memory monitoring and optimization utility""" + + def __init__(self, max_memory_mb=450): # Set to 450MB to stay under 512MB limit + self.max_memory_mb = max_memory_mb + self.process = psutil.Process() + + def get_memory_usage(self): + """Get current memory usage in MB""" + return self.process.memory_info().rss / 1024 / 1024 + + def check_and_cleanup(self, operation_name=""): + """Check memory usage and cleanup if needed""" + memory_mb = self.get_memory_usage() + + if memory_mb > self.max_memory_mb * 0.8: # 80% threshold (360MB) + print(f"[MemOpt] High memory usage during {operation_name}: {memory_mb:.1f}MB") + collected = gc.collect() + new_memory_mb = self.get_memory_usage() + print(f"[MemOpt] Memory after GC: {new_memory_mb:.1f}MB (freed {collected} objects)") + + if new_memory_mb > self.max_memory_mb * 0.9: # Still high (405MB) + print(f"[MemOpt] WARNING: Memory still high after cleanup") + + return memory_mb + + @contextmanager + def memory_context(self, operation_name): + """Context manager for memory monitoring""" + start_memory = self.get_memory_usage() + print(f"[MemOpt] Starting {operation_name} - Memory: {start_memory:.1f}MB") + + try: + yield + finally: + end_memory = self.get_memory_usage() + diff = end_memory - start_memory + print(f"[MemOpt] Finished {operation_name} - Memory: {end_memory:.1f}MB (Δ{diff:+.1f}MB)") + + # Force cleanup if memory is getting high + if end_memory > self.max_memory_mb * 0.8: + print(f"[MemOpt] Memory high after {operation_name}, forcing cleanup...") + gc.collect() + final_memory = self.get_memory_usage() + print(f"[MemOpt] Memory after cleanup: {final_memory:.1f}MB") + +def run_fetchers_optimized(memory_monitor): + """Run fetchers with memory optimization""" + try: + with memory_monitor.memory_context("Fetchers"): + # Import fetchers main (only when needed) + from fetchers.main import main as fetchers_main + + print("[Pipeline] Starting data fetchers (memory optimized)...") + result = fetchers_main() + + # Clear imports to free memory + if 'fetchers.main' in sys.modules: + del sys.modules['fetchers.main'] + + # Force cleanup after fetchers 
+ memory_monitor.check_and_cleanup("Fetchers") + + return result + + except Exception as e: + print(f"[Pipeline] Error in fetchers: {e}") + # Still cleanup on error + memory_monitor.check_and_cleanup("Fetchers (error)") + return False + +def run_merge_optimized(memory_monitor): + """Run merge operations with memory optimization""" + try: + with memory_monitor.memory_context("Merge"): + # Import merge main (only when needed) + from merge import main as merge_main + + print("[Pipeline] Starting data merge (memory optimized)...") + result = merge_main.main() + + # Clear imports to free memory + if 'merge.main' in sys.modules: + del sys.modules['merge.main'] + + # Force cleanup after merge + memory_monitor.check_and_cleanup("Merge") + + return result + + except Exception as e: + print(f"[Pipeline] Error in merge: {e}") + # Still cleanup on error + memory_monitor.check_and_cleanup("Merge (error)") + return False + +def main(): + """Memory-optimized main pipeline execution""" + print("AdvisorAI Data Pipeline - Memory Optimized") + print("=" * 50) + print(f"Pipeline started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + # Initialize memory monitor + memory_monitor = MemoryMonitor(max_memory_mb=450) # Stay under 512MB limit + + initial_memory = memory_monitor.get_memory_usage() + print(f"[Pipeline] Initial memory usage: {initial_memory:.1f}MB") + + # Check if we're already too high + if initial_memory > 200: + print(f"[Pipeline] WARNING: High initial memory usage: {initial_memory:.1f}MB") + memory_monitor.check_and_cleanup("Initial") + + try: + # Step 1: Run fetchers with memory optimization + print("\n" + "="*30) + print("STEP 1: DATA FETCHERS") + print("="*30) + + fetchers_success = run_fetchers_optimized(memory_monitor) + + if not fetchers_success: + print("[Pipeline] Fetchers failed, but continuing to merge existing data...") + + # Memory checkpoint + mid_memory = memory_monitor.get_memory_usage() + print(f"\n[Pipeline] Memory after fetchers: {mid_memory:.1f}MB") + + if mid_memory > 400: # Getting close to limit + print("[Pipeline] Memory high after fetchers, forcing cleanup...") + gc.collect() + mid_memory = memory_monitor.get_memory_usage() + print(f"[Pipeline] Memory after cleanup: {mid_memory:.1f}MB") + + # Step 2: Run merge with memory optimization + print("\n" + "="*30) + print("STEP 2: DATA MERGE") + print("="*30) + + merge_success = run_merge_optimized(memory_monitor) + + if not merge_success: + print("[Pipeline] Merge failed") + return False + + # Final memory check + final_memory = memory_monitor.get_memory_usage() + print(f"\n[Pipeline] Final memory usage: {final_memory:.1f}MB") + + if final_memory > 450: # Close to 512MB limit + print("⚠️ WARNING: Memory usage approaching limit - optimization needed") + + print(f"Pipeline ended at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print("[OK] All steps completed successfully!") + return True + + except Exception as e: + import traceback + print(f"[ERROR] Pipeline execution failed: {e}") + print(traceback.format_exc()) + + # Emergency memory cleanup + print("[Pipeline] Emergency memory cleanup...") + memory_monitor.check_and_cleanup("Emergency") + + return False + +if __name__ == "__main__": + success = main() + if not success: + sys.exit(1) diff --git a/src/main_memory_optimized.py b/src/main_memory_optimized.py new file mode 100644 index 0000000000000000000000000000000000000000..f9971c0fb19483766e654718b42c94d01287ef76 --- /dev/null +++ b/src/main_memory_optimized.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +""" 
+Memory-Optimized Main Pipeline for AdvisorAI Data Enhanced +Addresses critical memory issues causing instance failures (512MB limit) +""" + +import sys +import os +import gc +import psutil +from datetime import datetime +from contextlib import contextmanager + +# Add paths +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "."))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "fetchers"))) + +class MemoryMonitor: + """Memory monitoring and optimization utility""" + + def __init__(self, max_memory_mb=450): # Set to 450MB to stay under 512MB limit + self.max_memory_mb = max_memory_mb + self.process = psutil.Process() + + def get_memory_usage(self): + """Get current memory usage in MB""" + return self.process.memory_info().rss / 1024 / 1024 + + def check_and_cleanup(self, operation_name=""): + """Check memory usage and cleanup if needed""" + memory_mb = self.get_memory_usage() + + if memory_mb > self.max_memory_mb * 0.8: # 80% threshold (360MB) + print(f"[MemOpt] High memory usage during {operation_name}: {memory_mb:.1f}MB") + collected = gc.collect() + new_memory_mb = self.get_memory_usage() + print(f"[MemOpt] Memory after GC: {new_memory_mb:.1f}MB (freed {collected} objects)") + + if new_memory_mb > self.max_memory_mb * 0.9: # Still high (405MB) + print(f"[MemOpt] WARNING: Memory still high after cleanup") + + return memory_mb + + @contextmanager + def memory_context(self, operation_name): + """Context manager for memory monitoring""" + start_memory = self.get_memory_usage() + print(f"[MemOpt] Starting {operation_name} - Memory: {start_memory:.1f}MB") + + try: + yield + finally: + end_memory = self.get_memory_usage() + diff = end_memory - start_memory + print(f"[MemOpt] Finished {operation_name} - Memory: {end_memory:.1f}MB (Δ{diff:+.1f}MB)") + + # Force cleanup if memory is getting high + if end_memory > self.max_memory_mb * 0.8: + print(f"[MemOpt] Memory high after {operation_name}, forcing cleanup...") + gc.collect() + final_memory = self.get_memory_usage() + print(f"[MemOpt] Memory after cleanup: {final_memory:.1f}MB") + +def run_fetchers_optimized(memory_monitor): + """Run fetchers with memory optimization""" + try: + with memory_monitor.memory_context("Fetchers"): + # Import fetchers main (only when needed) + from fetchers.main import main as fetchers_main + + print("[Pipeline] Starting data fetchers (memory optimized)...") + result = fetchers_main() + + # Clear imports to free memory + if 'fetchers.main' in sys.modules: + del sys.modules['fetchers.main'] + + # Force cleanup after fetchers + memory_monitor.check_and_cleanup("Fetchers") + + return result + + except Exception as e: + print(f"[Pipeline] Error in fetchers: {e}") + # Still cleanup on error + memory_monitor.check_and_cleanup("Fetchers (error)") + return False + +def run_merge_optimized(memory_monitor): + """Run merge operations with memory optimization""" + try: + with memory_monitor.memory_context("Merge"): + # Import merge main (only when needed) + from merge import main as merge_main + + print("[Pipeline] Starting data merge (memory optimized)...") + result = merge_main.main() + + # Clear imports to free memory + if 'merge.main' in sys.modules: + del sys.modules['merge.main'] + + # Force cleanup after merge + memory_monitor.check_and_cleanup("Merge") + + return result + + except Exception as e: + print(f"[Pipeline] Error in merge: {e}") + # Still cleanup on error + 
memory_monitor.check_and_cleanup("Merge (error)") + return False + +def main(): + """Memory-optimized main pipeline execution""" + print("AdvisorAI Data Pipeline - Memory Optimized") + print("=" * 50) + print(f"Pipeline started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + # Initialize memory monitor + memory_monitor = MemoryMonitor(max_memory_mb=450) # Stay under 512MB limit + + initial_memory = memory_monitor.get_memory_usage() + print(f"[Pipeline] Initial memory usage: {initial_memory:.1f}MB") + + # Check if we're already too high + if initial_memory > 200: + print(f"[Pipeline] WARNING: High initial memory usage: {initial_memory:.1f}MB") + memory_monitor.check_and_cleanup("Initial") + + try: + # Step 1: Run fetchers with memory optimization + print("\n" + "="*30) + print("STEP 1: DATA FETCHERS") + print("="*30) + + fetchers_success = run_fetchers_optimized(memory_monitor) + + if not fetchers_success: + print("[Pipeline] Fetchers failed, but continuing to merge existing data...") + + # Memory checkpoint + mid_memory = memory_monitor.get_memory_usage() + print(f"\n[Pipeline] Memory after fetchers: {mid_memory:.1f}MB") + + if mid_memory > 400: # Getting close to limit + print("[Pipeline] Memory high after fetchers, forcing cleanup...") + gc.collect() + mid_memory = memory_monitor.get_memory_usage() + print(f"[Pipeline] Memory after cleanup: {mid_memory:.1f}MB") + + # Step 2: Run merge with memory optimization + print("\n" + "="*30) + print("STEP 2: DATA MERGE") + print("="*30) + + merge_success = run_merge_optimized(memory_monitor) + + if not merge_success: + print("[Pipeline] Merge failed") + return False + + # Final memory check + final_memory = memory_monitor.get_memory_usage() + print(f"\n[Pipeline] Final memory usage: {final_memory:.1f}MB") + + if final_memory > 450: # Close to 512MB limit + print("⚠️ WARNING: Memory usage approaching limit - optimization needed") + + print(f"Pipeline ended at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print("[OK] All steps completed successfully!") + return True + + except Exception as e: + import traceback + print(f"[ERROR] Pipeline execution failed: {e}") + print(traceback.format_exc()) + + # Emergency memory cleanup + print("[Pipeline] Emergency memory cleanup...") + memory_monitor.check_and_cleanup("Emergency") + + return False + +if __name__ == "__main__": + success = main() + if not success: + sys.exit(1) diff --git a/src/main_original.py b/src/main_original.py new file mode 100644 index 0000000000000000000000000000000000000000..b7ab1497c4fe0e777c402dd28642c6718c75569d --- /dev/null +++ b/src/main_original.py @@ -0,0 +1,21 @@ +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "."))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "fetchers"))) + +from fetchers.main import main as fetchers_main +from merge import main as merge_main + +if __name__ == "__main__": + try: + print("Starting fetchers...") + fetchers_main() + print("Starting merge pipeline...") + merge_main.main() + print("[OK] All steps completed.") + except Exception as e: + import traceback + print(f"[ERROR] Pipeline execution failed: {e}") + print(traceback.format_exc()) + sys.exit(1) diff --git a/src/merge/ENHANCED_MERGE_README.md b/src/merge/ENHANCED_MERGE_README.md new file mode 100644 index 0000000000000000000000000000000000000000..51303c632f61c935cc068c45f6807c50facaea6c --- /dev/null +++ 
b/src/merge/ENHANCED_MERGE_README.md @@ -0,0 +1,123 @@ +# Enhanced Merge with Intelligent Null Filling + +## Overview + +The `merge_temp.py` module has been enhanced with sophisticated null filling capabilities that prioritize finding values from the **same symbol + interval_timestamp** combination across different data sources before falling back to other strategies. + +## Key Features + +### 1. Symbol-First Null Filling Strategy + +When merging temp files to existing features, the system now: + +1. **Identifies null values** in the target (merged) dataset +2. **Searches for matching records** in the source (temp) dataset using `(symbol, interval_timestamp)` as the key +3. **Fills null values** only when: + - The same symbol + timestamp exists in the temp data + - The temp data has a non-null value for that column + - The column exists in both datasets + +### 2. Cross-Dataset Null Filling + +During train file creation and merged features generation: + +1. **Combines multiple sources** (archive, features, temp files) +2. **Creates a comprehensive lookup** of all non-null values by `(symbol, timestamp)` +3. **Fills nulls intelligently** using the best available data from any source +4. **Preserves data integrity** by only filling with values from the exact same symbol and time + +### 3. Enhanced Functions + +#### `fill_nulls_from_temp(df_merged, df_temp)` +- Fills null values in `df_merged` using data from `df_temp` +- Only fills when exact `(symbol, interval_timestamp)` match exists +- Returns count of null values filled +- Provides detailed logging of the filling process + +#### `merge_temp_to_merged(temp_name, merged_name)` +- Enhanced to perform null filling before adding new records +- Reports both new records added and null values filled +- Maintains existing functionality while adding intelligent null handling + +#### `merge_all_to_train()` +- Cross-source null filling during train file creation +- Combines archive, features, and temp data optimally +- Eliminates duplicates while preserving the best available data + +#### `create_merged_features()` +- Creates the main `merged_features.parquet` file +- Combines crypto and stock features with cross-dataset null filling +- Provides comprehensive statistics on the merge process + +## Benefits + +### 🎯 **Data Quality Improvements** +- **Preserves Symbol Characteristics**: Uses same-symbol data to fill nulls +- **Temporal Consistency**: Only uses data from the exact same timestamp +- **No Data Pollution**: Never mixes data from different symbols or times + +### 📊 **Better Coverage** +- **Reduced Null Values**: Significantly fewer missing values in final datasets +- **Multi-Source Integration**: Leverages all available data sources +- **Smart Deduplication**: Keeps the best version of each record + +### 🔧 **Robust Processing** +- **Error Handling**: Graceful handling of missing files and edge cases +- **Detailed Logging**: Clear reporting of what was filled and why +- **Validation**: Built-in checks to ensure data integrity + +## Usage Examples + +### Test the Null Filling +```bash +cd src/merge +python merge_temp.py --test-null-filling +``` + +### Run Normal Merge Process +```bash +cd src/merge +python merge_temp.py +``` + +### Manual Testing +```bash +cd src/merge +python test_null_filling_merge.py +``` + +## Integration with Main Pipeline + +The enhanced merge functionality is automatically integrated into the main pipeline: + +1. **After data collection**: Temp files are created with new data +2. 
**During merge_temp.py**: Null filling happens automatically +3. **Before normalization**: Data is as complete as possible +4. **Train file creation**: Uses all available historical data + +## Example Output + +``` +[INFO] Attempting to fill nulls in 4 columns: ['price', 'volume', 'rsi', 'macd'] +[INFO] Successfully filled 7 null values from temp data +[INFO] Column 'price': 0 nulls remaining +[INFO] Column 'volume': 0 nulls remaining +[INFO] Column 'rsi': 0 nulls remaining +[INFO] Column 'macd': 0 nulls remaining +[OK] Added 15 new records from crypto_features.parquet to crypto_features.parquet, filled 7 null values +``` + +## Performance Considerations + +- **Efficient Lookups**: Uses dictionary-based lookups for O(1) access +- **Memory Optimized**: Processes data in chunks when possible +- **Minimal Overhead**: Only processes columns that actually have nulls + +## Future Enhancements + +- **Time-Window Filling**: Fill with nearest timestamp if exact match not found +- **Interpolation**: Smart interpolation for numerical features +- **Symbol Similarity**: Fill using similar symbols when exact match unavailable +- **Quality Scoring**: Rank data sources by quality for better filling decisions + +This enhanced merge system ensures that your machine learning models receive the highest quality, most complete data possible while preserving the integrity and characteristics of each financial instrument. diff --git a/src/merge/alpaca_features.py b/src/merge/alpaca_features.py new file mode 100644 index 0000000000000000000000000000000000000000..858ba941bd7364fd98995df12f768a591b3f33dd --- /dev/null +++ b/src/merge/alpaca_features.py @@ -0,0 +1,142 @@ +""" +Merge Alpaca bars + quotes + trades into a single feature table. + +• data/alpaca/*_bars.parquet ← master timeline (daily) +• data/alpaca/*_quotes.parquet ← L1 quotes (intraday ticks) +• data/alpaca/*_trades.parquet ← raw trades (intraday ticks) + +The script logs shapes / null counts so you can eyeball data quality. 
+""" + +from __future__ import annotations + + +import os +import sys +from glob import glob + +import pandas as pd +import warnings +warnings.filterwarnings("ignore", category=FutureWarning) + +# --------------------------------------------------------------------------- # +# CONFIG +# --------------------------------------------------------------------------- # +# Resolve writable base using central config (fallback to /data) +try: + from src import config as app_config + BASE_DATA_DIR = app_config.DATA_DIR +except Exception: + BASE_DATA_DIR = os.environ.get("DATA_DIR", "/data") + +DATA_DIR = os.path.join(BASE_DATA_DIR, "alpaca") +os.makedirs(DATA_DIR, exist_ok=True) +OUT_FILE = "alpaca_features.parquet" +TOLERANCE = 86_400_000 # 1 day in ms for integer timestamps +MERGE_DIR = "nearest" # ← **important change** + +# --------------------------------------------------------------------------- # +# HELPERS +# --------------------------------------------------------------------------- # +def log(title: str, char: str = "=", width: int = 60) -> None: + print(f"\n{title.center(width, char)}") + +def load_parquets(suffix: str) -> pd.DataFrame: + """Read every *{suffix}.parquet in DATA_DIR and concat.""" + paths = glob(os.path.join(DATA_DIR, f"*{suffix}.parquet")) + if not paths: + return pd.DataFrame() + + def normalize(df: pd.DataFrame) -> pd.DataFrame: + # Normalize symbol: "XRP/USD" -> "XRP" + df["symbol"] = df["symbol"].astype(str).str.replace(r"([A-Z]+)[/_][A-Z]+", r"\1", regex=True) + # Convert timestamp to ms since epoch + df["timestamp"] = pd.to_datetime(df["timestamp"]) + df["timestamp"] = df["timestamp"].astype("int64") // 10**6 + return df + + dfs: list[pd.DataFrame] = [] + for p in paths: + df = pd.read_parquet(p) + df = normalize(df) + dfs.append(df) + + out = pd.concat(dfs, ignore_index=True) + return out + + +# --------------------------------------------------------------------------- # +# MAIN LOGIC +# --------------------------------------------------------------------------- # +def build_features() -> pd.DataFrame: + bars = load_parquets("_bars") + quotes = load_parquets("_quotes") + trades = load_parquets("_trades") + + if bars.empty: + raise RuntimeError(f"No '*_bars.parquet' files found in {DATA_DIR}") + + # Merge symbol-by-symbol so each group is already sorted + features = [] + symbols = sorted(bars["symbol"].unique()) + + for sym in symbols: + bar_df = bars[bars["symbol"] == sym].sort_values("timestamp").reset_index(drop=True) + + # nearest quote merge + if not quotes.empty: + q = quotes[quotes["symbol"] == sym].sort_values("timestamp") + if not q.empty: + bar_df = pd.merge_asof( + bar_df, + q, + on="timestamp", + suffixes=("", "_quote"), + tolerance=TOLERANCE, + direction=MERGE_DIR, # ← nearest! + ) + + # nearest trade merge + if not trades.empty: + t = trades[trades["symbol"] == sym].sort_values("timestamp") + if not t.empty: + bar_df = pd.merge_asof( + bar_df, + t, + on="timestamp", + suffixes=("", "_trade"), + tolerance=TOLERANCE, + direction=MERGE_DIR, # ← nearest! 
+ ) + + features.append(bar_df) + + feat = pd.concat(features, ignore_index=True) + + # --------------------------------------------------------------------- # + # Fill remaining holes within each symbol + # --------------------------------------------------------------------- # + feat = ( + feat + .groupby("symbol", group_keys=False) + .apply(lambda df: df.ffill().bfill()) + .reset_index(drop=True) + ) + + return feat + + +def save(df: pd.DataFrame) -> None: + out_path = os.path.join(DATA_DIR, OUT_FILE) + df.to_parquet(out_path, index=False) + print(f"\n-> wrote merged features to {out_path}") + + +# --------------------------------------------------------------------------- # +def main() -> None: + merged = build_features() + save(merged) + +if __name__ == "__main__": + log("Merging Alpaca Features") + main() diff --git a/src/merge/crypto_data_filler.py b/src/merge/crypto_data_filler.py new file mode 100644 index 0000000000000000000000000000000000000000..b853535c81682620408ef4171ba560fa588ae7db --- /dev/null +++ b/src/merge/crypto_data_filler.py @@ -0,0 +1,865 @@ +import pandas as pd +import numpy as np +from sklearn.impute import KNNImputer +from sklearn.preprocessing import StandardScaler +import warnings +warnings.filterwarnings('ignore') + +class CryptoDataImputerFixed: + """ + Specialized imputation for cryptocurrency data that preserves unique + characteristics of different crypto assets and prevents homogenization. + """ + + def __init__(self, preserve_crypto_diversity=True): + self.preserve_crypto_diversity = preserve_crypto_diversity + self.crypto_profiles = {} + self.scalers = {} + + def _create_crypto_profiles(self, df): + """Create profiles for each cryptocurrency to guide imputation.""" + profiles = {} + + for symbol in df['symbol'].unique(): + symbol_data = df[df['symbol'] == symbol] + + # Calculate crypto-specific statistics + # Defensive mode extraction for 'stable' and 'blockchain_network' + stable_mode = symbol_data['stable'].mode() if 'stable' in symbol_data.columns else pd.Series() + is_stablecoin = stable_mode.iloc[0] if not stable_mode.empty else False + network_mode = symbol_data['blockchain_network'].mode() if 'blockchain_network' in symbol_data.columns else pd.Series() + blockchain_network = network_mode.iloc[0] if not network_mode.empty else None + + profile = { + 'symbol': symbol, + 'price_level': symbol_data['price'].median() if 'price' in symbol_data.columns else None, + 'price_volatility': symbol_data['price'].std() if 'price' in symbol_data.columns else None, + 'volume_level': symbol_data['volume'].median() if 'volume' in symbol_data.columns else None, + 'marketcap_level': symbol_data['marketcap'].median() if 'marketcap' in symbol_data.columns else None, + 'dominance_level': symbol_data['dominance'].median() if 'dominance' in symbol_data.columns else None, + 'rank': symbol_data['rank'].median() if 'rank' in symbol_data.columns else None, + 'is_stablecoin': is_stablecoin, + 'typical_rsi': symbol_data['rsi'].median() if 'rsi' in symbol_data.columns else None, + 'blockchain_network': blockchain_network, + 'has_onchain_data': symbol_data['transaction_count'].notna().any() if 'transaction_count' in symbol_data.columns else False, + 'exchange_coverage': len([col for col in symbol_data.columns if col.startswith('symbols.') and symbol_data[col].notna().any()]), + 'data_availability': len(symbol_data) / len(df) if len(df) > 0 else 0 + } + + profiles[symbol] = profile + + return profiles + + def _impute_with_crypto_context(self, df, column, crypto_profiles): + 
"""Impute values using crypto-specific context to prevent homogenization.""" + + df_result = df.copy() + + for symbol in df['symbol'].unique(): + symbol_mask = df['symbol'] == symbol + symbol_data = df.loc[symbol_mask, column] + + if symbol_data.isnull().sum() == 0: + continue # No missing values for this symbol + + profile = crypto_profiles.get(symbol, {}) + is_stablecoin = profile.get('is_stablecoin', False) + rank = profile.get('rank', 999) + + # Strategy depends on column type and crypto characteristics + if column in ['price', 'open', 'high', 'low', 'close']: + # Price data - special handling for stablecoins + if is_stablecoin: + # Stablecoins should stay around $1 + base_price = 1.0 + symbol_hash = hash(symbol + column) % 1000 / 100000 # Very small variation + adjusted_price = base_price + symbol_hash + else: + # Regular crypto - use interpolation with crypto-specific bounds + interpolated = symbol_data.interpolate(method='linear', limit_direction='both') + + # If still missing, use crypto's typical price level with volatility-based noise + if interpolated.isnull().any() and profile.get('price_level'): + base_price = profile['price_level'] + volatility = profile.get('price_volatility', base_price * 0.05) # Crypto is more volatile + + # Add crypto-specific noise based on rank (higher rank = more volatile) + symbol_hash = hash(symbol) % 1000 / 1000 # 0-1 range + volatility_multiplier = 1 + (rank / 100) # Higher rank = higher volatility + noise_factor = (symbol_hash - 0.5) * 0.2 * volatility_multiplier # More volatile than stocks + adjusted_price = base_price * (1 + noise_factor) + else: + adjusted_price = interpolated + + df_result.loc[symbol_mask, column] = symbol_data.fillna(adjusted_price) + + elif column in ['volume', 'volume_alpaca']: + # Volume data - crypto volume patterns differ significantly + filled = symbol_data.fillna(method='ffill').fillna(method='bfill') + + if filled.isnull().any(): + base_volume = profile.get('volume_level', 1000000) # Default higher for crypto + # Major cryptos have much higher volume + if rank and rank <= 10: + volume_multiplier = 5 + (hash(symbol + column) % 1000 / 200) # 5x-10x + elif rank and rank <= 50: + volume_multiplier = 1 + (hash(symbol + column) % 1000 / 500) # 1x-3x + else: + volume_multiplier = 0.1 + (hash(symbol + column) % 1000 / 1000) # 0.1x-1.1x + + adjusted_volume = base_volume * volume_multiplier + filled = filled.fillna(adjusted_volume) + + df_result.loc[symbol_mask, column] = filled + + elif column in ['marketcap']: + # Market cap - highly dependent on rank + if profile.get('marketcap_level'): + baseline = profile['marketcap_level'] + else: + # Estimate based on rank + if rank and rank <= 10: + baseline = 10_000_000_000 # $10B+ for top 10 + elif rank and rank <= 50: + baseline = 1_000_000_000 # $1B+ for top 50 + elif rank and rank <= 100: + baseline = 100_000_000 # $100M+ for top 100 + else: + baseline = 10_000_000 # $10M+ for others + + # Add symbol-specific variation + symbol_hash = hash(symbol + column) % 1000 / 1000 + baseline *= (0.5 + symbol_hash) # 0.5x to 1.5x variation + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column in ['dominance']: + # Market dominance - only meaningful for major cryptos + if rank and rank <= 5: + # Major cryptos have meaningful dominance + symbol_hash = hash(symbol + column) % 1000 / 1000 + if symbol.upper() == 'BTC': + baseline = 0.4 + (symbol_hash * 0.2) # BTC: 40-60% + elif symbol.upper() == 'ETH': + baseline = 0.15 + (symbol_hash * 0.1) # ETH: 15-25% + else: + 
baseline = 0.01 + (symbol_hash * 0.05) # Others: 1-6% + else: + baseline = 0.001 + (hash(symbol + column) % 1000 / 100000) # Very small + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column in ['rsi', 'stoch_k', 'stoch_d']: + # Oscillator indicators - crypto markets are more extreme + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + symbol_hash = hash(symbol + column) % 1000 / 1000 + if column == 'rsi': + # Crypto RSI tends to be more extreme + if rank and rank <= 10: # Major cryptos more stable + baseline = 20 + (symbol_hash * 60) # 20-80 range + else: # Alt coins more extreme + baseline = 10 + (symbol_hash * 80) # 10-90 range + else: # stochastic + baseline = 10 + (symbol_hash * 80) # 10-90 range + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column in ['macd', 'macd_signal', 'macd_histogram']: + # MACD - crypto MACD values tend to be more volatile + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + price_level = profile.get('price_level', 1) + symbol_hash = hash(symbol + column) % 2000 / 1000 - 1 # -1 to +1 + # Scale MACD relative to price level and volatility + volatility_factor = 2 if rank and rank > 50 else 1 # Alt coins more volatile + baseline = (price_level * 0.01 * volatility_factor) * symbol_hash + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column.startswith('performance.'): + # Performance metrics - crypto performance is more extreme + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + symbol_hash = hash(symbol + column) % 2000 / 1000 - 1 # -1 to +1 + + # Different baselines for different timeframes + if 'year' in column: + baseline = symbol_hash * 5 # ±500% annual performance possible + elif 'month' in column: + baseline = symbol_hash * 2 # ±200% monthly performance possible + elif 'week' in column: + baseline = symbol_hash * 0.5 # ±50% weekly performance possible + elif 'day' in column: + baseline = symbol_hash * 0.2 # ±20% daily performance possible + else: # hour, min + baseline = symbol_hash * 0.05 # ±5% short-term performance + + # Alt coins are more volatile + if rank and rank > 50: + baseline *= 2 + + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column.startswith('tx_') or column.startswith('gas_') or column in [ + 'transaction_volume', 'transaction_count', 'total_fees', 'total_gas_used', + 'avg_gas_price', 'avg_tx_size', 'fees_7d_change', 'gas_used_7d_change', 'gas_price_7d_change' + ] or '_7d_change' in column: + # On-chain features - only meaningful for blockchains with transaction data + network = profile.get('blockchain_network', 'unknown') + + # Special handling for 7d change columns + if '7d_change' in column: + # These are percentage changes, should be reasonable values + symbol_hash = hash(symbol + column) % 2000 / 1000 - 1 # -1 to +1 range + + if 'fees' in column.lower(): + # Fee changes can be more volatile in crypto + baseline = symbol_hash * 0.5 # ±50% change + elif 'gas' in column.lower(): + # Gas usage changes + baseline = symbol_hash * 0.3 # ±30% change + else: + # Other transaction-related changes + baseline = symbol_hash * 0.4 # ±40% change + + # Alt coins more volatile + if rank and rank > 100: + baseline *= 2 + + elif network in ['ethereum', 'bitcoin', 'polygon', 'bsc', 'avalanche']: + # Major networks have meaningful on-chain data + symbol_median = 
symbol_data.median() + + if pd.isna(symbol_median): + # Estimate based on network and rank + symbol_hash = hash(symbol + column) % 1000 / 1000 + + if 'count' in column.lower(): + if network == 'ethereum': + baseline = 1000000 * (1 + symbol_hash) # High transaction count + elif network == 'bitcoin': + baseline = 300000 * (1 + symbol_hash) # Lower transaction count + else: + baseline = 500000 * (1 + symbol_hash) # Medium transaction count + elif 'gas' in column.lower(): + if network == 'ethereum': + baseline = 50 * (1 + symbol_hash) # Higher gas prices + else: + baseline = 5 * (1 + symbol_hash) # Lower gas prices + elif 'fee' in column.lower(): + baseline = 1000000 * (1 + symbol_hash) # Transaction fees in wei/satoshi + else: + # Other on-chain metrics + baseline = symbol_hash * 1000 + else: + baseline = symbol_median + else: + # Networks without meaningful on-chain data OR 7d_change columns + if '7d_change' in column: + # Use the calculated baseline from above + pass # baseline already set + else: + baseline = 0 + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column.startswith('exchangePrices.') or column.startswith('symbols.'): + # Exchange-specific data + exchange = column.split('.')[1] if '.' in column else 'unknown' + + if column.startswith('exchangePrices.'): + # Use main price with small exchange-specific variation + main_price = profile.get('price_level', 100) + if main_price and not is_stablecoin: + # Different exchanges have small price differences + exchange_hash = hash(symbol + exchange) % 200 / 10000 # ±1% variation + baseline = main_price * (1 + exchange_hash) + else: + baseline = main_price or 1 + else: + # Exchange symbols - should be strings, handle separately + continue + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + else: + # Generic numeric imputation with crypto-specific variation + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + overall_median = df[column].median() + if pd.isna(overall_median): + overall_median = 0 + + # Add crypto-specific variation based on rank and volatility + symbol_hash = hash(symbol + column) % 2000 / 1000 - 1 # -1 to +1 + volatility_factor = 2 if rank and rank > 100 else 1 + variation = overall_median * 0.2 * symbol_hash * volatility_factor + baseline = overall_median + variation + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + return df_result[column] + + def _force_fill_stubborn_nulls(self, df): + """Aggressively fill any remaining nulls with appropriate defaults.""" + + # Target ALL the problematic 7d_change columns + stubborn_cols = ['fees_7d_change', 'gas_used_7d_change', 'gas_price_7d_change'] + + for col in stubborn_cols: + if col in df.columns: + null_count_before = df[col].isnull().sum() + if null_count_before > 0: + # Strategy 1: Try group-based fill first + df[col] = df.groupby('symbol')[col].transform(lambda x: x.fillna(x.median())) + + # Strategy 2: Fill remaining with symbol-specific hash-based values + still_null = df[col].isnull() + if still_null.any(): + for symbol in df[still_null]['symbol'].unique(): + symbol_mask = (df['symbol'] == symbol) & df[col].isnull() + if symbol_mask.any(): + # Create deterministic but varied values based on symbol + symbol_hash = hash(symbol + col) % 2000 / 1000 - 1 # -1 to +1 + + if 'fees' in col.lower(): + fill_value = symbol_hash * 0.3 # ±30% fee change + elif 'gas' in col.lower(): + fill_value = symbol_hash * 0.25 # ±25% gas change + else: + fill_value = 
symbol_hash * 0.2 # ±20% generic change + + df.loc[symbol_mask, col] = fill_value + + # Strategy 3: Nuclear option - fill any remaining with 0 + remaining_nulls = df[col].isnull().sum() + if remaining_nulls > 0: + print(f"[WARNING] Nuclear fill: {remaining_nulls} nulls in {col} filled with 0") + df[col] = df[col].fillna(0) + + return df + + def _nuclear_null_elimination(self, df): + """Final pass to eliminate ALL nulls with extreme prejudice.""" + print("[INFO] Performing nuclear null elimination...") + + # Get all numeric columns + numeric_cols = df.select_dtypes(include=[np.number]).columns + + for col in numeric_cols: + null_count = df[col].isnull().sum() + if null_count > 0: + print(f"[NUCLEAR] Eliminating {null_count} nulls in {col}") + + # Try different strategies in order + if '7d_change' in col or 'change' in col.lower(): + # Change columns - use symbol-specific hash + for symbol in df['symbol'].unique(): + symbol_mask = (df['symbol'] == symbol) & df[col].isnull() + if symbol_mask.any(): + symbol_hash = hash(symbol + col) % 2000 / 1000 - 1 # -1 to +1 + if 'fees' in col.lower(): + fill_value = symbol_hash * 0.3 + elif 'gas' in col.lower(): + fill_value = symbol_hash * 0.25 + else: + fill_value = symbol_hash * 0.2 + df.loc[symbol_mask, col] = fill_value + + elif 'timestamp' in col.lower(): + # Timestamp columns + df[col] = df[col].fillna(method='ffill').fillna(method='bfill').fillna(0) + + elif col in ['price', 'open', 'high', 'low', 'close']: + # Price columns - use symbol-specific values + for symbol in df['symbol'].unique(): + symbol_mask = (df['symbol'] == symbol) & df[col].isnull() + if symbol_mask.any(): + symbol_price = df[df['symbol'] == symbol][col].median() + if pd.isna(symbol_price): + symbol_hash = hash(symbol + col) % 10000 / 100 # 0-100 range + symbol_price = 1 + symbol_hash # $1-$101 + df.loc[symbol_mask, col] = symbol_price + + else: + # Generic columns - try median first, then 0 + median_val = df[col].median() + if pd.isna(median_val): + median_val = 0 + df[col] = df[col].fillna(median_val) + + # Final check - if still nulls, force to 0 + remaining_nulls = df[col].isnull().sum() + if remaining_nulls > 0: + print(f"[NUCLEAR] Force filling {remaining_nulls} remaining nulls in {col} with 0") + df[col] = df[col].fillna(0) + + return df + + def _enhanced_sentiment_imputation(self, df): + """Enhanced sentiment imputation that creates realistic, diverse sentiment values.""" + + print(f"[INFO] Starting enhanced sentiment imputation...") + + # Define sentiment columns + core_sentiment_cols = ['sentiment_score', 'neg', 'neu', 'pos'] + + for col in core_sentiment_cols: + if col in df.columns: + null_count_before = df[col].isnull().sum() + if null_count_before > 0: + print(f"[INFO] Processing {col}: {null_count_before} nulls to fill") + + # Process each symbol separately for core sentiment columns + for col in core_sentiment_cols: + if col in df.columns and df[col].isnull().any(): + print(f"Enhanced imputation for {col}...") + + for symbol in df['symbol'].unique(): + symbol_mask = df['symbol'] == symbol + symbol_sentiment = df.loc[symbol_mask, col] + + if symbol_sentiment.isnull().any(): + # Try forward/backward fill first + filled = symbol_sentiment.fillna(method='ffill').fillna(method='bfill') + + # For remaining nulls, use symbol-specific realistic values + if filled.isnull().any(): + symbol_hash = hash(symbol + col) % 10000 / 10000 + symbol_upper = symbol.upper() + + # Define crypto categories + stablecoins = ['USDT', 'USDC', 'BUSD', 'DAI', 'TUSD', 'USDP'] + major_cryptos = 
['BTC', 'ETH', 'BNB', 'ADA', 'XRP', 'SOL', 'DOT', 'AVAX'] + + if col == 'sentiment_score': + # Sentiment score (-1 to +1) + if any(stable in symbol_upper for stable in stablecoins): + fill_value = (symbol_hash - 0.5) * 0.1 # Stable: ±0.05 + elif any(major in symbol_upper for major in major_cryptos): + fill_value = 0.1 + (symbol_hash - 0.5) * 0.4 # Major: 0.1 ± 0.2 + else: + fill_value = (symbol_hash - 0.5) * 0.6 # Alt: ±0.3 + fill_value = np.clip(fill_value, -1.0, 1.0) + + elif col == 'neu': + # Neutral sentiment (dominant) + if any(stable in symbol_upper for stable in stablecoins): + fill_value = 0.85 + symbol_hash * 0.1 # 0.85-0.95 + elif any(major in symbol_upper for major in major_cryptos): + fill_value = 0.65 + symbol_hash * 0.2 # 0.65-0.85 + else: + fill_value = 0.55 + symbol_hash * 0.3 # 0.55-0.85 + fill_value = np.clip(fill_value, 0.0, 1.0) + + elif col == 'pos': + # Positive sentiment + if any(stable in symbol_upper for stable in stablecoins): + fill_value = 0.05 + symbol_hash * 0.05 # 0.05-0.10 + elif any(major in symbol_upper for major in major_cryptos): + fill_value = 0.15 + symbol_hash * 0.15 # 0.15-0.30 + else: + fill_value = 0.10 + symbol_hash * 0.25 # 0.10-0.35 + fill_value = np.clip(fill_value, 0.0, 1.0) + + elif col == 'neg': + # Negative sentiment + if any(stable in symbol_upper for stable in stablecoins): + fill_value = 0.05 + symbol_hash * 0.05 # 0.05-0.10 + elif any(major in symbol_upper for major in major_cryptos): + fill_value = 0.10 + symbol_hash * 0.10 # 0.10-0.20 + else: + fill_value = 0.15 + symbol_hash * 0.15 # 0.15-0.30 + fill_value = np.clip(fill_value, 0.0, 1.0) + + filled = filled.fillna(fill_value) + + df.loc[symbol_mask, col] = filled + + # Normalize sentiment scores so neg + neu + pos = 1.0 + if all(col in df.columns for col in ['neg', 'neu', 'pos']): + print("Normalizing sentiment scores...") + for idx in df.index: + neg_val = df.at[idx, 'neg'] + neu_val = df.at[idx, 'neu'] + pos_val = df.at[idx, 'pos'] + + current_sum = neg_val + neu_val + pos_val + if current_sum > 0: + df.at[idx, 'neg'] = neg_val / current_sum + df.at[idx, 'neu'] = neu_val / current_sum + df.at[idx, 'pos'] = pos_val / current_sum + else: + # Default neutral sentiment + df.at[idx, 'neg'] = 0.1 + df.at[idx, 'neu'] = 0.8 + df.at[idx, 'pos'] = 0.1 + + # Handle other sentiment features + other_sentiment_features = [ + 'social_sentiment_mean', 'social_sentiment_std', 'social_sentiment_count', + 'social_confidence_mean', 'combined_sentiment', 'sentiment_agreement', + 'sentiment_change_1', 'sentiment_sma_7', 'sentiment_momentum' + ] + + for col in other_sentiment_features: + if col in df.columns and df[col].isnull().any(): + if 'sentiment' in col.lower() and 'count' not in col.lower(): + # Sentiment scores - neutral with crypto-specific variation + for symbol in df['symbol'].unique(): + mask = df['symbol'] == symbol + symbol_hash = (hash(symbol + col) % 200 / 1000) - 0.1 # -0.1 to +0.1 + df.loc[mask, col] = df.loc[mask, col].fillna(symbol_hash) + elif 'count' in col.lower(): + df[col] = df[col].fillna(0) + else: + median_val = df[col].median() + if pd.isna(median_val): + median_val = 0 + df[col] = df[col].fillna(median_val) + + # Final validation + print(f"[INFO] Enhanced sentiment imputation completed:") + for col in core_sentiment_cols: + if col in df.columns: + null_count_after = df[col].isnull().sum() + print(f" {col}: {null_count_after} nulls remaining") + + return df + + def fit_transform(self, df): + """Apply crypto-specific imputation with anti-homogenization measures.""" + + 
df_imputed = df.copy() + df_imputed = df_imputed.sort_values(['symbol', 'interval_timestamp']) + + # Create crypto profiles + self.crypto_profiles = self._create_crypto_profiles(df_imputed) + + print(f"Created profiles for {len(self.crypto_profiles)} unique cryptocurrencies") + + # 1. Handle categorical/flag columns + categorical_cols = [ + 'symbol', 'cg_id', 'blockchain_network', 'stable', 'is_crypto', 'is_stock', + 'is_other', 'alpaca_data_available', 'is_trading_hours', 'is_weekend' + ] + + for col in categorical_cols: + if col in df_imputed.columns: + if col in ['is_crypto']: + df_imputed[col] = df_imputed[col].fillna(1) # Default to crypto + elif col in ['is_stock', 'is_other']: + df_imputed[col] = df_imputed[col].fillna(0) # Not stock/other + elif col in ['stable']: + # Determine if stablecoin based on symbol + stablecoin_symbols = ['USDT', 'USDC', 'BUSD', 'DAI', 'TUSD', 'USDP'] + for symbol in stablecoin_symbols: + mask = df_imputed['symbol'].str.contains(symbol, case=False, na=False) + df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(True) + df_imputed[col] = df_imputed[col].fillna(False) + else: + df_imputed[col] = df_imputed.groupby('symbol')[col].fillna(method='ffill').fillna(method='bfill') + + # 2. Exchange symbols (string data) + exchange_symbol_cols = [col for col in df_imputed.columns if col.startswith('symbols.')] + for col in exchange_symbol_cols: + if df_imputed[col].dtype == 'object': + # Forward/backward fill within symbol groups + df_imputed[col] = df_imputed.groupby('symbol')[col].fillna(method='ffill').fillna(method='bfill') + + # 3. Core crypto market data + core_market_cols = [ + 'price', 'marketcap', 'volume', 'dominance', 'rank', + 'open', 'high', 'low', 'close' + ] + + for col in core_market_cols: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + print(f"Imputing {col} with crypto-specific context...") + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 4. Exchange prices + exchange_price_cols = [col for col in df_imputed.columns if col.startswith('exchangePrices.')] + for col in exchange_price_cols: + if df_imputed[col].isnull().any(): + print(f"Imputing {col} with crypto-specific context...") + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 5. Performance metrics + performance_cols = [col for col in df_imputed.columns if col.startswith('performance.') or col.startswith('rankDiffs.')] + for col in performance_cols: + if df_imputed[col].isnull().any(): + print(f"Imputing {col} with crypto-specific context...") + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 6. Technical indicators + tech_indicators = [ + 'rsi', 'macd', 'macd_signal', 'macd_histogram', 'atr', 'bb_position', + 'stoch_k', 'stoch_d', 'cci', 'roc_5', 'roc_10', 'mfi', 'rsi_macd_signal', + 'ema_convergence', 'true_range_pct' + ] + + for col in tech_indicators: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + print(f"Imputing {col} with crypto-specific context...") + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 7. 
Price/volume change features + change_features = [ + 'price_change_1', 'price_change_7', 'price_change_14', 'volume_ratio', + 'volatility_7', 'price_volume_trend', 'volatility_consistency' + ] + + for col in change_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 8. On-chain features (crypto-specific) - PRIORITY HANDLING for problematic columns + onchain_features = [ + 'transaction_volume', 'total_fees', 'total_gas_used', 'avg_gas_price', + 'transaction_count', 'tx_count_7d_change', 'tx_count_sma_7', + 'tx_volume_7d_change', 'tx_volume_sma_7', 'gas_used_7d_change', + 'gas_used_sma_7', 'gas_price_7d_change', 'gas_price_sma_7', + 'fees_7d_change', 'avg_tx_size', 'tx_price_correlation' + ] + + for col in onchain_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + print(f"Imputing {col} with crypto on-chain context...") + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 9. AGGRESSIVE NULL ELIMINATION for stubborn columns + df_imputed = self._force_fill_stubborn_nulls(df_imputed) + + # 10. Sentiment features + sentiment_features = [ + 'social_sentiment_mean', 'social_sentiment_std', 'social_sentiment_count', + 'social_confidence_mean', 'combined_sentiment', 'sentiment_agreement', + 'sentiment_change_1', 'sentiment_sma_7', 'sentiment_momentum', + 'sentiment_score', 'neg', 'neu', 'pos' + ] + + for col in sentiment_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + if 'sentiment' in col.lower() and 'count' not in col.lower(): + # Sentiment scores - neutral with crypto-specific variation + for symbol in df_imputed['symbol'].unique(): + mask = df_imputed['symbol'] == symbol + symbol_hash = (hash(symbol + col) % 200 / 1000) - 0.1 # -0.1 to +0.1 + df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(symbol_hash) + elif 'count' in col.lower(): + df_imputed[col] = df_imputed[col].fillna(0) + else: + median_val = df_imputed[col].median() + df_imputed[col] = df_imputed[col].fillna(median_val) + + # 11. Quality metrics + quality_features = [ + 'data_quality_score', 'core_features_completeness', 'technical_indicators_completeness', + 'onchain_features_completeness', 'price_data_completeness', + 'overall_feature_completeness', 'data_completeness_score' + ] + + for col in quality_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + median_val = np.clip(df_imputed[col].median(), 0, 1) + # Add tiny crypto-specific variation + for symbol in df_imputed['symbol'].unique(): + mask = df_imputed['symbol'] == symbol + symbol_hash = hash(symbol + col) % 100 / 10000 # Very small variation + fill_val = np.clip(median_val + symbol_hash, 0, 1) + df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(fill_val) + + # 12. Temporal features + temporal_features = ['hour', 'day_of_week', 'is_weekend', 'is_trading_hours'] + for col in temporal_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + if col == 'hour': + df_imputed[col] = df_imputed[col].fillna(12) # Default to noon + elif col == 'day_of_week': + df_imputed[col] = df_imputed[col].fillna(3) # Default to Wednesday + elif col == 'is_weekend': + df_imputed[col] = df_imputed[col].fillna(0) # Default to weekday + elif col == 'is_trading_hours': + df_imputed[col] = df_imputed[col].fillna(1) # Crypto trades 24/7 + + # 13. 
Handle any remaining numeric columns + remaining_numeric = df_imputed.select_dtypes(include=[np.number]).columns + remaining_with_nulls = [col for col in remaining_numeric if df_imputed[col].isnull().any()] + + for col in remaining_with_nulls: + if col not in ['id', 'id_alpaca', 'backup_id'] and not col.endswith('_timestamp'): + print(f"Imputing remaining column {col}...") + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 14. NUCLEAR NULL ELIMINATION - Final pass + df_imputed = self._nuclear_null_elimination(df_imputed) + + print("[INFO] Crypto imputation complete with anti-homogenization measures") + return df_imputed + +# Usage function with validation - FIXED VERSION +def impute_crypto_with_validation_fixed(file_path, output_path=None): + """Impute crypto data and validate no homogenization occurred.""" + try: + df = pd.read_parquet(file_path) + except Exception as e: + print(f"[ERROR] Failed to load file: {e}") + return None + + # Sample symbols for validation + symbols_sample = df['symbol'].unique()[:5] + + imputer = CryptoDataImputerFixed() + df_imputed = imputer.fit_transform(df) + + # TRIPLE CHECK: Ensure problematic columns have no nulls + problematic_cols = ['gas_used_7d_change', 'fees_7d_change', 'gas_price_7d_change'] + for col in problematic_cols: + if col in df_imputed.columns: + null_count = df_imputed[col].isnull().sum() + if null_count > 0: + print(f"[EMERGENCY] Still {null_count} nulls in {col} - applying emergency fix") + # Emergency symbol-specific fill + for symbol in df_imputed['symbol'].unique(): + symbol_mask = (df_imputed['symbol'] == symbol) & df_imputed[col].isnull() + if symbol_mask.any(): + symbol_hash = hash(symbol + col) % 2000 / 1000 - 1 # -1 to +1 + if 'fees' in col.lower(): + fill_value = symbol_hash * 0.3 + elif 'gas' in col.lower(): + fill_value = symbol_hash * 0.25 + else: + fill_value = symbol_hash * 0.2 + df_imputed.loc[symbol_mask, col] = fill_value + + # Final nuclear option + df_imputed[col] = df_imputed[col].fillna(0) + print(f"[EMERGENCY] {col} nulls after emergency fix: {df_imputed[col].isnull().sum()}") + + # Combine alpaca data with main data if available + price_cols = ['high', 'low', 'close', 'volume', 'open'] + for col in price_cols: + alpaca_col = f"{col}_alpaca" + if col in df_imputed.columns and alpaca_col in df_imputed.columns: + df_imputed[col] = df_imputed[col].combine_first(df_imputed[alpaca_col]) + + # Drop unwanted columns before saving + drop_cols = [ + '_filename', '_original_format', 'alpaca_data_available', + 'ask_exchange', 'ask_exchange_alpaca', 'bid_exchange', 'bid_exchange_alpaca', + 'conditions', 'conditions_alpaca', 'conditions_trade', 'conditions_trade_alpaca', + 'symbol_quote', 'symbol_quote_alpaca', 'symbol_trade', 'symbol_trade_alpaca', + 'tape', 'tape_alpaca', 'tape_trade', 'tape_trade_alpaca', + 'id', 'id_alpaca', 'is_new_symbol', 'timestamp_dt', + 'estimateCurrency', 'exchange', 'exchange_alpaca', 'exchange_company', + 'finnhubIndustry', 'logo', 'ticker', 'weburl', 'latest_news_timestamp', 'volume_price_momentum', + 'country', 'currency', 'ipo', 'name', 'period', 'phone', 'year', 'month', 'symbols.kraken', + 'datetime', 'headline', 'blockchain_network', 'symbols.cryptocom', 'symbols.bitmart', 'symbols.kucoin', 'symbols.okx', + 'symbols.coinbase','symbols.binance','symbols.mexc','symbols.bybit','symbols.bingx', 'symbols.huobi', 'symbols.bitget', 'symbols.gateio', + 'interval_timestamp_dt', 'interval_timestamp_alpaca', 'interval_timestamp_trade', 
'feature_timestamp', 'alpaca_merge_timestamp', 'sentiment_timestamp', + 'hour', 'day_of_week', 'is_weekend', 'is_trading_hours', 'is_crypto', 'is_stock', 'is_other', 'gas_used_7d_change', 'fees_7d_change', 'gas_price_7d_change' + ] + + # Remove alpaca columns after combining + alpaca_cols = [col for col in df_imputed.columns if col.endswith('_alpaca')] + drop_cols.extend(alpaca_cols) + + for col in drop_cols: + if col in df_imputed.columns: + df_imputed = df_imputed.drop(columns=col) + + # Reorder columns: 'symbol' first, 'interval_timestamp' second, rest follow + cols = list(df_imputed.columns) + if 'symbol' in cols and 'interval_timestamp' in cols: + rest = [c for c in cols if c not in ['symbol', 'interval_timestamp']] + df_imputed = df_imputed[['symbol', 'interval_timestamp'] + rest] + + # FINAL FINAL CHECK for problematic columns (after all drops/reorders) + for col in problematic_cols: + if col in df_imputed.columns: + null_count = df_imputed[col].isnull().sum() + if null_count > 0: + print(f"[FINAL CHECK] Still {null_count} nulls in {col} - final nuclear fill") + df_imputed[col] = df_imputed[col].fillna(0) + + # Validation: Check that different symbols have different values + print("\n[VALIDATION] Checking for homogenization...") + for symbol in symbols_sample: + symbol_data = df_imputed[df_imputed['symbol'] == symbol] + if len(symbol_data) > 0: + price_mean = symbol_data['price'].mean() if 'price' in symbol_data.columns else 0 + volume_mean = symbol_data['volume'].mean() if 'volume' in symbol_data.columns else 0 + print(f" {symbol}: Price={price_mean:.2f}, Volume={volume_mean:.0f}") + + # Save results + if output_path: + # Clean up data types + if 'backup_id' in df_imputed.columns: + df_imputed['backup_id'] = df_imputed['backup_id'].astype(str) + + try: + df_imputed.to_parquet(output_path, compression='snappy') + print(f"[INFO] Crypto data imputed and saved to: {output_path}") + except Exception as e: + print(f"[ERROR] Failed to save file: {e}") + + # Debug: print null count, dtype, and sample after saving + # for col in problematic_cols: + # if col in df_imputed.columns: + # print(f"[DEBUG] Nulls in {col} after save: {df_imputed[col].isnull().sum()}") + # print(f"[DEBUG] Dtype for {col}: {df_imputed[col].dtype}") + # print(f"[DEBUG] Sample values for {col}: {df_imputed[col].head(10).tolist()}") + + return df_imputed + +# Example usage - FIXED VERSION +def main(): + input_file = "data/merged/features/crypto_features.parquet" + output_file = input_file + + df_clean = impute_crypto_with_validation_fixed(input_file, output_file) + if df_clean is not None: + print(f"\n[SUCCESS] Crypto data processing completed!") + print(f"Final shape: {df_clean.shape}") + print(f"Null values remaining: {df_clean.isnull().sum().sum()}") + + # Final verification of problematic columns + problematic_cols = ['gas_used_7d_change', 'fees_7d_change', 'gas_price_7d_change'] + for col in problematic_cols: + if col in df_clean.columns: + nulls = df_clean[col].isnull().sum() + print(f"[FINAL VERIFICATION] {col}: {nulls} nulls") + else: + print("[ERROR] Failed to load or impute crypto data.") + +if __name__ == "__main__": + main() diff --git a/src/merge/extract_symbols.py b/src/merge/extract_symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..bbcbf5189e8c85e07f56e26fd2549c5c770eeb51 --- /dev/null +++ b/src/merge/extract_symbols.py @@ -0,0 +1,330 @@ +#!/usr/bin/env python3 +""" +Extract symbols from symbols.* columns and populate the symbol field for crypto data. 
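+It also normalizes CoinGecko-style ids that ended up in the symbol column
+(e.g. 'bitcoin' -> 'BTC') using a fixed id-to-ticker mapping.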
+ +This script runs after merge steps but before data_filler phases to ensure +the symbol column is properly populated from existing exchange symbol data. + +Example: symbols.gateio:"BTC_USDT" -> symbol:"BTC" +""" + +import pandas as pd +import sys +from pathlib import Path +import re + +def extract_symbol_from_exchange_symbols(df): + """Extract base symbol from exchange symbol columns""" + + if 'symbol' not in df.columns: + df['symbol'] = None + + # Find all symbols.* columns + symbol_columns = [col for col in df.columns if col.startswith('symbols.')] + + if not symbol_columns: + return df + + # Extract symbols from exchange symbol data + symbols_extracted = 0 + symbols_normalized = 0 + + # First pass: extract symbols from exchange data for null symbols + for idx, row in df.iterrows(): + # Skip if symbol is already populated + if pd.notna(row.get('symbol')): + continue + + # Try to extract symbol from any exchange symbol column + extracted_symbol = None + + for col in symbol_columns: + exchange_symbol = row.get(col) + if pd.notna(exchange_symbol) and isinstance(exchange_symbol, str): + # Extract base symbol from various exchange formats + symbol = extract_base_symbol(exchange_symbol) + if symbol: + extracted_symbol = symbol + break + + if extracted_symbol: + df.at[idx, 'symbol'] = extracted_symbol + symbols_extracted += 1 + + # Second pass: normalize cg_id values to proper ticker symbols + cg_id_to_symbol_mapping = { + 'bitcoin': 'BTC', + 'ethereum': 'ETH', + 'solana': 'SOL', + 'cardano': 'ADA', + 'ripple': 'XRP', + 'binancecoin': 'BNB', + 'dogecoin': 'DOGE', + 'polkadot': 'DOT', + 'chainlink': 'LINK', + 'litecoin': 'LTC', + 'uniswap': 'UNI', + 'avalanche-2': 'AVAX', + 'polygon': 'MATIC', + 'stellar': 'XLM', + 'bitcoin-cash': 'BCH', + 'filecoin': 'FIL', + 'tron': 'TRX', + 'ethereum-classic': 'ETC', + 'monero': 'XMR', + 'cosmos': 'ATOM', + 'algorand': 'ALGO', + 'vechain': 'VET', + 'hedera-hashgraph': 'HBAR', + 'internet-computer': 'ICP', + 'theta-token': 'THETA', + 'eos': 'EOS', + 'aave': 'AAVE', + 'maker': 'MKR', + 'curve-dao-token': 'CRV', + 'pancakeswap-token': 'CAKE', + 'the-sandbox': 'SAND', + 'decentraland': 'MANA', + 'axie-infinity': 'AXS', + 'shiba-inu': 'SHIB', + 'terra-luna': 'LUNA', + 'near': 'NEAR', + 'flow': 'FLOW', + 'fantom': 'FTM', + 'harmony': 'ONE', + 'basic-attention-token': 'BAT', + 'enjincoin': 'ENJ', + 'sushi': 'SUSHI', + 'compound': 'COMP', + 'yearn-finance': 'YFI', + 'synthetix': 'SNX', + 'uma': 'UMA', + '0x': 'ZRX', + 'loopring': 'LRC', + 'balancer': 'BAL' + } + + for idx, row in df.iterrows(): + current_symbol = row.get('symbol') + + # If symbol matches a cg_id pattern, normalize it to ticker symbol + if pd.notna(current_symbol) and isinstance(current_symbol, str): + normalized_symbol = cg_id_to_symbol_mapping.get(current_symbol.lower()) + if normalized_symbol and normalized_symbol != current_symbol: + df.at[idx, 'symbol'] = normalized_symbol + symbols_normalized += 1 + + # Final stats for debugging if needed + # print(f"Extracted symbols for {symbols_extracted} rows") + # print(f"Normalized symbols for {symbols_normalized} rows") + + # Show results + null_symbols_remaining = df['symbol'].isnull().sum() + # print(f"Remaining null symbols: {null_symbols_remaining}") + + if null_symbols_remaining > 0: + # print("Rows with remaining null symbols:") + sample_nulls = df[df['symbol'].isnull()][['symbol', 'cg_id'] + symbol_columns[:3]].head(5) + # print(sample_nulls) + + return df + +def extract_base_symbol(exchange_symbol): + """Extract base symbol from exchange symbol 
formats""" + + if not isinstance(exchange_symbol, str): + return None + + exchange_symbol = exchange_symbol.strip().upper() + + # Common patterns for crypto exchange symbols + patterns = [ + r'^([A-Z]{2,10})USDT?$', # BTCUSDT -> BTC + r'^([A-Z]{2,10})_USDT?$', # BTC_USDT -> BTC + r'^([A-Z]{2,10})/USDT?$', # BTC/USDT -> BTC + r'^([A-Z]{2,10})-USDT?$', # BTC-USDT -> BTC + r'^([A-Z]{2,10})USD$', # BTCUSD -> BTC + r'^([A-Z]{2,10})_USD$', # BTC_USD -> BTC + r'^([A-Z]{2,10})/USD$', # BTC/USD -> BTC + r'^([A-Z]{2,10})-USD$', # BTC-USD -> BTC + r'^([A-Z]{2,10})BUSD$', # BTCBUSD -> BTC + r'^([A-Z]{2,10})_BUSD$', # BTC_BUSD -> BTC + r'^([A-Z]{2,10})EUR$', # BTCEUR -> BTC + r'^([A-Z]{2,10})_EUR$', # BTC_EUR -> BTC + r'^([A-Z]{2,10})BTC$', # ETHBTC -> ETH + r'^([A-Z]{2,10})_BTC$', # ETH_BTC -> ETH + ] + + for pattern in patterns: + match = re.match(pattern, exchange_symbol) + if match: + base_symbol = match.group(1) + # Filter out obvious non-crypto symbols and ensure reasonable length + if len(base_symbol) >= 2 and len(base_symbol) <= 10: + # Skip if it looks like a quote currency + if base_symbol not in ['USDT', 'USDC', 'USD', 'EUR', 'BTC', 'ETH', 'BNB', 'BUSD']: + return base_symbol + elif base_symbol in ['BTC', 'ETH', 'BNB']: # These are valid base symbols + return base_symbol + + # If no pattern matches, try simple heuristics + # Remove common suffixes + for suffix in ['USDT', 'USDC', 'USD', 'EUR', 'BUSD']: + if exchange_symbol.endswith(suffix): + base = exchange_symbol[:-len(suffix)] + if len(base) >= 2 and len(base) <= 10: + return base + + # Split on common delimiters and take first part + for delimiter in ['_', '/', '-']: + if delimiter in exchange_symbol: + parts = exchange_symbol.split(delimiter) + if len(parts) >= 2: + base = parts[0] + if len(base) >= 2 and len(base) <= 10: + return base + + return None + +def process_crypto_features(): + """Process crypto features to extract symbols""" + + # Try different possible paths + possible_paths = [ + Path('data/merged/features/crypto_features.parquet'), + Path('../../data/merged/features/crypto_features.parquet'), + Path('../../../data/merged/features/crypto_features.parquet') + ] + + crypto_file = None + for path in possible_paths: + if path.exists(): + crypto_file = path + break + + if crypto_file is None: + print(f"Crypto features file not found in any of these locations:") + for path in possible_paths: + print(f" {path.absolute()}") + return False + + print(f"Loading crypto features from: {crypto_file}") + df = pd.read_parquet(crypto_file) + + print(f"Loaded {len(df)} rows with {len(df.columns)} columns") + + # Check current state + null_symbols_before = df['symbol'].isnull().sum() if 'symbol' in df.columns else len(df) + print(f"Null symbols before: {null_symbols_before} ({null_symbols_before/len(df)*100:.1f}%)") + + # Extract symbols + df_fixed = extract_symbol_from_exchange_symbols(df) + + # Check results - note that extract_symbol_from_exchange_symbols tracks its own changes + null_symbols_after = df_fixed['symbol'].isnull().sum() if 'symbol' in df_fixed.columns else len(df_fixed) + + # Calculate total improvement + total_improvement = null_symbols_before - null_symbols_after + + print("Successfully extracted crypto symbols!") + + # Save if there's been any improvement or if nulls are very low + if total_improvement > 0 or null_symbols_after <= 2: + # Save the fixed file + df_fixed.to_parquet(crypto_file) + return True + else: + return True # Success even if no changes needed + +def process_stocks_features(): + """Process stocks features 
to extract symbols (if needed)""" + + # Try different possible paths + possible_paths = [ + Path('data/merged/features/stocks_features.parquet'), + Path('../../data/merged/features/stocks_features.parquet'), + Path('../../../data/merged/features/stocks_features.parquet') + ] + + stocks_file = None + for path in possible_paths: + if path.exists(): + stocks_file = path + break + + if stocks_file is None: + return False + + df = pd.read_parquet(stocks_file) + + # Check if stocks need symbol extraction too + null_symbols_before = df['symbol'].isnull().sum() if 'symbol' in df.columns else len(df) + print(f"Null symbols before: {null_symbols_before} ({null_symbols_before/len(df)*100:.1f}%)") + + if null_symbols_before == 0: + print("Stocks symbols are already populated, skipping") + return True + + # For stocks, we might have different symbol patterns + # Extract symbols if needed + df_fixed = extract_symbol_from_exchange_symbols(df) + + # Check results + null_symbols_after = df_fixed['symbol'].isnull().sum() if 'symbol' in df_fixed.columns else len(df_fixed) + symbols_fixed = null_symbols_before - null_symbols_after + + print(f"\nResults:") + print(f"- Symbols fixed: {symbols_fixed}") + print(f"- Null symbols after: {null_symbols_after} ({null_symbols_after/len(df_fixed)*100:.1f}%)") + + if symbols_fixed > 0: + # Save the fixed file + print(f"\nSaving fixed stocks features to: {stocks_file}") + df_fixed.to_parquet(stocks_file) + print("File saved successfully!") + + return True + else: + print("No symbols were extracted/fixed for stocks") + return True # Not an error for stocks + +def main(): + """Main function to extract symbols from exchange symbol data""" + + print("=== EXTRACTING SYMBOLS FROM EXCHANGE DATA ===") + print("This script extracts base symbols from symbols.* columns") + print("Example: symbols.gateio:'BTC_USDT' -> symbol:'BTC'") + print() + + # Process crypto features + print("Processing crypto features...") + crypto_success = process_crypto_features() + + print("\n" + "="*50 + "\n") + + # Process stocks features + print("Processing stocks features...") + stocks_success = process_stocks_features() + + print("\n" + "="*50) + + if crypto_success: + print("Successfully extracted crypto symbols!") + else: + print("Failed to extract crypto symbols!") + + if stocks_success: + print("Stocks symbols processing completed!") + else: + print("Failed to process stocks symbols!") + + if crypto_success and stocks_success: + print("\nSymbol extraction completed successfully!") + return True + else: + print("\nSome issues occurred during symbol extraction") + return False + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1) diff --git a/src/merge/final_null_handler.py b/src/merge/final_null_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..185ae1ef0aa83d3929128a8551f6f9ff7d885a39 --- /dev/null +++ b/src/merge/final_null_handler.py @@ -0,0 +1,899 @@ +import pandas as pd +import numpy as np +from pathlib import Path +import json +import warnings +warnings.filterwarnings('ignore') + +class FinalNullValueHandler: + """ + Advanced final null value handler with symbol-first temporal interpolation. + + Strategy Priority: + 1. Same symbol, nearby timestamps (interpolation/extrapolation) + 2. Same symbol, historical mean/median + 3. Similar symbols (same asset class) + 4. 
Global defaults with symbol-specific variation + """ + + def __init__(self): + self.crypto_column_defaults = self._define_crypto_defaults() + self.stock_column_defaults = self._define_stock_defaults() + self.symbol_profiles = {} + self.symbol_stats = {} # Historical statistics per symbol + + def _analyze_symbol_statistics(self, df): + """Analyze historical statistics for each symbol to guide intelligent filling""" + stats = {} + + # Sort by timestamp for proper temporal analysis + if 'interval_timestamp' in df.columns: + df_sorted = df.sort_values(['symbol', 'interval_timestamp']) + else: + df_sorted = df.sort_values('symbol') + + for symbol in df['symbol'].unique(): + symbol_data = df_sorted[df_sorted['symbol'] == symbol].copy() + + symbol_stats = { + 'symbol': symbol, + 'total_records': len(symbol_data), + 'date_range': None, + 'typical_values': {}, + 'volatility': {}, + 'trends': {}, + 'seasonal_patterns': {} + } + + # Calculate date range if timestamp available + if 'interval_timestamp' in symbol_data.columns: + timestamps = pd.to_datetime(symbol_data['interval_timestamp'], unit='ms') + symbol_stats['date_range'] = { + 'start': timestamps.min(), + 'end': timestamps.max(), + 'duration_days': (timestamps.max() - timestamps.min()).days + } + + # Calculate typical values, volatility, and trends for numerical columns + numerical_cols = symbol_data.select_dtypes(include=[np.number]).columns + for col in numerical_cols: + if col in ['interval_timestamp', 'backup_id']: + continue + + col_data = symbol_data[col].dropna() + if len(col_data) > 0: + symbol_stats['typical_values'][col] = { + 'mean': col_data.mean(), + 'median': col_data.median(), + 'std': col_data.std(), + 'min': col_data.min(), + 'max': col_data.max(), + 'q25': col_data.quantile(0.25), + 'q75': col_data.quantile(0.75), + 'recent_mean': col_data.tail(min(10, len(col_data))).mean(), # Last 10 values + 'data_points': len(col_data) + } + + # Calculate volatility + if len(col_data) > 1: + symbol_stats['volatility'][col] = col_data.std() / (col_data.mean() + 1e-8) + + # Calculate trend if we have timestamp data + if 'interval_timestamp' in symbol_data.columns and len(col_data) >= 3: + # Simple linear trend + valid_rows = symbol_data[col].notna() + if valid_rows.sum() >= 3: + x = np.arange(len(symbol_data[valid_rows])) + y = symbol_data.loc[valid_rows, col].values + try: + trend_slope = np.polyfit(x, y, 1)[0] + symbol_stats['trends'][col] = trend_slope + except: + symbol_stats['trends'][col] = 0 + + stats[symbol] = symbol_stats + + return stats + + def _temporal_interpolation_fill(self, df, symbol, column): + """ + Fill nulls using temporal interpolation within the same symbol + + Priority: + 1. Linear interpolation between known values + 2. Forward fill from last known value + 3. Backward fill from next known value + 4. Exponential smoothing for trend continuation + """ + try: + symbol_mask = df['symbol'] == symbol + symbol_data = df.loc[symbol_mask].copy() + + if column not in symbol_data.columns or symbol_data[column].notna().sum() == 0: + return None + + # Sort by timestamp if available and remove duplicates + if 'interval_timestamp' in symbol_data.columns: + symbol_data = symbol_data.sort_values('interval_timestamp') + # Drop duplicate timestamps for this symbol to avoid reindex issues + symbol_data = symbol_data.drop_duplicates(subset=['interval_timestamp'], keep='first') + + # Reset index to avoid any index issues + symbol_data = symbol_data.reset_index(drop=True) + filled_series = symbol_data[column].copy() + + # 1. 
Linear interpolation (works best with timestamp ordering) + if 'interval_timestamp' in symbol_data.columns and len(symbol_data) > 1: + # Try time-based interpolation with safe fallback + try: + original_index = filled_series.index + datetime_index = pd.to_datetime(symbol_data['interval_timestamp'], unit='ms') + + # Ensure unique datetime index + if datetime_index.duplicated().any(): + # Add microseconds to make unique + for i, is_dup in enumerate(datetime_index.duplicated(keep='first')): + if is_dup: + datetime_index.iloc[i] += pd.Timedelta(microseconds=i+1) + + filled_series.index = datetime_index + filled_series = filled_series.interpolate(method='time') + filled_series.index = original_index # Restore original index + except Exception: + # Fallback to linear interpolation if time interpolation fails + filled_series = filled_series.interpolate(method='linear') + else: + filled_series = filled_series.interpolate(method='linear') + + # 2. Forward fill + filled_series = filled_series.ffill() + + # 3. Backward fill + filled_series = filled_series.bfill() + + # 4. If still has nulls, use trend extrapolation + if filled_series.isna().any() and symbol in self.symbol_stats: + symbol_stat = self.symbol_stats[symbol] + if column in symbol_stat.get('typical_values', {}): + typical_val = symbol_stat['typical_values'][column]['recent_mean'] + trend = symbol_stat.get('trends', {}).get(column, 0) + + # Apply trend-based extrapolation for remaining nulls + for idx in filled_series[filled_series.isna()].index: + # Simple trend continuation + filled_series[idx] = typical_val + trend * (idx % 10) # Modest trend application + + return filled_series + + except Exception as e: + # If all else fails, return None to trigger fallback behavior + print(f"Warning: Temporal interpolation failed for {symbol} {column}: {e}") + return None + + def _similar_symbol_fill(self, df, symbol, column, asset_type): + """ + Fill nulls using similar symbols in the same asset class + """ + if asset_type == 'crypto': + # For crypto, use symbols with similar rank or market cap + target_stats = self.symbol_stats.get(symbol, {}) + target_rank = target_stats.get('typical_values', {}).get('rank', {}).get('median', 999) + + similar_symbols = [] + for sym, stats in self.symbol_stats.items(): + if sym == symbol: + continue + + sym_rank = stats.get('typical_values', {}).get('rank', {}).get('median', 999) + if abs(sym_rank - target_rank) <= 50: # Similar rank range + similar_symbols.append(sym) + + else: # stock + # For stocks, use symbols with similar market cap or sector + target_stats = self.symbol_stats.get(symbol, {}) + target_mcap = target_stats.get('typical_values', {}).get('marketCapitalization', {}).get('median', 0) + + similar_symbols = [] + for sym, stats in self.symbol_stats.items(): + if sym == symbol: + continue + + sym_mcap = stats.get('typical_values', {}).get('marketCapitalization', {}).get('median', 0) + if target_mcap > 0 and sym_mcap > 0: + ratio = max(sym_mcap, target_mcap) / min(sym_mcap, target_mcap) + if ratio <= 5: # Within 5x market cap + similar_symbols.append(sym) + + if not similar_symbols: + return None + + # Get values from similar symbols + similar_data = df[df['symbol'].isin(similar_symbols)][column].dropna() + if len(similar_data) > 0: + # Use weighted average based on similarity + return similar_data.median() # Robust central tendency + + return None + + def _intelligent_symbol_fill(self, df, symbol, column): + """ + Intelligent filling strategy prioritizing symbol-specific data + + Returns the best 
estimate for null values in the specified column for the given symbol + """ + # Strategy 1: Temporal interpolation within same symbol + temporal_result = self._temporal_interpolation_fill(df, symbol, column) + if temporal_result is not None and temporal_result.notna().any(): + return temporal_result + + # Strategy 2: Use historical statistics from same symbol + if symbol in self.symbol_stats and column in self.symbol_stats[symbol]['typical_values']: + stats = self.symbol_stats[symbol]['typical_values'][column] + + # Choose appropriate central tendency based on data characteristics + if stats['data_points'] >= 10: + # Use recent mean for frequently updated data + return stats['recent_mean'] + elif stats['data_points'] >= 3: + # Use median for small datasets (more robust) + return stats['median'] + else: + # Use mean for very small datasets + return stats['mean'] + + # Strategy 3: Use similar symbols + asset_type = 'crypto' if symbol in df.columns and any( + col in df.columns for col in ['rank', 'dominance', 'performance.day'] + ) else 'stock' + + similar_fill = self._similar_symbol_fill(df, symbol, column, asset_type) + if similar_fill is not None: + return similar_fill + + # Strategy 4: Global fallback with symbol variation + return None # Will be handled by existing default logic + + def _define_crypto_defaults(self): + """Define intelligent defaults for crypto-specific columns""" + return { + # Crypto market data + 'dominance': 0.001, # Very small dominance for minor cryptos + 'rank': 999, # Low rank for unknown cryptos + 'stable': 0, # Most cryptos are not stablecoins (use 0 instead of False) + 'marketcap': 1000000, # $1M default market cap + 'transaction_count': 100, # Minimal transaction count + 'transaction_volume': 10000, # Minimal transaction volume + 'tx_price_correlation': 0.5, # Neutral correlation + + # Exchange prices (use main price as baseline) + 'exchangePrices.binance': None, # Will be filled with main price + 'exchangePrices.coinbase': None, + 'exchangePrices.kraken': None, + 'exchangePrices.bybit': None, + 'exchangePrices.kucoin': None, + 'exchangePrices.okx': None, + 'exchangePrices.mexc': None, + 'exchangePrices.gateio': None, + 'exchangePrices.bitget': None, + 'exchangePrices.bitmart': None, + 'exchangePrices.bingx': None, + 'exchangePrices.cryptocom': None, + + # Exchange symbols (use main symbol as baseline) + 'symbols.binance': None, # Will be filled with main symbol + 'symbols.coinbase': None, + 'symbols.kraken': None, + 'symbols.bybit': None, + 'symbols.kucoin': None, + 'symbols.okx': None, + 'symbols.mexc': None, + 'symbols.gateio': None, + 'symbols.bitget': None, + 'symbols.bitmart': None, + 'symbols.bingx': None, + 'symbols.cryptocom': None, + + # Performance metrics (neutral/small changes) + 'performance.day': 0.0, + 'performance.hour': 0.0, + 'performance.hour4': 0.0, + 'performance.min1': 0.0, + 'performance.min15': 0.0, + 'performance.min5': 0.0, + 'performance.month': 0.0, + 'performance.month3': 0.0, + 'performance.week': 0.0, + 'performance.year': 0.0, + + # Rank differences (no change) + 'rankDiffs.day': 0, + 'rankDiffs.hour': 0, + 'rankDiffs.hour4': 0, + 'rankDiffs.min1': 0, + 'rankDiffs.min15': 0, + 'rankDiffs.min5': 0, + 'rankDiffs.month': 0, + 'rankDiffs.month3': 0, + 'rankDiffs.week': 0, + 'rankDiffs.year': 0, + + # Technical indicators + 'bb_width': 0.02, # Small bollinger band width + 'cg_id': None, # Will be derived from symbol + } + + def _define_stock_defaults(self): + """Define intelligent defaults for stock-specific columns""" + return { + 
# Stock market data + 'stock_market': 'NASDAQ', # Default market + 'marketCapitalization': 1000000000, # $1B default + 'shareOutstanding': 100000000, # 100M shares default + 'mspr': 0, # Neutral momentum + + # News and sentiment data + 'news_activity_score_x': 0, + 'news_activity_score_y': 0, + 'news_articles_count_x': 0, + 'news_articles_count_y': 0, + 'news_highlights_count_x': 0, + 'news_highlights_count_y': 0, + 'news_match_score_max_x': 0, + 'news_match_score_max_y': 0, + 'news_match_score_mean_x': 0, + 'news_match_score_mean_y': 0, + 'news_mentions_count_x': 0, + 'news_mentions_count_y': 0, + 'news_sentiment_max_x': 0.5, # Neutral sentiment + 'news_sentiment_max_y': 0.5, + 'news_sentiment_mean_x': 0.5, + 'news_sentiment_mean_y': 0.5, + 'news_sentiment_min_x': 0.5, + 'news_sentiment_min_y': 0.5, + 'news_sentiment_range_x': 0, + 'news_sentiment_range_y': 0, + 'news_sentiment_std': 0, + 'news_sentiment_std_x': 0, + 'news_sentiment_std_y': 0, + + # Analyst ratings + 'buy': 5, # Moderate buy recommendations + 'hold': 10, # More hold recommendations + 'sell': 2, # Few sell recommendations + 'strongBuy': 3, + 'strongSell': 1, + + # Technical indicators + 'volume_price_momentum': 0.0, # Neutral momentum + } + + def _create_symbol_profiles(self, df): + """Create profiles for each symbol to guide intelligent filling""" + profiles = {} + + for symbol in df['symbol'].unique(): + symbol_data = df[df['symbol'] == symbol] + + # Determine if it's crypto or stock + is_crypto = 'rank' in symbol_data.columns and symbol_data['rank'].notna().any() + if not is_crypto: + is_crypto = any(col.startswith('performance.') for col in symbol_data.columns) + + # Calculate key statistics + profile = { + 'symbol': symbol, + 'is_crypto': is_crypto, + 'total_records': len(symbol_data), + 'data_density': symbol_data.notna().mean().mean(), + 'has_price_data': 'price' in symbol_data.columns and symbol_data['price'].notna().any(), + 'typical_price': symbol_data.get('price', pd.Series([100])).median(), + 'typical_volume': symbol_data.get('volume', pd.Series([1000000])).median(), + 'typical_marketcap': symbol_data.get('marketcap', symbol_data.get('marketCapitalization', pd.Series([1000000000]))).median() + } + + profiles[symbol] = profile + + return profiles + + def _intelligent_fill_value(self, df, symbol, column, default_value): + """Generate intelligent fill value based on symbol context""" + profile = self.symbol_profiles.get(symbol, {}) + + # Add symbol-specific variation to prevent homogenization + symbol_hash = hash(f"{symbol}_{column}") % 1000 + variation_factor = (symbol_hash / 1000.0 - 0.5) * 0.1 # ±5% variation + + if default_value is None: + return None + elif isinstance(default_value, (int, float)): + if default_value == 0: + return 0 # Keep zeros as zeros + else: + return default_value * (1 + variation_factor) + else: + return default_value + + def _fill_exchange_prices_advanced(self, df): + """Advanced exchange price filling using symbol-first strategy""" + exchange_price_cols = [col for col in df.columns if col.startswith('exchangePrices.')] + + if not exchange_price_cols or 'price' not in df.columns: + return df + + df_result = df.copy() + + for symbol in df['symbol'].unique(): + symbol_mask = df['symbol'] == symbol + symbol_data = df.loc[symbol_mask] + + # First try to get main price from symbol's own data + main_price_series = self._intelligent_symbol_fill(df, symbol, 'price') + if main_price_series is None or (isinstance(main_price_series, pd.Series) and main_price_series.isna().all()): + continue + + if 
isinstance(main_price_series, pd.Series): + main_price = main_price_series.median() + else: + main_price = main_price_series + + if pd.isna(main_price): + continue + + # Fill exchange prices for this symbol + for exchange_col in exchange_price_cols: + if symbol_data[exchange_col].isna().any(): + # First try temporal interpolation for this exchange + exchange_filled = self._intelligent_symbol_fill(df, symbol, exchange_col) + + if exchange_filled is not None: + if isinstance(exchange_filled, pd.Series): + df_result.loc[symbol_mask, exchange_col] = exchange_filled + else: + null_mask = df_result.loc[symbol_mask, exchange_col].isna() + df_result.loc[symbol_mask & null_mask, exchange_col] = exchange_filled + else: + # Fallback: use main price with small exchange-specific variation + exchange_hash = hash(f"{symbol}_{exchange_col}") % 100 + variation = (exchange_hash / 100.0 - 0.5) * 0.01 # ±0.5% + exchange_price = main_price * (1 + variation) + null_mask = df_result.loc[symbol_mask, exchange_col].isna() + df_result.loc[symbol_mask & null_mask, exchange_col] = exchange_price + + return df_result + + def _fill_exchange_symbols(self, df): + """Fill exchange symbols with main symbol + exchange-specific formatting""" + exchange_symbol_cols = [col for col in df.columns if col.startswith('symbols.')] + + if not exchange_symbol_cols or 'symbol' not in df.columns: + return df + + df_result = df.copy() + + # Exchange-specific symbol formatting + exchange_formats = { + 'symbols.binance': lambda s: f"{s.upper()}USDT" if s.lower() != 'bitcoin' else "BTCUSDT", + 'symbols.coinbase': lambda s: f"{s.upper()}-USD", + 'symbols.kraken': lambda s: f"{s.upper()}USD" if len(s) <= 3 else f"{s.upper()}/USD", + 'symbols.bybit': lambda s: f"{s.upper()}USDT", + 'symbols.kucoin': lambda s: f"{s.upper()}-USDT", + 'symbols.okx': lambda s: f"{s.upper()}-USDT", + 'symbols.mexc': lambda s: f"{s.upper()}_USDT", + 'symbols.gateio': lambda s: f"{s.upper()}_USDT", + 'symbols.bitget': lambda s: f"{s.upper()}USDT", + 'symbols.bitmart': lambda s: f"{s.upper()}_USDT", + 'symbols.bingx': lambda s: f"{s.upper()}-USDT", + 'symbols.cryptocom': lambda s: f"{s.upper()}_USDT" + } + + for symbol in df['symbol'].unique(): + symbol_mask = df['symbol'] == symbol + + for exchange_col in exchange_symbol_cols: + if df.loc[symbol_mask, exchange_col].isna().all(): + formatter = exchange_formats.get(exchange_col, lambda s: s.upper()) + try: + exchange_symbol = formatter(symbol) + df_result.loc[symbol_mask, exchange_col] = exchange_symbol + except Exception: + df_result.loc[symbol_mask, exchange_col] = symbol.upper() + + return df_result + + def _fill_cg_id(self, df): + """Fill CoinGecko ID based on symbol""" + if 'cg_id' not in df.columns: + return df + + df_result = df.copy() + + # Common CoinGecko ID mappings + cg_id_mapping = { + 'bitcoin': 'bitcoin', + 'btc': 'bitcoin', + 'ethereum': 'ethereum', + 'eth': 'ethereum', + 'binancecoin': 'binancecoin', + 'bnb': 'binancecoin', + 'cardano': 'cardano', + 'ada': 'cardano', + 'solana': 'solana', + 'sol': 'solana', + 'xrp': 'ripple', + 'ripple': 'ripple', + 'dogecoin': 'dogecoin', + 'doge': 'dogecoin', + 'polkadot': 'polkadot', + 'dot': 'polkadot', + 'avalanche-2': 'avalanche-2', + 'avax': 'avalanche-2', + 'chainlink': 'chainlink', + 'link': 'chainlink', + 'polygon': 'matic-network', + 'matic': 'matic-network', + 'litecoin': 'litecoin', + 'ltc': 'litecoin', + 'uniswap': 'uniswap', + 'uni': 'uniswap' + } + + for symbol in df['symbol'].unique(): + symbol_mask = df['symbol'] == symbol + + if df.loc[symbol_mask, 
'cg_id'].isna().all(): + cg_id = cg_id_mapping.get(symbol.lower(), symbol.lower()) + df_result.loc[symbol_mask, 'cg_id'] = cg_id + + return df_result + + def process_crypto_features(self, df): + """Process crypto features with advanced symbol-first null handling""" + print("Processing crypto features with symbol-first strategy...") + df_result = df.copy() + + # Step 1: Analyze symbol statistics for intelligent filling + print("Analyzing symbol statistics...") + self.symbol_stats = self._analyze_symbol_statistics(df_result) + print(f"Analyzed {len(self.symbol_stats)} symbols") + + # Step 2: Create symbol profiles + self.symbol_profiles = self._create_symbol_profiles(df_result) + + # Step 3: Symbol-first null handling for key columns + priority_columns = [ + 'price', 'volume', 'marketcap', 'dominance', 'rank', + 'performance.day', 'performance.week', 'performance.month', + 'rsi', 'macd', 'transaction_count', 'transaction_volume' + ] + + for column in priority_columns: + if column in df_result.columns and df_result[column].isna().any(): + print(f"Processing {column} with symbol-first strategy...") + + for symbol in df_result['symbol'].unique(): + symbol_mask = df_result['symbol'] == symbol + null_mask = df_result[column].isna() + fill_mask = symbol_mask & null_mask + + if fill_mask.any(): + # Use intelligent symbol-first filling + fill_result = self._intelligent_symbol_fill(df_result, symbol, column) + + if fill_result is not None: + if isinstance(fill_result, pd.Series): + # If we got a series back (from temporal interpolation) + # Make sure the series aligns with the symbol mask + symbol_indices = df_result[symbol_mask].index + if len(fill_result) == len(symbol_indices): + # Map the series values to the correct indices + for i, idx in enumerate(symbol_indices): + if pd.notna(fill_result.iloc[i]): + df_result.loc[idx, column] = fill_result.iloc[i] + else: + # Fallback: use median of the series + fill_value = fill_result.median() + if pd.notna(fill_value): + df_result.loc[fill_mask, column] = fill_value + else: + # If we got a scalar value + df_result.loc[fill_mask, column] = fill_result + + # Step 4: Handle exchange prices with cross-reference to main price + df_result = self._fill_exchange_prices_advanced(df_result) + + # Step 5: Handle exchange symbols with proper formatting + df_result = self._fill_exchange_symbols(df_result) + + # Step 6: Handle CoinGecko IDs + df_result = self._fill_cg_id(df_result) + + # Step 7: Fill remaining columns with intelligent defaults + for column in df_result.columns: + if df_result[column].isna().any(): + default_value = self.crypto_column_defaults.get(column) + + if default_value is not None: + for symbol in df_result['symbol'].unique(): + symbol_mask = df_result['symbol'] == symbol + null_mask = df_result[column].isna() + fill_mask = symbol_mask & null_mask + + if fill_mask.any(): + try: + fill_value = self._intelligent_fill_value( + df_result, symbol, column, default_value + ) + df_result.loc[fill_mask, column] = fill_value + except Exception as e: + print(f"Warning: Failed to fill {column} for {symbol}: {e}") + # Skip this column for this symbol + continue + + return df_result + + def process_stock_features(self, df): + """Process stock features with advanced symbol-first null handling""" + print("Processing stock features with symbol-first strategy...") + df_result = df.copy() + + # Step 1: Analyze symbol statistics for intelligent filling + print("Analyzing symbol statistics...") + self.symbol_stats = self._analyze_symbol_statistics(df_result) + 
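+ # NOTE: the fill order below mirrors the crypto path: temporal interpolation
+ # within a symbol, then that symbol's own history, then similar symbols,
+ # then hedged defaults.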
print(f"Analyzed {len(self.symbol_stats)} symbols") + + # Step 2: Create symbol profiles + self.symbol_profiles = self._create_symbol_profiles(df_result) + + # Step 3: Symbol-first null handling for key columns + priority_columns = [ + 'close', 'open', 'high', 'low', 'volume', 'prev_close', + 'marketCapitalization', 'shareOutstanding', + 'rsi', 'macd', 'atr', 'bb_position', + 'news_sentiment_mean_x', 'news_sentiment_mean_y', + 'buy', 'sell', 'hold', 'strongBuy', 'strongSell' + ] + + for column in priority_columns: + if column in df_result.columns and df_result[column].isna().any(): + print(f"Processing {column} with symbol-first strategy...") + + for symbol in df_result['symbol'].unique(): + symbol_mask = df_result['symbol'] == symbol + null_mask = df_result[column].isna() + fill_mask = symbol_mask & null_mask + + if fill_mask.any(): + # Use intelligent symbol-first filling + fill_result = self._intelligent_symbol_fill(df_result, symbol, column) + + if fill_result is not None: + if isinstance(fill_result, pd.Series): + # If we got a series back (from temporal interpolation) + # Make sure the series aligns with the symbol mask + symbol_indices = df_result[symbol_mask].index + if len(fill_result) == len(symbol_indices): + # Map the series values to the correct indices + for i, idx in enumerate(symbol_indices): + if pd.notna(fill_result.iloc[i]): + df_result.loc[idx, column] = fill_result.iloc[i] + else: + # Fallback: use median of the series + fill_value = fill_result.median() + if pd.notna(fill_value): + df_result.loc[fill_mask, column] = fill_value + else: + # If we got a scalar value + df_result.loc[fill_mask, column] = fill_result + + # Step 4: Fill remaining columns with intelligent defaults + for column in df_result.columns: + if df_result[column].isna().any(): + default_value = self.stock_column_defaults.get(column) + + if default_value is not None: + for symbol in df_result['symbol'].unique(): + symbol_mask = df_result['symbol'] == symbol + null_mask = df_result[column].isna() + fill_mask = symbol_mask & null_mask + + if fill_mask.any(): + try: + fill_value = self._intelligent_fill_value( + df_result, symbol, column, default_value + ) + df_result.loc[fill_mask, column] = fill_value + except Exception as e: + print(f"Warning: Failed to fill {column} for {symbol}: {e}") + # Skip this column for this symbol + continue + + return df_result + + def generate_report(self, df_before, df_after, feature_type): + """Generate a comprehensive report of null value handling with symbol-first strategy details""" + before_nulls = df_before.isnull().sum() + after_nulls = df_after.isnull().sum() + + null_reduction = before_nulls - after_nulls + columns_fixed = null_reduction[null_reduction > 0] + + # Analyze symbol coverage + symbol_analysis = {} + if 'symbol' in df_before.columns: + for symbol in df_before['symbol'].unique(): + symbol_before = int(df_before[df_before['symbol'] == symbol].isnull().sum().sum()) + symbol_after = int(df_after[df_after['symbol'] == symbol].isnull().sum().sum()) + symbol_analysis[symbol] = { + 'nulls_before': symbol_before, + 'nulls_after': symbol_after, + 'nulls_filled': symbol_before - symbol_after, + 'records': int(len(df_before[df_before['symbol'] == symbol])) + } + + # Analyze temporal coverage if timestamp available + temporal_analysis = {} + if 'interval_timestamp' in df_before.columns: + df_before_ts = df_before.copy() + df_after_ts = df_after.copy() + df_before_ts['date'] = pd.to_datetime(df_before_ts['interval_timestamp'], unit='ms').dt.date + df_after_ts['date'] 
= pd.to_datetime(df_after_ts['interval_timestamp'], unit='ms').dt.date + + for date in df_before_ts['date'].unique(): + date_before = int(df_before_ts[df_before_ts['date'] == date].isnull().sum().sum()) + date_after = int(df_after_ts[df_after_ts['date'] == date].isnull().sum().sum()) + temporal_analysis[str(date)] = { + 'nulls_before': date_before, + 'nulls_after': date_after, + 'nulls_filled': date_before - date_after + } + + report = { + 'feature_type': feature_type, + 'timestamp': pd.Timestamp.now().isoformat(), + 'strategy': 'symbol-first-temporal-interpolation', + 'total_rows': int(len(df_after)), + 'total_columns': int(len(df_after.columns)), + 'unique_symbols': int(len(df_after['symbol'].unique())) if 'symbol' in df_after.columns else 0, + 'columns_with_nulls_before': int((before_nulls > 0).sum()), + 'columns_with_nulls_after': int((after_nulls > 0).sum()), + 'total_nulls_before': int(before_nulls.sum()), + 'total_nulls_after': int(after_nulls.sum()), + 'total_nulls_filled': int(null_reduction.sum()), + 'columns_fixed': int(len(columns_fixed)), + 'null_reduction_rate': float((null_reduction.sum() / before_nulls.sum()) if before_nulls.sum() > 0 else 0), + 'remaining_null_columns': {str(k): int(v) for k, v in after_nulls[after_nulls > 0].to_dict().items()}, + 'fixed_columns_detail': {str(k): int(v) for k, v in null_reduction[null_reduction > 0].to_dict().items()}, + 'symbol_analysis': symbol_analysis, + 'temporal_analysis': temporal_analysis, + 'strategy_details': { + 'symbol_stats_analyzed': len(self.symbol_stats), + 'temporal_interpolation_used': True, + 'similar_symbol_fallback': True, + 'intelligent_defaults': True + } + } + + return report + + +def process_crypto_features_file(input_path, output_path=None): + """Process crypto features file""" + if output_path is None: + output_path = input_path + + print(f"Loading crypto features from {input_path}...") + df = pd.read_parquet(input_path) + + print(f"Loaded {len(df)} rows with {len(df.columns)} columns") + print(f"Null values before processing: {df.isnull().sum().sum()}") + + handler = FinalNullValueHandler() + df_processed = handler.process_crypto_features(df) + + print(f"Null values after processing: {df_processed.isnull().sum().sum()}") + + # Generate report + report = handler.generate_report(df, df_processed, 'crypto') + + # Save processed data + df_processed.to_parquet(output_path, index=False) + print(f"Saved processed crypto features to {output_path}") + + # Save report + report_path = str(output_path).replace('.parquet', '_null_handling_report.json') + with open(report_path, 'w') as f: + json.dump(report, f, indent=2) + print(f"Saved processing report to {report_path}") + + return df_processed, report + + +def process_stock_features_file(input_path, output_path=None): + """Process stock features file""" + if output_path is None: + output_path = input_path + + print(f"Loading stock features from {input_path}...") + df = pd.read_parquet(input_path) + + print(f"Loaded {len(df)} rows with {len(df.columns)} columns") + print(f"Null values before processing: {df.isnull().sum().sum()}") + + handler = FinalNullValueHandler() + df_processed = handler.process_stock_features(df) + + print(f"Null values after processing: {df_processed.isnull().sum().sum()}") + + # Generate report + report = handler.generate_report(df, df_processed, 'stock') + + # Save processed data + df_processed.to_parquet(output_path, index=False) + print(f"Saved processed stock features to {output_path}") + + # Save report + report_path = 
str(output_path).replace('.parquet', '_null_handling_report.json') + with open(report_path, 'w') as f: + json.dump(report, f, indent=2) + print(f"Saved processing report to {report_path}") + + return df_processed, report + + +def main(): + """Main function to process both crypto and stock features""" + crypto_path = Path("data/merged/features/crypto_features.parquet") + stocks_path = Path("data/merged/features/stocks_features.parquet") + + processed_files = [] + + # Process crypto features + if crypto_path.exists(): + try: + df_crypto, report_crypto = process_crypto_features_file(crypto_path) + processed_files.append(('crypto', crypto_path, report_crypto)) + print(f"✓ Crypto features processed: {report_crypto['total_nulls_filled']} nulls filled") + except Exception as e: + print(f"✗ Error processing crypto features: {e}") + else: + print(f"Warning: {crypto_path} not found") + + # Process stock features + if stocks_path.exists(): + try: + df_stocks, report_stocks = process_stock_features_file(stocks_path) + processed_files.append(('stocks', stocks_path, report_stocks)) + print(f"✓ Stock features processed: {report_stocks['total_nulls_filled']} nulls filled") + except Exception as e: + print(f"✗ Error processing stock features: {e}") + else: + print(f"Warning: {stocks_path} not found") + + # Summary report + if processed_files: + print("\n" + "="*60) + print("FINAL NULL VALUE HANDLING SUMMARY") + print("="*60) + + total_nulls_filled = 0 + for file_type, file_path, report in processed_files: + total_nulls_filled += report['total_nulls_filled'] + print(f"\n{file_type.upper()} FEATURES:") + print(f" File: {file_path}") + print(f" Rows: {report['total_rows']:,}") + print(f" Columns: {report['total_columns']}") + print(f" Nulls filled: {report['total_nulls_filled']:,}") + print(f" Columns fixed: {report['columns_fixed']}") + print(f" Remaining null columns: {len(report['remaining_null_columns'])}") + + if report['remaining_null_columns']: + print(f" Still have nulls: {list(report['remaining_null_columns'].keys())}") + + print(f"\nTOTAL NULLS FILLED ACROSS ALL FILES: {total_nulls_filled:,}") + print("="*60) + else: + print("No files were processed successfully.") + + +if __name__ == "__main__": + main() diff --git a/src/merge/final_verification.py b/src/merge/final_verification.py new file mode 100644 index 0000000000000000000000000000000000000000..a06c3581d10f58db4d32249786812407a2375cf3 --- /dev/null +++ b/src/merge/final_verification.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 + +import pandas as pd + +def main(): + print("=== FINAL CRYPTO SYMBOL VERIFICATION ===") + + # Load crypto features + df = pd.read_parquet('data/merged/features/crypto_features.parquet') + + print(f"Total rows: {len(df)}") + print(f"Total columns: {len(df.columns)}") + + # Symbol analysis + null_symbols = df['symbol'].isnull().sum() + total_rows = len(df) + null_percentage = (null_symbols / total_rows) * 100 + + print(f"Null symbols: {null_symbols} ({null_percentage:.1f}%)") + print(f"Unique symbols: {df['symbol'].nunique()}") + + print("\nTop 10 symbols by count:") + print(df['symbol'].value_counts().head(10)) + + print("\nSample of successfully extracted symbols:") + sample = df[df['symbol'].notna()][['symbol', 'cg_id']].head(10) + for _, row in sample.iterrows(): + print(f" {row['symbol']} -> {row['cg_id']}") + + if null_symbols > 0: + print(f"\nRows with remaining null symbols:") + null_rows = df[df['symbol'].isnull()][['symbol', 'cg_id', 'symbols.binance', 'symbols.bybit']] + print(null_rows.to_string(index=False)) 
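+ # Rows listed here had no usable symbols.* value (or none that matched the
+ # extraction patterns), so the symbol could not be recovered upstream.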
+ + print("\n=== SUCCESS METRICS ===") + print(f"✅ Symbol extraction success rate: {((total_rows - null_symbols) / total_rows) * 100:.1f}%") + print(f"✅ Total symbols populated: {total_rows - null_symbols}") + print(f"✅ Pipeline integration: Complete") + +if __name__ == "__main__": + main() diff --git a/src/merge/finhub/company_info.py b/src/merge/finhub/company_info.py new file mode 100644 index 0000000000000000000000000000000000000000..c189f61887ec2fc631d762da4ea4acb269b5e2d7 --- /dev/null +++ b/src/merge/finhub/company_info.py @@ -0,0 +1,75 @@ +import os +from pathlib import Path +import pandas as pd +import glob + +# Resolve DATA_DIR similar to other modules +try: + from src.config import DATA_DIR as CFG_DATA_DIR # when run as module +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR # when run as script + except Exception: + CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: + """Map a repo-style path like 'data/...' to /...; keep absolute paths as-is.""" + p = Path(path_like) + if p.is_absolute(): + return p + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return Path(CFG_DATA_DIR) / rel + +def load_company_profiles(profiles_dir): + """ + Load all company profile parquet files from the directory into a DataFrame. + Returns a DataFrame indexed by symbol. + """ + profile_files = glob.glob(os.path.join(profiles_dir, '*_company_profile.parquet')) + profiles = [] + for file in profile_files: + df = pd.read_parquet(file) + # Extract symbol from filename + symbol = os.path.basename(file).split('_')[0] + df['symbol'] = symbol + profiles.append(df) + if profiles: + profiles_df = pd.concat(profiles, ignore_index=True) + profiles_df.set_index('symbol', inplace=True) + return profiles_df + else: + return pd.DataFrame() + +def merge_company_info_to_features(features_path, profiles_dir, output_path): + """ + Merge company profile info into stocks features DataFrame by symbol. 
+ """ + # Resolve all paths under DATA_DIR + features_path = _resolve_under_data(features_path) + profiles_dir = _resolve_under_data(profiles_dir) + output_path = _resolve_under_data(output_path) + # Load features + features_df = pd.read_parquet(features_path) + # Load company profiles + profiles_df = load_company_profiles(profiles_dir) + # Merge on symbol + merged_df = features_df.join(profiles_df, on='symbol', rsuffix='_company') + # Save result + merged_df.to_parquet(output_path, compression='snappy') + return merged_df + +# Example usage +def main(): + features_path = "data/merged/features/stocks_features.parquet" + profiles_dir = "data/finnhub/company_info" + output_path = features_path + merge_company_info_to_features(features_path, profiles_dir, output_path) + print(f"[INFO] Merged company info into features and saved to: {output_path}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/finhub/quote.py b/src/merge/finhub/quote.py new file mode 100644 index 0000000000000000000000000000000000000000..b478ec1f2e424108ed4464d305e5480b461228cc --- /dev/null +++ b/src/merge/finhub/quote.py @@ -0,0 +1,115 @@ +import os +from pathlib import Path +import pandas as pd +import glob +import json + +# Resolve DATA_DIR similar to other modules +try: + from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR + except Exception: + CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: + p = Path(path_like) + if p.is_absolute(): + return p + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return Path(CFG_DATA_DIR) / rel + +def add_latest_quotes_to_features(features_path, quotes_dir, output_path): + # Resolve paths under DATA_DIR + features_path = _resolve_under_data(features_path) + quotes_dir = _resolve_under_data(quotes_dir) + output_path = _resolve_under_data(output_path) + + # Load features + features_df = pd.read_parquet(features_path) + + # Load all quote JSONs + quote_rows = [] + for file in glob.glob(os.path.join(str(quotes_dir), '*_current_quote.parquet')): + try: + df = pd.read_parquet(file) + # If DataFrame has a 'data' column, expand it + if 'data' in df.columns: + import numpy as np + data_list = df['data'].tolist() + if data_list and isinstance(data_list[0], np.ndarray): + flat_list = [dict(item) for item in data_list[0]] + df = pd.DataFrame.from_records(flat_list) + elif data_list and isinstance(data_list[0], dict): + df = pd.DataFrame.from_records(data_list) + elif data_list and isinstance(data_list[0], list): + expected_cols = ["c", "d", "dp", "h", "l", "o", "pc", "t"] + df = pd.DataFrame(data_list, columns=expected_cols[:len(data_list[0])]) + else: + df = pd.DataFrame() + # If DataFrame has only one row, convert to dict + if not df.empty: + record = df.iloc[0].to_dict() + record['symbol'] = os.path.basename(file).split('_')[0] + quote_rows.append(record) + except Exception as e: + print(f"[WARN] Skipping {file}: {e}") + + if not quote_rows: + print("[WARN] No valid quote data found to merge. 
Output not updated.") + return + + quotes_df = pd.DataFrame(quote_rows).set_index('symbol') + + def merge_quote_into_row(row): + symbol = row['symbol'] + if symbol not in quotes_df.index: + return row + + quote = quotes_df.loc[symbol] + + field_map = { + 'o': 'open', + 'h': 'high', + 'l': 'low', + 'c': 'close', + 'd': 'change', + 'dp': 'price_change_1', + } + + for q_key, f_key in field_map.items(): + val = quote.get(q_key) + if pd.notnull(val): + if f_key in features_df.columns: + row[f_key] = val + else: + row[f'{f_key}_quote'] = val # if feature doesn’t exist, add it + + # Add extra fields + if pd.notnull(quote.get('pc')): + row['prev_close'] = quote['pc'] + + if pd.notnull(quote.get('t')): + row['timestamp'] = quote['t'] * 1000 + row['datetime'] = pd.to_datetime(quote['t'], unit='s') + + return row + + features_df = features_df.apply(merge_quote_into_row, axis=1) + features_df.to_parquet(output_path, index=False, compression='snappy') + print(f"[INFO] Added latest quote data for all available symbols and saved to: {output_path}") + +def main(): + features_path = "data/merged/features/stocks_features.parquet" + quotes_dir = "data/finnhub/stock_data" + output_path = features_path + add_latest_quotes_to_features(features_path, quotes_dir, output_path) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/finhub/ratings.py b/src/merge/finhub/ratings.py new file mode 100644 index 0000000000000000000000000000000000000000..6b6580299870dbcec5c18214e72e042cb53bc0a9 --- /dev/null +++ b/src/merge/finhub/ratings.py @@ -0,0 +1,66 @@ +import os +from pathlib import Path +import pandas as pd +import glob + +# Resolve DATA_DIR similar to other modules +try: + from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR + except Exception: + CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: + p = Path(path_like) + if p.is_absolute(): + return p + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return Path(CFG_DATA_DIR) / rel + +def add_latest_ratings_to_features(features_path, ratings_dir, output_path): + # Resolve paths under DATA_DIR + features_path = _resolve_under_data(features_path) + ratings_dir = _resolve_under_data(ratings_dir) + output_path = _resolve_under_data(output_path) + + # Load features + features_df = pd.read_parquet(features_path) + + # Find all ratings files + ratings_files = glob.glob(os.path.join(str(ratings_dir), '*_recommendation_trends.parquet')) + latest_rows = [] + for file in ratings_files: + # Read as Parquet file + df = pd.read_parquet(file) + # Get latest row by period (assuming period is YYYY-MM-DD) + if 'period' in df.columns: + df['period'] = pd.to_datetime(df['period']) + latest = df.sort_values('period', ascending=False).iloc[[0]] + latest_rows.append(latest) + if latest_rows: + all_latest_ratings = pd.concat(latest_rows, ignore_index=True) + else: + all_latest_ratings = pd.DataFrame() + # Merge only if ratings data is available and has 'symbol' column + if not all_latest_ratings.empty and 'symbol' in all_latest_ratings.columns: + merged_df = features_df.merge(all_latest_ratings, on='symbol', how='left', suffixes=('', '_ratings')) + merged_df.to_parquet(output_path, compression='snappy') + print(f"[INFO] Added latest ratings data for all available symbols and saved to: {output_path}") + else: + print("[WARN] No valid ratings data found to merge. 
Output not updated.") + +def main(): + features_path = "data/merged/features/stocks_features.parquet" + ratings_dir = "data/finnhub/ratings" + output_path = features_path + add_latest_ratings_to_features(features_path, ratings_dir, output_path) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/finhub/sentiment.py b/src/merge/finhub/sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..5680ac5406325fb06e73ecc06cdf7ee25d6f2db8 --- /dev/null +++ b/src/merge/finhub/sentiment.py @@ -0,0 +1,86 @@ +import os +from pathlib import Path +import pandas as pd + +try: + from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR + except Exception: + CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: + p = Path(path_like) + if p.is_absolute(): + return p + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return Path(CFG_DATA_DIR) / rel + + +def add_sentiment_to_features(features_path, output_path, sentiment_data): + # Resolve paths under DATA_DIR + features_path = _resolve_under_data(features_path) + output_path = _resolve_under_data(output_path) + + # Load features + features_df = pd.read_parquet(features_path) + + # Load newest sentiment data for all symbols from ownership directory under DATA_DIR + ownership_dir = Path(CFG_DATA_DIR) / 'finnhub' / 'ownership' + import glob + sentiment_files = glob.glob(os.path.join(str(ownership_dir), '*_insider_sentiment.parquet')) + newest_rows = [] + for file in sentiment_files: + df = pd.read_parquet(file) + # If file has a 'data' column, expand it + if 'data' in df.columns: + data_list = df['data'].tolist() + # If first item is a numpy array, flatten to list of dicts + import numpy as np + if data_list and isinstance(data_list[0], np.ndarray): + # Flatten array to list + flat_list = [dict(item) for item in data_list[0]] + df = pd.DataFrame.from_records(flat_list) + elif data_list and isinstance(data_list[0], dict): + df = pd.DataFrame.from_records(data_list) + elif data_list and isinstance(data_list[0], list): + expected_cols = ["change", "month", "mspr", "symbol", "year"] + df = pd.DataFrame(data_list, columns=expected_cols[:len(data_list[0])]) + else: + df = pd.DataFrame() + # Extract symbol from filename if not present + if 'symbol' not in df.columns: + symbol = os.path.basename(file).split('_')[0] + df['symbol'] = symbol + # Only process if both 'year' and 'month' columns exist + if 'year' in df.columns and 'month' in df.columns: + newest = df.sort_values(['year', 'month'], ascending=[False, False]).iloc[[0]] + newest_rows.append(newest) + else: + print(f"[WARN] Skipping {file}: missing 'year' or 'month' column after expansion.") + if newest_rows: + all_newest_sentiment = pd.concat(newest_rows, ignore_index=True) + else: + all_newest_sentiment = pd.DataFrame() + # Merge only if sentiment data is available and has 'symbol' column + if not all_newest_sentiment.empty and 'symbol' in all_newest_sentiment.columns: + merged_df = features_df.merge(all_newest_sentiment, on='symbol', how='left', suffixes=('', '_sentiment')) + # Save result + merged_df.to_parquet(output_path, compression='snappy') + print(f"[INFO] Added newest sentiment data for all available symbols and saved to: {output_path}") + else: + print("[WARN] No valid sentiment data found to merge. 
Output not updated.") + +def main(): + features_path = "data/merged/features/stocks_features.parquet" + output_path = features_path + add_sentiment_to_features(features_path, output_path, None) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/full_report.py b/src/merge/full_report.py new file mode 100644 index 0000000000000000000000000000000000000000..d6ebb8e122453719d5dec9f5a44fea8fa70c593b --- /dev/null +++ b/src/merge/full_report.py @@ -0,0 +1,385 @@ +""" +Unified report generator for merged features - generates all 3 reports with automatic column discovery. +Supports merged, crypto, and stocks feature files with dynamic schema detection. + +Usage: + # Generate all 3 reports + python unified_report_generator.py --generate-all + + # Generate specific reports + python unified_report_generator.py --merged-input data/merged/features/merged_features.parquet + python unified_report_generator.py --crypto-input data/merged/features/crypto_features.parquet + python unified_report_generator.py --stocks-input data/merged/features/stocks_features.parquet + + # Custom paths + python unified_report_generator.py \ + --merged-input path/to/merged.parquet \ + --crypto-input path/to/crypto.parquet \ + --stocks-input path/to/stocks.parquet \ + --output-dir reports/ \ + --baseline-schema schemas/baseline.json +""" + +import argparse +import pandas as pd +import json +import os +from datetime import datetime +from typing import Dict, List, Set, Optional +from pathlib import Path + +def categorize_column_by_name(col_name: str) -> str: + """Automatically categorize columns based on naming patterns.""" + col_lower = col_name.lower() + + # Exchange-related + if col_name.startswith(('symbols.', 'exchangePrices.')): + return "Exchange Data" + + # Performance metrics + if col_name.startswith(('performance.', 'rankDiffs.')): + return "Performance Metrics" + + # Technical indicators + if col_lower in ['rsi', 'macd', 'macd_signal', 'macd_histogram', 'atr', 'bb_width', + 'bb_position', 'stoch_k', 'stoch_d', 'cci', 'mfi'] or col_name.startswith('roc_'): + return "Technical Indicators" + + # Price-related + if any(word in col_lower for word in ['price', 'open', 'volume', 'marketcap', 'volatility']): + return "Price & Volume" + + # On-chain/blockchain + if any(word in col_lower for word in ['transaction', 'gas', 'fees', 'tx_', 'blockchain']): + return "On-chain Features" + + # Sentiment + if any(word in col_lower for word in ['sentiment', 'social', 'confidence']): + return "Sentiment Features" + + # Temporal + if any(word in col_lower for word in ['timestamp', 'hour', 'day', 'weekend', 'trading_hours']): + return "Temporal Features" + + # Completeness metrics + if 'completeness' in col_lower or 'data_quality' in col_lower: + return "Data Quality Metrics" + + # Market/Exchange info + if col_lower in ['dominance', 'rank', 'stable', 'cg_id']: + return "Market Metrics" + + # Flags + if col_name.startswith('is_') or col_lower in ['stable']: + return "Asset Flags" + + # Metadata + if col_name.startswith('_') or col_lower in ['backup_id', 'stock_market', 'blockchain_network']: + return "Metadata" + + # Links + if col_name.startswith('links.'): + return "External Links" + + # Interaction features + if any(word in col_lower for word in ['correlation', 'convergence', 'alignment', 'trend']): + return "Interaction Features" + + # Default for unknown + return "Other Features" + +def load_baseline_schema(baseline_path: str) -> Set[str]: + """Load baseline schema if it exists.""" + if 
os.path.exists(baseline_path): + try: + with open(baseline_path, 'r') as f: + baseline = json.load(f) + return set(baseline.get('columns', [])) + except (json.JSONDecodeError, KeyError): + print(f"Warning: Could not load baseline schema from {baseline_path}") + return set() + +def save_baseline_schema(columns: List[str], baseline_path: str): + """Save current columns as baseline schema.""" + os.makedirs(os.path.dirname(baseline_path), exist_ok=True) + schema = { + "generated_at": datetime.utcnow().isoformat() + "Z", + "total_columns": len(columns), + "columns": sorted(columns) + } + with open(baseline_path, 'w') as f: + json.dump(schema, f, indent=2) + +def detect_asset_type(df: pd.DataFrame, all_columns: List[str]) -> str: + """Detect asset type based on column patterns.""" + if any(col.startswith('symbols.') for col in all_columns): + return "crypto" + elif "stock_market" in all_columns: + return "stocks" + elif "is_crypto" in all_columns and "is_stock" in all_columns: + return "mixed" + else: + return "unknown" + +def get_asset_specific_stats(df: pd.DataFrame, asset_type: str, all_columns: List[str]) -> Dict: + """Get asset-specific statistics.""" + stats = {"asset_type": asset_type} + + if asset_type == "crypto": + # Crypto-specific stats + if "stable" in df.columns: + stats["stable_coins_count"] = int(df["stable"].sum()) + + if "cg_id" in df.columns or "symbol" in df.columns: + symbol_col = "symbol" if "symbol" in df.columns else "cg_id" + stats["unique_crypto_assets"] = df[symbol_col].nunique() + + # Exchange coverage + exchange_columns = [col for col in all_columns if col.startswith(("symbols.", "exchangePrices."))] + if exchange_columns: + exchange_coverage = {} + for col in exchange_columns[:10]: # Limit to avoid huge reports + coverage = (df[col].notna().sum() / len(df)) * 100 + exchange_coverage[col] = round(coverage, 2) + stats["exchange_coverage"] = exchange_coverage + + elif asset_type == "stocks": + # Stock-specific stats + if "symbol" in df.columns: + stats["unique_stock_symbols"] = df["symbol"].nunique() + + if "stock_market" in df.columns: + stats["stock_market_distribution"] = df["stock_market"].value_counts().to_dict() + + if "is_trading_hours" in df.columns: + trading_hours_pct = (df["is_trading_hours"].sum() / len(df)) * 100 + stats["trading_hours_coverage_pct"] = round(trading_hours_pct, 2) + + elif asset_type == "mixed": + # Mixed dataset stats + if "is_crypto" in df.columns: + stats["crypto_records"] = int(df["is_crypto"].sum()) + if "is_stock" in df.columns: + stats["stock_records"] = int(df["is_stock"].sum()) + if "symbol" in df.columns: + stats["total_unique_symbols"] = df["symbol"].nunique() + + return stats + +def generate_report(input_path: str, output_path: str, baseline_schema_path: Optional[str] = None, report_type: str = "auto") -> bool: + """Generate a feature report for any dataset type.""" + + # Check if input file exists + if not os.path.exists(input_path): + print(f"Warning: Input file not found: {input_path}") + return False + + try: + # Load the dataset + df = pd.read_parquet(input_path) + all_columns = list(df.columns) + + print(f"Processing {input_path}...") + print(f" - Shape: {df.shape}") + print(f" - Columns: {len(all_columns)}") + + # Load baseline schema for comparison + baseline_columns = set() + if baseline_schema_path: + baseline_columns = load_baseline_schema(baseline_schema_path) + + # Identify new columns + current_columns = set(all_columns) + new_columns = current_columns - baseline_columns if baseline_columns else set() + + # 
Auto-categorize all columns + categories = {} + new_features_by_category = {} + + for col in all_columns: + category = categorize_column_by_name(col) + + if category not in categories: + categories[category] = {"count": 0, "features": []} + new_features_by_category[category] = [] + + categories[category]["features"].append(col) + categories[category]["count"] += 1 + + # Track if it's a new feature + if col in new_columns: + new_features_by_category[category].append(col) + + # Clean up empty new feature lists + new_features_by_category = {k: v for k, v in new_features_by_category.items() if v} + + # Basic dataset stats + ts_col = df["interval_timestamp"] if "interval_timestamp" in df.columns else df.iloc[:, 0] + if pd.api.types.is_datetime64_any_dtype(ts_col): + start_ts = int(ts_col.min().timestamp() * 1000) + end_ts = int(ts_col.max().timestamp() * 1000) + else: + start_ts = int(ts_col.min()) + end_ts = int(ts_col.max()) + + memory_mb = df.memory_usage(deep=True).sum() / 1024**2 + + # Data quality + missing = df.isna().sum().to_dict() + total_cells = df.size + non_missing = int(df.notna().sum().sum()) + completeness_pct = (non_missing / total_cells) * 100 + avg_dq_score = df.get("data_quality_score", pd.Series(dtype=float)).mean() + + # Detect asset type and get specific stats + asset_type = detect_asset_type(df, all_columns) + asset_stats = get_asset_specific_stats(df, asset_type, all_columns) + + # Build the report + report = { + "generated_at_utc": datetime.utcnow().isoformat() + "Z", + "report_type": report_type, + "schema_version": "unified_v1.0", + "source_file": os.path.basename(input_path), + "dataset_info": { + "shape": list(df.shape), + "memory_usage_mb": round(memory_mb, 2), + "time_range": {"start": start_ts, "end": end_ts}, + "total_columns": len(all_columns), + "total_categories": len(categories), + "new_columns_count": len(new_columns), + **asset_stats + }, + "feature_categories": categories, + "data_quality": { + "overall_completeness_pct": round(completeness_pct, 2), + "missing_values_by_column": missing, + "average_data_quality_score": None if pd.isna(avg_dq_score) else round(avg_dq_score, 4) + } + } + + # Add new features section if any exist + if new_columns: + report["new_features"] = { + "total_new_features": len(new_columns), + "new_features_by_category": new_features_by_category, + "all_new_features": sorted(list(new_columns)) + } + + # Add baseline comparison if available + if baseline_columns: + removed_columns = baseline_columns - current_columns + if removed_columns: + report["removed_features"] = sorted(list(removed_columns)) + + # Ensure output directory exists + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + # Write report + with open(output_path, "w") as f: + json.dump(report, f, indent=2) + + print(f" Report generated: {output_path}") + print(f" - {len(categories)} categories") + if new_columns: + print(f" - {len(new_columns)} new features detected") + + return True + + except Exception as e: + print(f" Error processing {input_path}: {str(e)}") + return False + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + + # Input files + parser.add_argument("--merged-input", default="data/merged/features/merged_features.parquet", help="Path to merged_features.parquet") + parser.add_argument("--crypto-input", default="data/merged/features/crypto_features.parquet", help="Path to crypto_features.parquet") + parser.add_argument("--stocks-input", default="data/merged/features/stocks_features.parquet", help="Path to 
stocks_features.parquet") + + # Output settings + parser.add_argument("--output-dir", default="data/merged/features/", help="Output directory for reports") + parser.add_argument("--baseline-schema", default="schemas/baseline.json", help="Path to baseline schema JSON") + + # Convenience flags + parser.add_argument("--generate-all", action="store_true", help="Generate all reports using default paths") + + args = parser.parse_args() + + # Default paths for --generate-all + if args.generate_all: + default_paths = { + "merged": "data/merged/features/merged_features.parquet", + "crypto": "data/merged/features/crypto_features.parquet", + "stocks": "data/merged/features/stocks_features.parquet" + } + + print("Generating all feature reports...") + success_count = 0 + + for report_type, input_path in default_paths.items(): + output_dir = args.output_dir if args.output_dir else "data/merged/features/" + output_path = os.path.join(output_dir, f"{report_type}_report.json") + baseline_path = args.baseline_schema if args.baseline_schema else f"schemas/{report_type}_baseline.json" + + if generate_report(input_path, output_path, baseline_path, report_type): + success_count += 1 + + print(f"\nGenerated {success_count}/3 reports successfully!") + + # Update baseline schema with merged features if it exists + if args.baseline_schema and os.path.exists(default_paths["merged"]): + df = pd.read_parquet(default_paths["merged"]) + save_baseline_schema(list(df.columns), args.baseline_schema) + print(f"Updated baseline schema: {args.baseline_schema}") + + return + + # Individual file processing + reports_generated = 0 + + if args.merged_input: + output_dir = args.output_dir if args.output_dir else "data/merged/features/" + output_path = os.path.join(output_dir, "merged_report.json") + if generate_report(args.merged_input, output_path, args.baseline_schema, "merged"): + reports_generated += 1 + + if args.crypto_input: + output_dir = args.output_dir if args.output_dir else "data/merged/features/" + output_path = os.path.join(output_dir, "crypto_report.json") + if generate_report(args.crypto_input, output_path, args.baseline_schema, "crypto"): + reports_generated += 1 + # Print crypto count and data quality + try: + with open(output_path, "r") as f: + report = json.load(f) + count = report.get("dataset_info", {}).get("shape", [None])[0] + dq = report.get("data_quality", {}).get("overall_completeness_pct", None) + print(f"[CRYPTO] Count: {count}, Data Quality: {dq}%") + except Exception as e: + print(f"[CRYPTO] Error reading report for stats: {e}") + + if args.stocks_input: + output_dir = args.output_dir if args.output_dir else "data/merged/features/" + output_path = os.path.join(output_dir, "stocks_report.json") + if generate_report(args.stocks_input, output_path, args.baseline_schema, "stocks"): + reports_generated += 1 + # Print stocks count and data quality + try: + with open(output_path, "r") as f: + report = json.load(f) + count = report.get("dataset_info", {}).get("shape", [None])[0] + dq = report.get("data_quality", {}).get("overall_completeness_pct", None) + print(f"[STOCKS] Count: {count}, Data Quality: {dq}%") + except Exception as e: + print(f"[STOCKS] Error reading report for stats: {e}") + + if reports_generated == 0: + print("No input files specified. 
Use --generate-all or specify input files.") + parser.print_help() + else: + print(f"\nGenerated {reports_generated} report(s) successfully!") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/main.py b/src/merge/main.py new file mode 100644 index 0000000000000000000000000000000000000000..4dc86aafdf08041b3caf4e8440c2e17cd9ac2720 --- /dev/null +++ b/src/merge/main.py @@ -0,0 +1,259 @@ +import subprocess +from pathlib import Path +import sys +import pandas as pd +from datetime import datetime, timedelta +from dotenv import load_dotenv +import gc +import psutil +import os + +# Memory optimization for merge operations +class MergeMemoryOptimizer: + """Memory optimizer for merge operations""" + + def __init__(self, max_memory_mb=350): + self.max_memory_mb = max_memory_mb + self.process = psutil.Process() + + def get_memory_usage(self): + return self.process.memory_info().rss / 1024 / 1024 + + def cleanup_after_script(self, script_name): + collected = gc.collect() + memory_after = self.get_memory_usage() + print(f"[MemOpt] After {script_name}: {memory_after:.1f}MB (freed {collected} objects)") + + if memory_after > self.max_memory_mb: + print(f"[MemOpt] WARNING: High memory after {script_name}") + # Additional cleanup attempt + gc.collect() + + return memory_after + +# Global memory optimizer instance +memory_optimizer = MergeMemoryOptimizer() + +DAYS_OLD = 7 +MERGED_PATH = Path("data/merged/features/merged_features.parquet") +ARCHIVE_DIR = Path("data/merged/archive") +ARCHIVE_DIR.mkdir(parents=True, exist_ok=True) + +def run_script(script, args=None): + cmd = [sys.executable, str(Path(__file__).parent / script)] + if args: + cmd += args + print(f"Running: {' '.join(cmd)}") + + # Check memory before running + memory_before = memory_optimizer.get_memory_usage() + print(f"[MemOpt] Before {script}: {memory_before:.1f}MB") + + result = subprocess.run(cmd, check=True) + + # Cleanup after running + memory_optimizer.cleanup_after_script(script) + + return result + +def archive_old_records(): + feature_files = [ + Path("data/merged/features/crypto_features.parquet"), + Path("data/merged/features/stocks_features.parquet") + ] + now = datetime.utcnow() + cutoff = int((now - timedelta(days=DAYS_OLD)).timestamp() * 1000) + + for feature_path in feature_files: + if not feature_path.exists(): + print(f"[WARN] {feature_path} does not exist.") + continue + + df = pd.read_parquet(feature_path) + old = df.loc[df['interval_timestamp'] < cutoff].copy() + keep = df.loc[df['interval_timestamp'] >= cutoff].copy() + + if old.empty: + print(f"[INFO] No records to archive in {feature_path}.") + continue + + # Group by day (UTC) and write each group to a separate parquet file under archive/{day}/ + old['archive_date'] = pd.to_datetime(old['interval_timestamp'], unit='ms').dt.strftime('%Y%m%d') + for day, group in old.groupby('archive_date'): + day_dir = ARCHIVE_DIR / day + day_dir.mkdir(parents=True, exist_ok=True) + out_path = day_dir / f"{feature_path.stem}_archived_{day}.parquet" + if out_path.exists(): + existing = pd.read_parquet(out_path) + group = pd.concat([existing, group.drop(columns=['archive_date'])], ignore_index=True) + else: + group = group.drop(columns=['archive_date']) + + group.to_parquet(out_path, index=False) + print(f"[ARCHIVE] {len(group)} records -> {out_path}") + + # Save the remaining (unarchived) records back to the feature file + keep.to_parquet(feature_path, index=False) + print(f"[INFO] Archived {len(old)} records from {feature_path}. 
{len(keep)} remain.") + +def store_in_cloud(): + # Import StorageHandler from cloud_utils, ensuring src is in sys.path + import os + import sys + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))) + from data_cloud.cloud_utils import StorageHandler + + # Filebase credentials from env + load_dotenv() + endpoint_url = os.getenv("FILEBASE_ENDPOINT") + access_key = os.getenv("FILEBASE_ACCESS_KEY") + secret_key = os.getenv("FILEBASE_SECRET_KEY") + bucket_name = os.getenv("FILEBASE_BUCKET") + if not all([endpoint_url, access_key, secret_key, bucket_name]): + print("[ERROR] Filebase credentials not set in environment.") + return + + storage = StorageHandler(endpoint_url, access_key, secret_key, bucket_name) + + merged_dir = os.path.join("data", "merged") + archive_dir = os.path.join(merged_dir, "archive") + # Upload all files in merged except archive + for root, dirs, files in os.walk(merged_dir): + # Skip archive subdir for now + if os.path.abspath(root) == os.path.abspath(archive_dir): + continue + for fname in files: + local_path = os.path.join(root, fname) + rel_path = os.path.relpath(local_path, "data") + key = rel_path.replace(os.sep, "/") + with open(local_path, "rb") as f: + data = f.read() + storage.upload(key, data) + + # Only upload archive files newer than DAYS_OLD days + import time + cutoff = time.time() - DAYS_OLD * 86400 + if os.path.exists(archive_dir): + for fname in os.listdir(archive_dir): + local_path = os.path.join(archive_dir, fname) + if not os.path.isfile(local_path): + continue + mtime = os.path.getmtime(local_path) + if mtime >= cutoff: + rel_path = os.path.relpath(local_path, "data") + key = rel_path.replace(os.sep, "/") + with open(local_path, "rb") as f: + data = f.read() + storage.upload(key, data) + +# Save stocks and crypto features to data/merged/raw +def save_raw_features(): + import shutil + raw_dir = Path('data/merged/raw') + raw_dir.mkdir(parents=True, exist_ok=True) + src_stocks = Path('data/merged/features/stocks_features.parquet') + src_crypto = Path('data/merged/features/crypto_features.parquet') + dst_stocks = raw_dir / 'stocks_features.parquet' + dst_crypto = raw_dir / 'crypto_features.parquet' + if src_stocks.exists(): + shutil.copy2(src_stocks, dst_stocks) + print(f"[RAW] Saved stocks features to {dst_stocks}") + else: + print(f"[RAW] Source stocks features not found: {src_stocks}") + if src_crypto.exists(): + shutil.copy2(src_crypto, dst_crypto) + print(f"[RAW] Saved crypto features to {dst_crypto}") + else: + print(f"[RAW] Source crypto features not found: {src_crypto}") + +def main(): + print("[MergeOpt] Starting memory-optimized merge pipeline...") + initial_memory = memory_optimizer.get_memory_usage() + print(f"[MergeOpt] Initial memory: {initial_memory:.1f}MB") + + # Run all merge steps with memory monitoring + run_script('merge_0.py') + run_script('merge_1.py', [ + '--latest', 'data/advisorai-data/features/latest_features.parquet', + '--finnhub', 'data/advisorai-data/features/latest_features.parquet', + '--out', 'data/merged/features/merged_features.parquet' + ]) + run_script('merge_2.py') + run_script('merge_3.py') + run_script('merge_4.py') + run_script('separator.py') + run_script('merge_5.py') + run_script('merge_6.py') + run_script('merge_7.py') + + save_raw_features() + + # Extract symbols from exchange symbol data before data fillers + try: + run_script('extract_symbols.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Symbol extraction failed: {e}") + + # Remove rows with 
null symbols after symbol extraction + try: + run_script('remove_null_symbols.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Null symbol removal failed: {e}") + + # # Run normalization scripts with error handling + # run_script('stocks_data_filler.py') + + # try: + # run_script('crypto_data_filler.py') + # except subprocess.CalledProcessError as e: + # print(f"[WARNING] Crypto data filler failed: {e}") + + # Merge temp files into merged - with error handling + try: + run_script('merge_temp.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Merge temp failed: {e}") + + try: + run_script('merge_sant.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Santiment merge failed: {e}") + + try: + run_script('merge_santiment_with_crypto.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Santiment-crypto merge failed: {e}") + + # # Final comprehensive null value handling - clean up any remaining nulls + # try: + # run_script('run_final_null_handling.py') + # except subprocess.CalledProcessError as e: + # print(f"[WARNING] Final null handling failed: {e}") + + # # Normalize features + # run_script('normalize.py') + # # Normalize train files for both crypto and stocks + # run_script('norm/crypto.py', ['--train']) + # run_script('norm/stocks.py', ['--train']) + + # Archive old records + archive_old_records() + + # Generate and store full report + run_script('full_report.py') + + # Store all merged data in cloud + store_in_cloud() + + # Final memory check + final_memory = memory_optimizer.get_memory_usage() + print(f"[MergeOpt] Final memory usage: {final_memory:.1f}MB") + + if final_memory > 400: + print("[MergeOpt] WARNING: High final memory usage") + memory_optimizer.cleanup_after_script("final cleanup") + + print("[OK] All merge steps, null handling, normalization, and reporting completed successfully.") + +if __name__ == "__main__": + main() diff --git a/src/merge/main_memory_optimized.py b/src/merge/main_memory_optimized.py new file mode 100644 index 0000000000000000000000000000000000000000..7be4ffe5a3b57e82a414a3a53a5ec77220bfcb32 --- /dev/null +++ b/src/merge/main_memory_optimized.py @@ -0,0 +1,85 @@ +""" +Memory-Optimized Merge Wrapper +Wraps the main merge function with memory monitoring and cleanup +""" + +import gc +import os +import sys +import psutil +from pathlib import Path + +class MergeMemoryOptimizer: + """Memory optimizer specifically for merge operations""" + + def __init__(self, max_memory_mb=350): # Conservative limit for merge operations + self.max_memory_mb = max_memory_mb + self.process = psutil.Process() + + def get_memory_usage(self): + """Get current memory usage in MB""" + return self.process.memory_info().rss / 1024 / 1024 + + def cleanup_after_script(self, script_name): + """Cleanup after running a merge script""" + # Force garbage collection + collected = gc.collect() + + # Clear any cached modules + modules_to_clear = [m for m in sys.modules.keys() if 'merge' in m or 'pandas' in m] + for module in modules_to_clear: + if module in sys.modules and module != __name__: + try: + del sys.modules[module] + except: + pass + + memory_after = self.get_memory_usage() + print(f"[MemOpt] After {script_name}: {memory_after:.1f}MB (freed {collected} objects)") + + if memory_after > self.max_memory_mb: + print(f"[MemOpt] WARNING: High memory after {script_name}: {memory_after:.1f}MB") + + return memory_after + +# Import the original main function +def main(): + """Memory-optimized wrapper for merge main""" + 
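The wrapper leans on psutil's resident-set-size reading plus an explicit garbage-collection pass; here is a minimal standalone sketch of that measurement pattern (helper names are illustrative, and the 350 MB threshold mirrors the default used above).

import gc
import psutil

def rss_mb() -> float:
    # Resident set size of the current process, in megabytes.
    return psutil.Process().memory_info().rss / 1024 / 1024

def checkpoint(label: str, limit_mb: float = 350.0) -> float:
    freed = gc.collect()  # force a full collection pass before measuring
    usage = rss_mb()
    print(f"[MemOpt] after {label}: {usage:.1f}MB (freed {freed} objects)")
    if usage > limit_mb:
        print(f"[MemOpt] WARNING: above the {limit_mb:.0f}MB budget")
    return usage

checkpoint("demo step")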
optimizer = MergeMemoryOptimizer() + + initial_memory = optimizer.get_memory_usage() + print(f"[MergeOpt] Starting merge operations - Memory: {initial_memory:.1f}MB") + + try: + # Import and run the original main function + from merge.main_original import main as original_main + + # Monitor memory during execution + result = original_main() + + # Final cleanup + final_memory = optimizer.cleanup_after_script("all merge operations") + print(f"[MergeOpt] Final merge memory: {final_memory:.1f}MB") + + return result + + except ImportError: + # Fallback to current main if original doesn't exist + print("[MergeOpt] No original main found, running current implementation...") + + # Import the current implementation + import merge.main as current_main + result = current_main.main() + + # Cleanup + optimizer.cleanup_after_script("current merge implementation") + + return result + + except Exception as e: + print(f"[MergeOpt] Error in merge operations: {e}") + optimizer.cleanup_after_script("error cleanup") + raise + +if __name__ == "__main__": + main() diff --git a/src/merge/main_original.py b/src/merge/main_original.py new file mode 100644 index 0000000000000000000000000000000000000000..228786940689423a2661de9205459a3b485f99a0 --- /dev/null +++ b/src/merge/main_original.py @@ -0,0 +1,209 @@ +import subprocess +from pathlib import Path +import sys +import pandas as pd +from datetime import datetime, timedelta +from dotenv import load_dotenv + +DAYS_OLD = 7 +MERGED_PATH = Path("data/merged/features/merged_features.parquet") +ARCHIVE_DIR = Path("data/merged/archive") +ARCHIVE_DIR.mkdir(parents=True, exist_ok=True) + +def run_script(script, args=None): + cmd = [sys.executable, str(Path(__file__).parent / script)] + if args: + cmd += args + print(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd, check=True) + return result + +def archive_old_records(): + feature_files = [ + Path("data/merged/features/crypto_features.parquet"), + Path("data/merged/features/stocks_features.parquet") + ] + now = datetime.utcnow() + cutoff = int((now - timedelta(days=DAYS_OLD)).timestamp() * 1000) + + for feature_path in feature_files: + if not feature_path.exists(): + print(f"[WARN] {feature_path} does not exist.") + continue + + df = pd.read_parquet(feature_path) + old = df.loc[df['interval_timestamp'] < cutoff].copy() + keep = df.loc[df['interval_timestamp'] >= cutoff].copy() + + if old.empty: + print(f"[INFO] No records to archive in {feature_path}.") + continue + + # Group by day (UTC) and write each group to a separate parquet file under archive/{day}/ + old['archive_date'] = pd.to_datetime(old['interval_timestamp'], unit='ms').dt.strftime('%Y%m%d') + for day, group in old.groupby('archive_date'): + day_dir = ARCHIVE_DIR / day + day_dir.mkdir(parents=True, exist_ok=True) + out_path = day_dir / f"{feature_path.stem}_archived_{day}.parquet" + if out_path.exists(): + existing = pd.read_parquet(out_path) + group = pd.concat([existing, group.drop(columns=['archive_date'])], ignore_index=True) + else: + group = group.drop(columns=['archive_date']) + + group.to_parquet(out_path, index=False) + print(f"[ARCHIVE] {len(group)} records -> {out_path}") + + # Save the remaining (unarchived) records back to the feature file + keep.to_parquet(feature_path, index=False) + print(f"[INFO] Archived {len(old)} records from {feature_path}. 
{len(keep)} remain.") + +def store_in_cloud(): + # Import StorageHandler from cloud_utils, ensuring src is in sys.path + import os + import sys + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))) + from data_cloud.cloud_utils import StorageHandler + + # Filebase credentials from env + load_dotenv() + endpoint_url = os.getenv("FILEBASE_ENDPOINT") + access_key = os.getenv("FILEBASE_ACCESS_KEY") + secret_key = os.getenv("FILEBASE_SECRET_KEY") + bucket_name = os.getenv("FILEBASE_BUCKET") + if not all([endpoint_url, access_key, secret_key, bucket_name]): + print("[ERROR] Filebase credentials not set in environment.") + return + + storage = StorageHandler(endpoint_url, access_key, secret_key, bucket_name) + + merged_dir = os.path.join("data", "merged") + archive_dir = os.path.join(merged_dir, "archive") + # Upload all files in merged except archive + for root, dirs, files in os.walk(merged_dir): + # Skip archive subdir for now + if os.path.abspath(root) == os.path.abspath(archive_dir): + continue + for fname in files: + local_path = os.path.join(root, fname) + rel_path = os.path.relpath(local_path, "data") + key = rel_path.replace(os.sep, "/") + with open(local_path, "rb") as f: + data = f.read() + storage.upload(key, data) + + # Only upload archive files newer than DAYS_OLD days + import time + cutoff = time.time() - DAYS_OLD * 86400 + if os.path.exists(archive_dir): + for fname in os.listdir(archive_dir): + local_path = os.path.join(archive_dir, fname) + if not os.path.isfile(local_path): + continue + mtime = os.path.getmtime(local_path) + if mtime >= cutoff: + rel_path = os.path.relpath(local_path, "data") + key = rel_path.replace(os.sep, "/") + with open(local_path, "rb") as f: + data = f.read() + storage.upload(key, data) + +# Save stocks and crypto features to data/merged/raw +def save_raw_features(): + import shutil + raw_dir = Path('data/merged/raw') + raw_dir.mkdir(parents=True, exist_ok=True) + src_stocks = Path('data/merged/features/stocks_features.parquet') + src_crypto = Path('data/merged/features/crypto_features.parquet') + dst_stocks = raw_dir / 'stocks_features.parquet' + dst_crypto = raw_dir / 'crypto_features.parquet' + if src_stocks.exists(): + shutil.copy2(src_stocks, dst_stocks) + print(f"[RAW] Saved stocks features to {dst_stocks}") + else: + print(f"[RAW] Source stocks features not found: {src_stocks}") + if src_crypto.exists(): + shutil.copy2(src_crypto, dst_crypto) + print(f"[RAW] Saved crypto features to {dst_crypto}") + else: + print(f"[RAW] Source crypto features not found: {src_crypto}") + +def main(): + # Run all merge steps + run_script('merge_0.py') + run_script('merge_1.py', [ + '--latest', 'data/advisorai-data/features/latest_features.parquet', + '--finnhub', 'data/advisorai-data/features/latest_features.parquet', + '--out', 'data/merged/features/merged_features.parquet' + ]) + run_script('merge_2.py') + run_script('merge_3.py') + run_script('merge_4.py') + run_script('separator.py') + run_script('merge_5.py') + run_script('merge_6.py') + run_script('merge_7.py') + + save_raw_features() + + # Extract symbols from exchange symbol data before data fillers + try: + run_script('extract_symbols.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Symbol extraction failed: {e}") + + # Remove rows with null symbols after symbol extraction + try: + run_script('remove_null_symbols.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Null symbol removal failed: {e}") + + # # Run normalization 
scripts with error handling + # run_script('stocks_data_filler.py') + + # try: + # run_script('crypto_data_filler.py') + # except subprocess.CalledProcessError as e: + # print(f"[WARNING] Crypto data filler failed: {e}") + + # Merge temp files into merged - with error handling + try: + run_script('merge_temp.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Merge temp failed: {e}") + + try: + run_script('merge_sant.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Santiment merge failed: {e}") + + try: + run_script('merge_santiment_with_crypto.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Santiment-crypto merge failed: {e}") + + # # Final comprehensive null value handling - clean up any remaining nulls + # try: + # run_script('run_final_null_handling.py') + # except subprocess.CalledProcessError as e: + # print(f"[WARNING] Final null handling failed: {e}") + + # # Normalize features + # run_script('normalize.py') + # # Normalize train files for both crypto and stocks + # run_script('norm/crypto.py', ['--train']) + # run_script('norm/stocks.py', ['--train']) + + # Archive old records + archive_old_records() + + # Generate and store full report + run_script('full_report.py') + + # Store all merged data in cloud + store_in_cloud() + + print("[OK] All merge steps, null handling, normalization, and reporting completed successfully.") + +if __name__ == "__main__": + main() diff --git a/src/merge/manual_null_handler.py b/src/merge/manual_null_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..f49eb16199312ca6481981529157100ddb1786a8 --- /dev/null +++ b/src/merge/manual_null_handler.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +""" +Manual Null Handler - Standalone script for manual execution +Use this when you need to handle null values without running the full pipeline +""" + +import argparse +import sys +from pathlib import Path +import pandas as pd +from final_null_handler import process_crypto_features_file, process_stock_features_file, process_merged_features_file +from run_final_null_handling import process_merged_features_file + +def main(): + parser = argparse.ArgumentParser(description='Handle null values in feature files') + parser.add_argument('--crypto', action='store_true', help='Process crypto features only') + parser.add_argument('--stocks', action='store_true', help='Process stock features only') + parser.add_argument('--merged', action='store_true', help='Process merged features only') + parser.add_argument('--all', action='store_true', help='Process all feature files') + parser.add_argument('--input', type=str, help='Input file path (overrides default paths)') + parser.add_argument('--output', type=str, help='Output file path (defaults to input path)') + parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes') + + args = parser.parse_args() + + # Default paths + default_paths = { + 'crypto': Path("data/merged/features/crypto_features.parquet"), + 'stocks': Path("data/merged/features/stocks_features.parquet"), + 'merged': Path("data/merged/features/merged_features.parquet") + } + + if not any([args.crypto, args.stocks, args.merged, args.all, args.input]): + print("Error: Must specify --crypto, --stocks, --merged, --all, or --input") + parser.print_help() + return 1 + + files_to_process = [] + + if args.input: + # Custom input file + input_path = Path(args.input) + if not input_path.exists(): + print(f"Error: Input file {input_path} does 
not exist") + return 1 + + # Detect file type based on name or content + if 'crypto' in input_path.name.lower(): + file_type = 'crypto' + elif 'stock' in input_path.name.lower(): + file_type = 'stocks' + elif 'merged' in input_path.name.lower(): + file_type = 'merged' + else: + # Try to detect from content + try: + df_sample = pd.read_parquet(input_path, nrows=10) + if 'rank' in df_sample.columns or 'dominance' in df_sample.columns: + file_type = 'crypto' + elif 'strongBuy' in df_sample.columns or 'news_activity_score_x' in df_sample.columns: + file_type = 'stocks' + else: + file_type = 'merged' + except Exception: + file_type = 'merged' # Default + + output_path = Path(args.output) if args.output else input_path + files_to_process.append((input_path, output_path, file_type)) + + else: + # Use default paths based on flags + if args.all: + for file_type, path in default_paths.items(): + if path.exists(): + files_to_process.append((path, path, file_type)) + else: + if args.crypto and default_paths['crypto'].exists(): + files_to_process.append((default_paths['crypto'], default_paths['crypto'], 'crypto')) + if args.stocks and default_paths['stocks'].exists(): + files_to_process.append((default_paths['stocks'], default_paths['stocks'], 'stocks')) + if args.merged and default_paths['merged'].exists(): + files_to_process.append((default_paths['merged'], default_paths['merged'], 'merged')) + + if not files_to_process: + print("Error: No files found to process") + return 1 + + print("="*60) + print("MANUAL NULL VALUE HANDLER") + print("="*60) + + if args.dry_run: + print("DRY RUN MODE - No changes will be made") + print() + + for input_path, output_path, file_type in files_to_process: + print(f"\nProcessing: {input_path}") + print(f"Type: {file_type}") + print(f"Output: {output_path}") + + if args.dry_run: + try: + df = pd.read_parquet(input_path) + null_count = df.isnull().sum().sum() + print(f"Would process {len(df)} rows with {null_count} null values") + except Exception as e: + print(f"Error reading file: {e}") + continue + + try: + if file_type == 'crypto': + df_processed, report = process_crypto_features_file(input_path, output_path) + elif file_type == 'stocks': + df_processed, report = process_stock_features_file(input_path, output_path) + elif file_type == 'merged': + df_processed, report = process_merged_features_file(input_path) + + print(f"✅ Successfully processed {file_type} features:") + print(f" - Rows: {len(df_processed):,}") + print(f" - Nulls filled: {report['total_nulls_filled']:,}") + print(f" - Columns fixed: {report['columns_fixed']}") + + except Exception as e: + print(f"❌ Error processing {input_path}: {e}") + return 1 + + print("\n" + "="*60) + print("MANUAL NULL HANDLING COMPLETED") + print("="*60) + + return 0 + +if __name__ == "__main__": + exit_code = main() + sys.exit(exit_code) diff --git a/src/merge/merge_0.py b/src/merge/merge_0.py new file mode 100644 index 0000000000000000000000000000000000000000..1137edd8c0bac2fb4fb56e0d66a0f7e378c16e48 --- /dev/null +++ b/src/merge/merge_0.py @@ -0,0 +1,20 @@ +import os +import shutil +from pathlib import Path + +def step0_move_old_merged(): + """ + Move the old merged features file to data/merged/temp for later remerge and deletion. 
+ """ + merged_dir = Path("data/merged/features") + temp_dir = Path("data/merged/temp") + temp_dir.mkdir(parents=True, exist_ok=True) + # Move all files from merged_dir to temp_dir + for f in merged_dir.glob("*"): + if f.is_file(): + dest = temp_dir / f.name + print(f"[INFO] Moving {f} -> {dest}") + shutil.move(str(f), str(dest)) + +if __name__ == "__main__": + step0_move_old_merged() diff --git a/src/merge/merge_1.py b/src/merge/merge_1.py new file mode 100644 index 0000000000000000000000000000000000000000..6b74c46b44687e8573fdc13e91c4becc64d554c5 --- /dev/null +++ b/src/merge/merge_1.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +""" +Simple merge step 1: Copy latest features to merged features. +This creates the initial merged_features.parquet file for the pipeline. +""" + +import shutil +from pathlib import Path + +def main(): + """Copy latest features to merged features directory.""" + + # Source file + source_path = Path("data/advisorai-data/features/latest_features.parquet") + + # Destination file + dest_path = Path("data/merged/features/merged_features.parquet") + + # Create destination directory if it doesn't exist + dest_path.parent.mkdir(parents=True, exist_ok=True) + + # Check if source file exists + if not source_path.exists(): + raise FileNotFoundError(f"Source file not found: {source_path}") + + # Copy the file + shutil.copy2(source_path, dest_path) + + # Read and report basic info + import pandas as pd + df = pd.read_parquet(dest_path) + + print(f"OK wrote {dest_path} -> {len(df)} rows x {len(df.columns)} cols") + +if __name__ == "__main__": + main() diff --git a/src/merge/merge_2.py b/src/merge/merge_2.py new file mode 100644 index 0000000000000000000000000000000000000000..9df753f2ce82fed18df588a64008314e839735b8 --- /dev/null +++ b/src/merge/merge_2.py @@ -0,0 +1,233 @@ +""" +Merge your features JSON with coin-metadata JSON, or merge a crypto-bubbles +Parquet into your merged-features Parquet. + +Usage: + # JSON mode (default): + python merge_2.py json \ + --features data/merged/features/merged_features.json \ + --coininfo data/coininfo/coin_metadata.json \ + --out merged_with_coininfo.ndjson + + # Parquet mode: + python merge_2.py parquet \ + --base data/merged/features/merged_features.parquet \ + --bubbles data/crypto-bubbles/crypto_bubbles_2025-07-15.parquet \ + --out data/merged/features/merged_features.parquet +""" +import json +import pandas as pd +from datetime import datetime +from pathlib import Path +import argparse + +def merge_parquet_features(base_fp: Path, bubbles_fp: Path, out_fp: Path): + """ + Merge crypto bubbles Parquet into merged features Parquet. + For overlapping columns, non-null bubbles values overwrite base. + New columns from bubbles are added. 
+ """ + + import time + base = pd.read_parquet(base_fp) + bubbles = pd.read_parquet(bubbles_fp) + + # Fill missing interval_timestamp with current UTC ms, ensure int (ms) robustly + now_ms = int(time.time() * 1000) + def to_millis(val): + if pd.isna(val): + return pd.NA + if isinstance(val, (pd.Timestamp, datetime)): + return val.value // 1_000_000 + try: + return int(float(val)) + except (ValueError, TypeError): + try: + return int(pd.to_datetime(val).value // 1_000_000) + except Exception: + return pd.NA + + for df in (base, bubbles): + if 'interval_timestamp' in df.columns: + df['interval_timestamp'] = df['interval_timestamp'].fillna(now_ms) + df['interval_timestamp'] = df['interval_timestamp'].map(to_millis).astype('Int64') + + # Rename 'slug' in bubbles to 'symbol' for join, if needed + bubbles_renamed = bubbles.rename(columns={"slug": "symbol"}) if "slug" in bubbles.columns else bubbles + # Remove duplicate columns, keep first occurrence + bubbles_renamed = bubbles_renamed.loc[:, ~bubbles_renamed.columns.duplicated()] + + # Use 'symbol' and 'interval_timestamp' as join keys + keys = [k for k in ["symbol", "interval_timestamp"] if k in base.columns and k in bubbles_renamed.columns] + if not all(k in base.columns for k in keys) or not all(k in bubbles_renamed.columns for k in keys): + raise ValueError("No common key columns found for merge (need 'symbol' and 'interval_timestamp').") + + # Normalize symbol column in both DataFrames for robust merging + def normalize_symbol_col(df): + df['symbol'] = df['symbol'].astype(str).str.lower() + # Map 'ripple' <-> 'xrp' both ways for robust merging + df['symbol'] = df['symbol'].replace({'ripple': 'xrp', 'xrp/ripple': 'xrp'}) + # Also add a step to map 'xrp' to 'ripple' for output if needed + df['symbol'] = df['symbol'].replace({'xrp': 'ripple'}) + return df + bubbles_renamed = normalize_symbol_col(bubbles_renamed) + base = normalize_symbol_col(base) + + # Pick top 50 by rank if present, else first 50 unique + if 'rank' in bubbles_renamed.columns: + sorted_bubbles = bubbles_renamed.sort_values('rank') + else: + sorted_bubbles = bubbles_renamed + top_50 = sorted_bubbles.drop_duplicates(subset='symbol').head(50) + + # Always include these must-have assets + must_have = {'xrp', 'ripple', 'solana','eth','btc','bitcoin','ethereum', 'sol', 'ada', 'cardano'} + extra = bubbles_renamed[bubbles_renamed['symbol'].isin(must_have)] + + # Combine and dedupe on available keys + dedup_cols = ['symbol'] + if 'interval_timestamp' in pd.concat([top_50, extra]).columns: + dedup_cols.append('interval_timestamp') + bubbles_renamed = pd.concat([top_50, extra]).drop_duplicates(subset=dedup_cols) + + base = base.set_index(keys) + bubbles_renamed = bubbles_renamed.set_index(keys) + + # Union of columns, with bubbles first so its columns take precedence + all_cols = list(dict.fromkeys(bubbles_renamed.columns.tolist() + base.columns.tolist())) + base = base.reindex(columns=all_cols) + bubbles_renamed = bubbles_renamed.reindex(columns=all_cols) + + merged = bubbles_renamed.combine_first(base).reset_index() + # Ensure 'symbol' column matches the index value for every row + if 'symbol' in merged.columns: + merged['symbol'] = merged['symbol'].astype(str) + # Always output 'ripple' instead of 'xrp' + merged['symbol'] = merged['symbol'].replace({'xrp': 'ripple'}) + + # Ensure interval_timestamp is never null in the output and is int (ms), robustly + if 'interval_timestamp' in merged.columns: + merged['interval_timestamp'] = merged['interval_timestamp'].fillna(now_ms) + 
merged['interval_timestamp'] = merged['interval_timestamp'].map(to_millis).astype('Int64') + + # Set is_crypto=1 where is_crypto is null or symbol is 'solana' + if 'is_crypto' in merged.columns: + merged['is_crypto'] = merged['is_crypto'].fillna(1) + if 'symbol' in merged.columns: + merged.loc[merged['symbol'].str.lower() == 'solana', 'is_crypto'] = 1 + + # Drop unwanted columns + for col in ['id', 'name', 'image']: + if col in merged.columns: + merged = merged.drop(columns=col) + + merged.to_parquet(out_fp, index=False) + print(f"OK Merged top 50 from {bubbles_fp} into {base_fp} -> {out_fp} " + f"({merged.shape[0]} rows x {merged.shape[1]} cols)") + + +def load_json_records(path: Path): + """ + Load a JSON file that is either: + - A single JSON object, + - A list of objects, + - Or NDJSON (one JSON object per line). + Returns: List[dict] + """ + text = path.read_text(encoding="utf8") + try: + data = json.loads(text) + except json.JSONDecodeError: + data = [json.loads(line) for line in text.splitlines() if line.strip()] + if isinstance(data, dict): + data = [data] + return data + + +def main_json_merge(features_fp: Path, coininfo_fp: Path, out_fp: Path): + # 1) load features + feats = load_json_records(features_fp) + df_feats = pd.json_normalize(feats) + + # 2) load coin metadata + coins = load_json_records(coininfo_fp) + df_coins = pd.json_normalize(coins) + + # 3) prepare a normalized join key + df_feats["join_key"] = df_feats["symbol"] + df_coins["join_key"] = df_coins["slug"].str.lower() + + # 4) merge + df_merged = df_feats.merge( + df_coins, + on="join_key", + how="left", + suffixes=("", "_meta") + ) + + # 5) clean up + df_merged = df_merged.drop(columns=["join_key"]) + if "symbol_meta" in df_merged.columns: + df_merged = df_merged.drop(columns=["symbol_meta"]) + + # 6) write out as NDJSON + out_fp.parent.mkdir(parents=True, exist_ok=True) + with open(out_fp, "w", encoding="utf8") as f: + for rec in df_merged.to_dict(orient="records"): + f.write(json.dumps(rec) + "\n") + + print(f"✅ Wrote {len(df_merged)} merged records to {out_fp}") + + +def cli(): + p = argparse.ArgumentParser(__doc__) + sub = p.add_subparsers(dest="mode", required=False) + + # JSON merge mode (default) + js = sub.add_parser("json", help="Merge features JSON with coininfo JSON") + js.add_argument("--features", type=Path, + default=Path("data/merged/features/merged_features.json"), + help="Path to merged_features JSON/NDJSON") + js.add_argument("--coininfo", type=Path, + default=Path("data/coininfo/coin_metadata.json"), + help="Path to coin-metadata JSON/NDJSON") + js.add_argument("--out", type=Path, + default=Path("merged_with_coininfo.ndjson"), + help="Where to write the merged NDJSON") + + # Parquet merge mode + pq = sub.add_parser("parquet", help="Merge crypto bubbles Parquet into merged features Parquet") + pq.add_argument("--base", type=Path, + default=Path("data/merged/features/merged_features.parquet"), + help="Path to base merged-features Parquet") + pq.add_argument("--bubbles", type=Path, + default=None, + help="Path to crypto bubbles Parquet (if not set, will use latest in data/crypto-bubbles/)") + pq.add_argument("--out", type=Path, + default=Path("data/merged/features/merged_features.parquet"), + help="Where to write the merged Parquet") + + args = p.parse_args() + # If no subcommand is given, default to 'parquet' and reparse + if args.mode is None: + import sys + sys.argv.insert(1, "parquet") + args = p.parse_args() + + # If bubbles is not provided, find the latest crypto_bubbles_*.parquet + if 
args.mode == "parquet": + if args.bubbles is None or not args.bubbles.exists(): + import glob + import os + bubble_files = glob.glob(os.path.join("data", "crypto-bubbles", "crypto_bubbles_*.parquet")) + if not bubble_files: + raise FileNotFoundError("No crypto_bubbles_*.parquet files found in data/crypto-bubbles/") + latest_bubble = max(bubble_files, key=os.path.getmtime) + print(f"[INFO] Using latest bubbles file: {latest_bubble}") + args.bubbles = Path(latest_bubble) + merge_parquet_features(args.base, args.bubbles, args.out) + else: + main_json_merge(args.features, args.coininfo, args.out) + +if __name__ == "__main__": + cli() diff --git a/src/merge/merge_3.py b/src/merge/merge_3.py new file mode 100644 index 0000000000000000000000000000000000000000..1046e94fdd2289ff155bfdaf65436dcd3f9e27d3 --- /dev/null +++ b/src/merge/merge_3.py @@ -0,0 +1,372 @@ +import sys +import os +import numpy as np +import pandas as pd +from datetime import datetime + +# Ensure src/merge is in the path for import +sys.path.append(os.path.dirname(__file__)) + +from alpaca_features import build_features, save + +def create_symbol_mapping(): + """ + Create mapping between crypto full names and ticker symbols. + """ + # Common crypto symbol mappings + crypto_mapping = { + # Major cryptocurrencies + 'bitcoin': 'BTC', + 'ethereum': 'ETH', + 'binancecoin': 'BNB', + 'ripple': 'XRP', + 'cardano': 'ADA', + 'solana': 'SOL', + 'dogecoin': 'DOGE', + 'polkadot': 'DOT', + 'matic-network': 'MATIC', + 'polygon': 'MATIC', + 'avalanche-2': 'AVAX', + 'avalanche': 'AVAX', + 'chainlink': 'LINK', + 'litecoin': 'LTC', + 'bitcoin-cash': 'BCH', + 'stellar': 'XLM', + 'vechain': 'VET', + 'ethereum-classic': 'ETC', + 'filecoin': 'FIL', + 'tron': 'TRX', + 'monero': 'XMR', + 'eos': 'EOS', + 'aave': 'AAVE', + 'maker': 'MKR', + 'compound': 'COMP', + 'uniswap': 'UNI', + 'yearn-finance': 'YFI', + 'sushi': 'SUSHI', + 'curve-dao-token': 'CRV', + 'pancakeswap-token': 'CAKE', + 'terra-luna': 'LUNA', + 'fantom': 'FTM', + 'harmony': 'ONE', + 'near': 'NEAR', + 'algorand': 'ALGO', + 'cosmos': 'ATOM', + 'internet-computer': 'ICP', + 'helium': 'HNT', + 'theta-token': 'THETA', + 'chiliz': 'CHZ', + 'decentraland': 'MANA', + 'the-sandbox': 'SAND', + 'axie-infinity': 'AXS', + 'shiba-inu': 'SHIB', + 'apecoin': 'APE', + 'gala': 'GALA', + 'enjincoin': 'ENJ', + 'flow': 'FLOW', + 'basic-attention-token': 'BAT', + 'omg': 'OMG', + 'loopring': 'LRC', + 'immutable-x': 'IMX', + 'render-token': 'RNDR', + 'quant-network': 'QNT', + 'injective-protocol': 'INJ', + 'sei-network': 'SEI', + 'arbitrum': 'ARB', + 'optimism': 'OP', + 'blur': 'BLUR', + 'pepe': 'PEPE', + 'bonk': 'BONK', + 'wormhole': 'W', + 'jupiter-exchange-solana': 'JUP', + 'worldcoin-wld': 'WLD', + 'pyth-network': 'PYTH', + 'jito': 'JTO', + 'tensor': 'TNSR', + 'meme': 'MEME', + 'cat-in-a-dogs-world': 'MEW', + 'book-of-meme': 'BOME', + 'dogwifhat': 'WIF', + 'popcat': 'POPCAT', + 'goatseus-maximus': 'GOAT', + 'peanut-the-squirrel': 'PNUT', + 'act-i-the-ai-prophecy': 'ACT', + 'fartcoin': 'FARTCOIN', + 'ai16z': 'AI16Z', + 'virtual-protocol': 'VIRTUAL', + 'zerebro': 'ZEREBRO', + 'griffain': 'GRIFFAIN', + 'aixbt-by-virtuals': 'AIXBT', + 'marc-and-ethan-are-based': 'BASED', + 'pudgy-penguins': 'PENGU', + 'hyperliquid': 'HYPE', + 'move-movement': 'MOVE', + 'usual': 'USUAL', + 'reserve-rights': 'RSR', + 'ondo-finance': 'ONDO', + 'ethena': 'ENA', + 'eigenlayer': 'EIGEN', + 'grass': 'GRASS', + 'io': 'IO', + 'notcoin': 'NOT', + 'turbo': 'TURBO', + 'jasmy': 'JASMY', + 'neo': 'NEO', + 'iota': 'IOTA', + 'dash': 'DASH', + 
'zcash': 'ZEC', + 'waves': 'WAVES', + } + + # Create reverse mapping (ticker -> full name) + reverse_mapping = {v.lower(): k for k, v in crypto_mapping.items()} + + # Also add the forward mapping (full name -> ticker) + forward_mapping = {k: v.lower() for k, v in crypto_mapping.items()} + + return crypto_mapping, reverse_mapping, forward_mapping + +def normalize_symbols(df, symbol_col, is_alpaca=False): + """ + Normalize symbols to handle crypto name/ticker differences and stock symbols. + """ + df = df.copy() + crypto_mapping, reverse_mapping, forward_mapping = create_symbol_mapping() + + # Convert to lowercase for consistency + df[symbol_col] = df[symbol_col].str.lower() + + if is_alpaca: + # Alpaca uses tickers (BTC, ETH, etc. for crypto, NVDA, AAPL, etc. for stocks) + # For crypto: Map tickers to full names to match merged data + # For stocks: Keep the ticker symbol as-is (in lowercase) + + def map_alpaca_symbol(symbol): + symbol_lower = symbol.lower() + + # Check if it's a crypto ticker that needs mapping + if symbol_lower in reverse_mapping: + return reverse_mapping[symbol_lower] + else: + # It's likely a stock symbol, keep as-is (lowercase) + return symbol_lower + + df[symbol_col] = df[symbol_col].apply(map_alpaca_symbol) + else: + # Merged data uses full names for crypto (bitcoin, ethereum, etc.) + # and should use lowercase tickers for stocks (nvda, aapl, etc.) + # Keep as is, but ensure lowercase + pass + + return df + +def merge_alpaca_features(): + """ + Merge Alpaca features with existing merged features. + Handles timestamp alignment, column conflicts, and symbol mapping. + """ + + # Step 1: Create Alpaca features + alpaca_df = build_features() + save(alpaca_df) + + # Step 2: Load merged features + try: + from src import config as app_config + base_dir = app_config.DATA_DIR + except Exception: + base_dir = os.environ.get("DATA_DIR", "/data") + merged_path = os.path.join(base_dir, "merged", "features", "merged_features.parquet") + + merged_df = pd.read_parquet(merged_path) + + # Normalize symbols + alpaca_df_normalized = normalize_symbols(alpaca_df, "symbol", is_alpaca=True) + merged_df_normalized = normalize_symbols(merged_df, "symbol", is_alpaca=False) + + # Find overlapping symbols + alpaca_normalized = set(alpaca_df_normalized["symbol"].unique()) + merged_normalized = set(merged_df_normalized["symbol"].unique()) + overlapping_symbols = alpaca_normalized.intersection(merged_normalized) + missing_in_merged = alpaca_normalized - merged_normalized + + + # Step 6: Handle symbols that exist only in Alpaca data + if missing_in_merged: + + new_symbol_rows = [] + for missing_symbol in missing_in_merged: + # Get actual data for this symbol from Alpaca + symbol_data = alpaca_df_normalized[alpaca_df_normalized["symbol"] == missing_symbol] + if len(symbol_data) == 0: + continue + + + # Create rows based on Alpaca timestamps, not merged timestamps + for _, alpaca_row in symbol_data.iterrows(): + new_row = { + "symbol": missing_symbol, + "interval_timestamp": alpaca_row["timestamp"], # Use Alpaca timestamp + "is_stock": True if missing_symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] else False, + "is_crypto": False if missing_symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] else True, + "stock_market": "NASDAQ" if missing_symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] else None, + "feature_timestamp": pd.Timestamp.now().value // 1000000, # Convert to milliseconds + } + + # Copy all Alpaca feature columns 
into the new row + for col in alpaca_row.index: + if col not in new_row: + new_row[col] = alpaca_row[col] + + # Add all other columns from merged_df with NaN values (except the ones we set above) + for col in merged_df_normalized.columns: + if col not in new_row: + new_row[col] = np.nan + + new_symbol_rows.append(new_row) + + if new_symbol_rows: + new_symbols_df = pd.DataFrame(new_symbol_rows) + merged_df_normalized = pd.concat([merged_df_normalized, new_symbols_df], ignore_index=True) + + # Step 7: Check for overlapping columns and handle them + join_keys = ["symbol", "timestamp", "interval_timestamp"] + alpaca_cols = set(alpaca_df_normalized.columns) - set(join_keys) + merged_cols = set(merged_df_normalized.columns) - set(join_keys) + overlapping_cols = alpaca_cols.intersection(merged_cols) + + # Convert timestamps to datetime for processing (use pd.concat to avoid fragmentation) + timestamp_columns = {} + + if "timestamp" in alpaca_df_normalized.columns: + timestamp_columns["timestamp_dt"] = pd.to_datetime(alpaca_df_normalized["timestamp"], unit="ms") + + if "interval_timestamp" in merged_df_normalized.columns: + timestamp_columns["interval_timestamp_dt"] = pd.to_datetime(merged_df_normalized["interval_timestamp"], unit="ms") + + # Add timestamp columns efficiently using pd.concat + if timestamp_columns: + for col_name, col_data in timestamp_columns.items(): + if col_name == "timestamp_dt" and "timestamp" in alpaca_df_normalized.columns: + alpaca_df_normalized = pd.concat([alpaca_df_normalized, col_data.to_frame(col_name)], axis=1) + elif col_name == "interval_timestamp_dt" and "interval_timestamp" in merged_df_normalized.columns: + merged_df_normalized = pd.concat([merged_df_normalized, col_data.to_frame(col_name)], axis=1) + + # Perform an OUTER merge to capture all data from both sources + final_merge = pd.merge( + merged_df_normalized, + alpaca_df_normalized, + left_on=["symbol", "interval_timestamp"], + right_on=["symbol", "timestamp"], + how="outer", # Changed from "left" to "outer" + suffixes=("", "_alpaca") + ) + + # For rows that came only from Alpaca (new symbols), copy the timestamp to interval_timestamp + alpaca_only_mask = final_merge["interval_timestamp"].isna() & final_merge["timestamp"].notna() + if alpaca_only_mask.any(): + final_merge.loc[alpaca_only_mask, "interval_timestamp"] = final_merge.loc[alpaca_only_mask, "timestamp"] + + # Set basic metadata for these new rows + final_merge.loc[alpaca_only_mask, "feature_timestamp"] = pd.Timestamp.now().value // 1000000 + + # Set stock/crypto flags based on symbol + for symbol in final_merge.loc[alpaca_only_mask, "symbol"].unique(): + symbol_mask = alpaca_only_mask & (final_merge["symbol"] == symbol) + is_stock = symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] + final_merge.loc[symbol_mask, "is_stock"] = is_stock + final_merge.loc[symbol_mask, "is_crypto"] = not is_stock + if is_stock: + final_merge.loc[symbol_mask, "stock_market"] = "NASDAQ" + + # Copy _alpaca columns into base columns for Alpaca-only rows + feature_cols = [ + "open", "high", "low", "close", "volume", "trade_count", "vwap", + "symbol_quote", "bid_price", "bid_size", "bid_exchange", "ask_price", "ask_size", "ask_exchange", + "conditions", "tape", "symbol_trade", "exchange", "price", "size", "id", "conditions_trade", "tape_trade" + ] + for col in feature_cols: + alpaca_col = f"{col}_alpaca" + if alpaca_col in final_merge.columns and col in final_merge.columns: + final_merge.loc[alpaca_only_mask, col] = 
final_merge.loc[alpaca_only_mask, alpaca_col] + + # Step 11: Calculate merge statistics + total_merged_rows = len(merged_df_normalized) + total_alpaca_rows = len(alpaca_df_normalized) + total_final_rows = len(final_merge) + + # Count matches from original merged data + original_matched_rows = final_merge[ + final_merge["timestamp"].notna() & + final_merge["interval_timestamp"].notna() & + (final_merge["interval_timestamp"] != final_merge["timestamp"]) + ].shape[0] + + # Count new rows from Alpaca-only symbols + alpaca_only_rows = final_merge[ + final_merge["timestamp"].notna() & + (final_merge["interval_timestamp"] == final_merge["timestamp"]) + ].shape[0] + + # Total rows with Alpaca data + total_alpaca_matched = final_merge[final_merge["timestamp"].notna()].shape[0] + + original_match_rate = original_matched_rows / total_merged_rows if total_merged_rows > 0 else 0 + overall_match_rate = total_alpaca_matched / total_final_rows if total_final_rows > 0 else 0 + + + # Step 12: Debug successful matches and new symbols + if total_alpaca_matched > 0: + successful_matches = final_merge[final_merge["timestamp"].notna()] + sample_cols = ["symbol", "interval_timestamp", "timestamp", "open", "high", "low", "close", "volume"] + available_cols = [col for col in sample_cols if col in successful_matches.columns] + + # Step 13: Add merge metadata + final_merge["alpaca_merge_timestamp"] = pd.Timestamp.now().value // 1000000 # Convert to milliseconds + final_merge["alpaca_data_available"] = final_merge["timestamp"].notna() + final_merge["alpaca_match_rate"] = overall_match_rate + final_merge["is_new_symbol"] = final_merge["interval_timestamp"] == final_merge["timestamp"] + + # Step 14: Handle duplicate columns before saving + duplicate_cols = final_merge.columns[final_merge.columns.duplicated()].tolist() + if duplicate_cols: + final_merge = final_merge.loc[:, ~final_merge.columns.duplicated()] + + # Save the merged features + out_path = os.path.join(base_dir, "merged", "features", "merged_features.parquet") + + final_merge.to_parquet(out_path, index=False) + + # Generate detailed summary report + print(f"Total final rows: {len(final_merge)}") + print(f"Rows with Alpaca data: {total_alpaca_matched}") + print(f"New symbols added: {alpaca_only_rows}") + print(f"Overall match rate: {overall_match_rate:.2%}") + print(f"Total columns: {len(final_merge.columns)}") + + # Show symbols with and without Alpaca data + symbol_summary = final_merge.groupby("symbol").agg({ + "alpaca_data_available": ["count", "sum"], + "is_new_symbol": "sum" + }).round(2) + + symbol_summary.columns = ["total_rows", "alpaca_matches", "new_symbol_rows"] + symbol_summary["match_rate"] = symbol_summary["alpaca_matches"] / symbol_summary["total_rows"] + symbol_summary["is_new_symbol"] = symbol_summary["new_symbol_rows"] > 0 + + # Show which symbols have complete data + complete_symbols = symbol_summary[symbol_summary["match_rate"] > 0.5] + if len(complete_symbols) > 0: + print(complete_symbols[["total_rows", "alpaca_matches", "match_rate"]]) + + # Show sample of final merged data + sample_cols = ["symbol", "interval_timestamp", "alpaca_data_available", "is_new_symbol", "open", "high", "low", "close", "volume"] + + return final_merge + +if __name__ == "__main__": + try: + merged_df = merge_alpaca_features() + except Exception as e: + import traceback + traceback.print_exc() \ No newline at end of file diff --git a/src/merge/merge_4.py b/src/merge/merge_4.py new file mode 100644 index 
0000000000000000000000000000000000000000..6b032be4d033801ac1e363aeab7fa3db2a1a4a6f --- /dev/null +++ b/src/merge/merge_4.py @@ -0,0 +1,237 @@ +import json +import pandas as pd +from datetime import datetime +import numpy as np +import os + +def parse_news_data(file_path): + """Parse the news data file containing multiple JSON objects per line""" + news_data = [] + + with open(file_path, 'r') as f: + content = f.read() + + # Split by newlines and parse each JSON object + lines = content.strip().split('\n') + + for line in lines: + if line.strip(): + try: + news_item = json.loads(line) + news_data.append(news_item) + except json.JSONDecodeError as e: + print(f"Error parsing line: {line[:100]}...") + print(f"Error: {e}") + continue + + return news_data + +def extract_sentiment_features(news_data): + """Extract sentiment features from news data for each symbol""" + sentiment_features = {} + + for article in news_data: + # Get article-level info + published_at = article.get('published_at') + title = article.get('title', '') + description = article.get('description', '') + + # Process entities (stocks mentioned in the article) + entities = article.get('entities', []) + + for entity in entities: + if entity.get('type') == 'equity': + symbol = entity.get('symbol', '').lower() # Convert to lowercase + + if symbol: + if symbol not in sentiment_features: + sentiment_features[symbol] = { + 'news_sentiment_scores': [], + 'news_match_scores': [], + 'news_mentions_count': 0, + 'news_articles_count': 0, + 'latest_news_timestamp': None, + 'news_highlights_count': 0 + } + + # Add sentiment and match scores + sentiment_score = entity.get('sentiment_score') + match_score = entity.get('match_score') + + if sentiment_score is not None: + sentiment_features[symbol]['news_sentiment_scores'].append(sentiment_score) + + if match_score is not None: + sentiment_features[symbol]['news_match_scores'].append(match_score) + + # Count highlights + highlights = entity.get('highlights', []) + sentiment_features[symbol]['news_highlights_count'] += len(highlights) + + # Update latest timestamp + if published_at: + if (sentiment_features[symbol]['latest_news_timestamp'] is None or + published_at > sentiment_features[symbol]['latest_news_timestamp']): + sentiment_features[symbol]['latest_news_timestamp'] = published_at + + sentiment_features[symbol]['news_mentions_count'] += 1 + + # Count unique articles per symbol + mentioned_symbols = set(entity.get('symbol', '').lower() for entity in entities + if entity.get('type') == 'equity' and entity.get('symbol')) + + for symbol in mentioned_symbols: + if symbol in sentiment_features: + sentiment_features[symbol]['news_articles_count'] += 1 + + return sentiment_features + +def aggregate_sentiment_features(sentiment_data): + """Aggregate sentiment features into final metrics""" + aggregated = {} + + for symbol, data in sentiment_data.items(): + # Calculate aggregated metrics + sentiment_scores = data['news_sentiment_scores'] + match_scores = data['news_match_scores'] + + features = { + 'news_sentiment_mean': np.mean(sentiment_scores) if sentiment_scores else None, + 'news_sentiment_std': np.std(sentiment_scores) if len(sentiment_scores) > 1 else None, + 'news_sentiment_min': np.min(sentiment_scores) if sentiment_scores else None, + 'news_sentiment_max': np.max(sentiment_scores) if sentiment_scores else None, + 'news_match_score_mean': np.mean(match_scores) if match_scores else None, + 'news_match_score_max': np.max(match_scores) if match_scores else None, + 'news_mentions_count': 
data['news_mentions_count'], + 'news_articles_count': data['news_articles_count'], + 'news_highlights_count': data['news_highlights_count'], + 'latest_news_timestamp': data['latest_news_timestamp'], + 'news_sentiment_range': (np.max(sentiment_scores) - np.min(sentiment_scores)) if len(sentiment_scores) > 0 else None, + 'news_activity_score': data['news_mentions_count'] * np.mean(match_scores) if match_scores else 0 + } + + aggregated[symbol] = features + + return aggregated + +def merge_with_existing_features(news_features, existing_features_file): + """Merge news features with existing market data features""" + + # Load existing features + if existing_features_file.endswith('.parquet'): + df_existing = pd.read_parquet(existing_features_file) + else: + df_existing = pd.read_csv(existing_features_file) + + print(f"Loaded existing features: {df_existing.shape}") + print(f"News features available for {len(news_features)} symbols") + + # Add news features as new columns + news_columns = [ + 'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', + 'news_sentiment_max', 'news_match_score_mean', 'news_match_score_max', + 'news_mentions_count', 'news_articles_count', 'news_highlights_count', + 'latest_news_timestamp', 'news_sentiment_range', 'news_activity_score' + ] + + # Initialize all news columns with NaN + for col in news_columns: + df_existing[col] = np.nan + + # Fill in news features where available + symbols_matched = 0 + for idx, row in df_existing.iterrows(): + symbol = row['symbol'] + if symbol in news_features: + for col in news_columns: + # The keys in news_features already have the correct names + df_existing.loc[idx, col] = news_features[symbol].get(col, None) + symbols_matched += 1 + + print(f"Matched news features for {symbols_matched} symbols out of {len(df_existing)} total records") + + return df_existing + +def main(): + # Configuration + # Use Marketaux parquet file for news data + news_file = os.path.join('data', 'marketaux', 'news', 'news_latest.parquet') + existing_features_file = os.path.join('data', 'merged', 'features', 'merged_features.parquet') + output_file = os.path.join('data', 'merged', 'features', 'merged_features.parquet') + + # Check if news file exists + if not os.path.exists(news_file): + print(f"WARNING: News file not found: {news_file}") + print("This usually happens when MarketAux API keys are exhausted.") + print("Skipping news sentiment merge and keeping existing features unchanged.") + + # Just copy existing features if they exist + if os.path.exists(existing_features_file): + import shutil + shutil.copy2(existing_features_file, output_file) + print(f"Copied existing features to output: {output_file}") + else: + print(f"WARNING: No existing features file found at {existing_features_file}") + return + + print("Step 1: Loading news data from parquet...") + try: + news_df = pd.read_parquet(news_file) + news_data = news_df.to_dict(orient='records') + print(f"Loaded {len(news_data)} news articles from {news_file}") + except Exception as e: + print(f"ERROR: Failed to load news data: {e}") + print("Skipping news sentiment merge.") + + # Copy existing features as fallback + if os.path.exists(existing_features_file): + import shutil + shutil.copy2(existing_features_file, output_file) + print(f"Copied existing features to output: {output_file}") + return + + print("Step 2: Extracting sentiment features...") + sentiment_data = extract_sentiment_features(news_data) + print(f"Extracted sentiment data for {len(sentiment_data)} symbols") + + print("Step 3: 
Aggregating sentiment metrics...") + news_features = aggregate_sentiment_features(sentiment_data) + + # Display sample of extracted features + print("\nSample of extracted news features:") + for symbol, features in list(news_features.items())[:3]: + print(f"\n{symbol.upper()}:") + for key, value in features.items(): + if value is not None: + if isinstance(value, float): + print(f" {key}: {value:.4f}") + else: + print(f" {key}: {value}") + + print(f"\nStep 4: Merging with existing features...") + try: + merged_df = merge_with_existing_features(news_features, existing_features_file) + + # Remove 'links.pulsex' column if present + if 'links.pulsex' in merged_df.columns: + merged_df = merged_df.drop(columns=['links.pulsex']) + + print(f"Step 5: Saving merged features...") + merged_df.to_parquet(output_file, index=False) + print(f"Saved merged features to {output_file}") + print(f"Final dataset shape: {merged_df.shape}") + + # Show summary of news feature coverage + news_cols = [col for col in merged_df.columns if col.startswith('news_')] + print(f"\nNews feature coverage:") + for col in news_cols: + non_null_count = merged_df[col].notna().sum() + coverage = non_null_count / len(merged_df) * 100 + print(f" {col}: {non_null_count}/{len(merged_df)} ({coverage:.1f}%)") + + except Exception as e: + print(f"Error during merging: {e}") + print("Make sure your merged_features.parquet file exists and is accessible") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/merge_5.py b/src/merge/merge_5.py new file mode 100644 index 0000000000000000000000000000000000000000..0af85ea47c09326d3a14c4b7d6d9bd8a514d2412 --- /dev/null +++ b/src/merge/merge_5.py @@ -0,0 +1,376 @@ +import pandas as pd +import numpy as np +from datetime import datetime, timedelta +import json +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def normalize_stock_data(df_stocks): + """ + Normalize stock data to ensure consistent format for merging. + """ + logger.info("=== NORMALIZING STOCK DATA ===") + df_stocks = df_stocks.copy() + + # Normalize symbol to uppercase and strip whitespace + df_stocks['symbol'] = df_stocks['symbol'].astype(str).str.upper().str.strip() + + # Ensure interval_timestamp is int64 (Unix timestamp in milliseconds) + if 'interval_timestamp' in df_stocks.columns: + # If it's already numeric, ensure it's int64 + df_stocks['interval_timestamp'] = pd.to_numeric(df_stocks['interval_timestamp'], errors='coerce').astype('int64') + logger.info(f"Stock timestamp range: {df_stocks['interval_timestamp'].min()} to {df_stocks['interval_timestamp'].max()}") + logger.info(f"Stock timestamp sample: {df_stocks['interval_timestamp'].head().tolist()}") + + logger.info(f"Stock symbols sample: {df_stocks['symbol'].unique()[:10].tolist()}") + logger.info(f"Stock data shape: {df_stocks.shape}") + + return df_stocks + +def normalize_news_data(df_news): + """ + Normalize news data to ensure consistent format for merging. 
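+    Each article's entities list is exploded into one record per equity entity;
+    published_at is floored to a 30-minute interval and stored as epoch
+    milliseconds in interval_timestamp so the records line up with the stock
+    feature grid.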
+ """ + logger.info("=== NORMALIZING NEWS DATA ===") + df_news = df_news.copy() + + # Extract entities and create individual records + news_records = [] + + for idx, row in df_news.iterrows(): + entities = row.get('entities', []) + + # Only proceed if entities is a non-empty list or ndarray + if not isinstance(entities, (list, np.ndarray)) or len(entities) == 0: + continue + + # Convert published_at to timestamp + try: + if isinstance(row['published_at'], str): + published_dt = pd.to_datetime(row['published_at']) + else: + published_dt = row['published_at'] + except: + logger.warning(f"Could not parse published_at for row {idx}") + continue + + # Process each entity + for entity in entities: + if not isinstance(entity, dict): + continue + + # Only process equity type entities with symbols + if entity.get('type') == 'equity' and 'symbol' in entity: + symbol = str(entity['symbol']).upper().strip() + + # Create 30-minute intervals (matching your stock data) + interval_dt = published_dt.floor('30min') + # Convert to Unix timestamp in milliseconds + interval_timestamp = int(interval_dt.timestamp() * 1000) + + news_records.append({ + 'symbol': symbol, + 'interval_timestamp': interval_timestamp, + 'published_at': published_dt, + 'sentiment_score': entity.get('sentiment_score', 0), + 'match_score': entity.get('match_score', 0), + 'highlights_count': len(entity.get('highlights', [])), + 'news_uuid': row.get('uuid', ''), + 'news_title': row.get('title', ''), + 'news_source': row.get('source', ''), + 'relevance_score': row.get('relevance_score', 0) + }) + + if not news_records: + logger.warning("No valid news records found") + return pd.DataFrame() + + df_news_normalized = pd.DataFrame(news_records) + logger.info(f"Normalized news data shape: {df_news_normalized.shape}") + # Print columns that are completely null and those that aren't + null_columns = [col for col in df_news_normalized.columns if df_news_normalized[col].isnull().all()] + not_null_columns = [col for col in df_news_normalized.columns if not df_news_normalized[col].isnull().all()] + print(f"Completely null columns: {null_columns}") + print(f"Non-null columns: {not_null_columns}") + logger.info(f"News symbols sample: {df_news_normalized['symbol'].unique()[:10].tolist()}") + logger.info(f"News timestamp range: {df_news_normalized['interval_timestamp'].min()} to {df_news_normalized['interval_timestamp'].max()}") + logger.info(f"News timestamp sample: {df_news_normalized['interval_timestamp'].head().tolist()}") + + return df_news_normalized + +def find_nearest_timestamp_matches(df_stocks, df_news, time_tolerance_minutes=30): + """ + Find the nearest timestamp matches within a tolerance window. + This handles cases where timestamps don't align exactly. 
+ """ + logger.info(f"=== FINDING NEAREST TIMESTAMP MATCHES (tolerance: {time_tolerance_minutes} min) ===") + + if df_news.empty: + return df_stocks.assign(**{col: 0 for col in [ + 'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', 'news_sentiment_max', + 'news_match_score_mean', 'news_match_score_max', 'news_highlights_count', + 'news_articles_count', 'latest_news_timestamp', 'news_sentiment_range', + 'news_activity_score', 'news_mentions_count' + ]}) + + # Convert tolerance to milliseconds + tolerance_ms = time_tolerance_minutes * 60 * 1000 + + # Get unique combinations for efficient processing + stock_keys = df_stocks[['symbol', 'interval_timestamp']].drop_duplicates() + + matched_records = [] + + for _, stock_row in stock_keys.iterrows(): + symbol = stock_row['symbol'] + stock_timestamp = stock_row['interval_timestamp'] + + # Find news for this symbol + symbol_news = df_news[df_news['symbol'] == symbol].copy() + + if symbol_news.empty: + continue + + # Calculate time differences + symbol_news['time_diff'] = abs(symbol_news['interval_timestamp'] - stock_timestamp) + + # Filter within tolerance + nearby_news = symbol_news[symbol_news['time_diff'] <= tolerance_ms] + + if nearby_news.empty: + continue + + # Aggregate the nearby news + agg_data = { + 'symbol': symbol, + 'interval_timestamp': stock_timestamp, + 'news_sentiment_mean': nearby_news['sentiment_score'].mean(), + 'news_sentiment_std': nearby_news['sentiment_score'].std(), + 'news_sentiment_min': nearby_news['sentiment_score'].min(), + 'news_sentiment_max': nearby_news['sentiment_score'].max(), + 'news_match_score_mean': nearby_news['match_score'].mean(), + 'news_match_score_max': nearby_news['match_score'].max(), + 'news_highlights_count': nearby_news['highlights_count'].sum(), + 'news_articles_count': len(nearby_news), + 'latest_news_timestamp': nearby_news['published_at'].max(), + 'news_mentions_count': len(nearby_news) + } + + # Calculate additional features + agg_data['news_sentiment_range'] = agg_data['news_sentiment_max'] - agg_data['news_sentiment_min'] + agg_data['news_activity_score'] = agg_data['news_match_score_mean'] + agg_data['news_match_score_max'] + + # Fill NaN values + for key, value in agg_data.items(): + if pd.isna(value) and key not in ['symbol', 'interval_timestamp', 'latest_news_timestamp']: + agg_data[key] = 0 + + matched_records.append(agg_data) + + if matched_records: + df_matched_news = pd.DataFrame(matched_records) + logger.info(f"Found {len(df_matched_news)} symbol-timestamp matches") + + # Merge with stock data + df_result = df_stocks.merge( + df_matched_news, + on=['symbol', 'interval_timestamp'], + how='left' + ) + else: + logger.warning("No timestamp matches found within tolerance") + df_result = df_stocks.copy() + + # Fill remaining NaN values for stocks without news + news_columns = [ + 'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', 'news_sentiment_max', + 'news_match_score_mean', 'news_match_score_max', 'news_highlights_count', + 'news_articles_count', 'news_sentiment_range', 'news_activity_score', 'news_mentions_count' + ] + + for col in news_columns: + if col in df_result.columns: + df_result[col] = df_result[col].fillna(0) + + # Report results + if 'news_articles_count' in df_result.columns: + stocks_with_news = len(df_result[df_result['news_articles_count'] > 0]) + total_news_articles = df_result['news_articles_count'].sum() + logger.info(f"Successfully matched news for {stocks_with_news} stock records out of {len(df_result)}") + logger.info(f"Total news 
articles matched: {total_news_articles}") + + return df_result + +def diagnose_data_alignment(df_stocks, df_news): + """ + Diagnose alignment issues between stock and news data. + """ + logger.info("=== DATA ALIGNMENT DIAGNOSIS ===") + + # Check symbol overlap + stock_symbols = set(df_stocks['symbol'].unique()) if 'symbol' in df_stocks.columns else set() + news_symbols = set(df_news['symbol'].unique()) if len(df_news) > 0 and 'symbol' in df_news.columns else set() + + common_symbols = stock_symbols.intersection(news_symbols) + + logger.info(f"Stock symbols: {len(stock_symbols)} unique") + logger.info(f"News symbols: {len(news_symbols)} unique") + logger.info(f"Common symbols: {len(common_symbols)}") + logger.info(f"Common symbols sample: {list(common_symbols)[:10]}") + + # Check timestamp ranges + if 'interval_timestamp' in df_stocks.columns: + stock_ts_min = df_stocks['interval_timestamp'].min() + stock_ts_max = df_stocks['interval_timestamp'].max() + stock_ts_range = pd.to_datetime([stock_ts_min, stock_ts_max], unit='ms') + logger.info(f"Stock timestamp range: {stock_ts_range[0]} to {stock_ts_range[1]}") + + if len(df_news) > 0 and 'interval_timestamp' in df_news.columns: + news_ts_min = df_news['interval_timestamp'].min() + news_ts_max = df_news['interval_timestamp'].max() + news_ts_range = pd.to_datetime([news_ts_min, news_ts_max], unit='ms') + logger.info(f"News timestamp range: {news_ts_range[0]} to {news_ts_range[1]}") + + # Check for timestamp overlap + if 'interval_timestamp' in df_stocks.columns: + overlap_start = max(stock_ts_min, news_ts_min) + overlap_end = min(stock_ts_max, news_ts_max) + if overlap_start <= overlap_end: + overlap_range = pd.to_datetime([overlap_start, overlap_end], unit='ms') + logger.info(f"Timestamp overlap: {overlap_range[0]} to {overlap_range[1]}") + else: + logger.warning("No timestamp overlap between stock and news data") + +def parse_json_news_file(news_file_path): + """ + Parse news file that contains JSON records (one per line or structured). + """ + logger.info(f"Parsing news file: {news_file_path}") + + try: + # Try reading as parquet first + df_news = pd.read_parquet(news_file_path) + logger.info(f"Successfully read parquet file with shape: {df_news.shape}") + + # Check if the data contains JSON strings that need parsing + if len(df_news.columns) == 1 and df_news.iloc[0, 0] and isinstance(df_news.iloc[0, 0], str): + logger.info("Detected JSON strings in single column, parsing...") + json_records = [] + for idx, row in df_news.iterrows(): + try: + json_data = json.loads(row.iloc[0]) + json_records.append(json_data) + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse JSON at row {idx}: {e}") + continue + + if json_records: + df_news = pd.DataFrame(json_records) + logger.info(f"Parsed {len(json_records)} JSON records") + + return df_news + + except Exception as e: + logger.error(f"Error reading news file: {e}") + return pd.DataFrame() + +def main(stocks_file_path, news_file_path, output_file_path, time_tolerance_minutes=30): + """ + Main function to normalize and merge stock and news data. 
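+    Illustrative call (arguments mirror the __main__ block at the end of this
+    module; adjust the paths and tolerance to your layout):
+
+        df = main(
+            stocks_file_path="data/merged/features/stocks_features.parquet",
+            news_file_path="data/marketaux/news/news_latest.parquet",
+            output_file_path="data/merged/features/stocks_features.parquet",
+            time_tolerance_minutes=60*24,
+        )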
+ """ + try: + logger.info("=== STARTING DATA NORMALIZATION AND MERGE ===") + + # Step 1: Load stock data + logger.info("Step 1: Loading stock data...") + df_stocks = pd.read_parquet(stocks_file_path) + logger.info(f"Loaded stock data with shape: {df_stocks.shape}") + + # Step 2: Load and parse news data + logger.info("Step 2: Loading news data...") + df_news_raw = parse_json_news_file(news_file_path) + + if df_news_raw.empty: + logger.warning("No news data found, creating stock data with empty news columns") + df_stocks = normalize_stock_data(df_stocks) + # Add empty news columns + for col in ['news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', + 'news_sentiment_max', 'news_match_score_mean', 'news_match_score_max', + 'news_highlights_count', 'news_articles_count', 'latest_news_timestamp', + 'news_sentiment_range', 'news_activity_score', 'news_mentions_count']: + df_stocks[col] = 0 if col != 'latest_news_timestamp' else None + df_stocks.to_parquet(output_file_path, index=False) + logger.info("Saved stock data with empty news columns") + return df_stocks + + # Step 3: Normalize both datasets + logger.info("Step 3: Normalizing stock data...") + df_stocks_norm = normalize_stock_data(df_stocks) + + logger.info("Step 4: Normalizing news data...") + df_news_norm = normalize_news_data(df_news_raw) + + # Step 5: Diagnose alignment + logger.info("Step 5: Diagnosing data alignment...") + diagnose_data_alignment(df_stocks_norm, df_news_norm) + + # Step 6: Find nearest timestamp matches and merge + logger.info("Step 6: Finding nearest timestamp matches and merging...") + df_merged = find_nearest_timestamp_matches( + df_stocks_norm, + df_news_norm, + time_tolerance_minutes=time_tolerance_minutes + ) + + # Step 7: Save results + logger.info("Step 7: Saving merged data...") + df_merged.to_parquet(output_file_path, index=False) + logger.info(f"Saved merged data to {output_file_path}") + + # Final report + logger.info("=== MERGE COMPLETED ===") + logger.info(f"Final dataset shape: {df_merged.shape}") + + news_cols = [col for col in df_merged.columns if col.startswith('news_')] + logger.info(f"News columns added: {len(news_cols)}") + + if 'news_articles_count' in df_merged.columns: + total_articles = df_merged['news_articles_count'].sum() + records_with_news = len(df_merged[df_merged['news_articles_count'] > 0]) + logger.info(f"Total news articles merged: {total_articles}") + logger.info(f"Stock records with news: {records_with_news} / {len(df_merged)}") + + return df_merged + + except Exception as e: + logger.error(f"Error in main process: {e}") + import traceback + logger.error(traceback.format_exc()) + raise + +# Example usage +if __name__ == "__main__": + import os + + # Update these paths to match your actual file locations + base_dir = "data/" # Update this + stocks_file = os.path.join(base_dir, "merged/features/stocks_features.parquet") + news_file = os.path.join(base_dir, "marketaux/news/news_latest.parquet") + output_file = os.path.join(base_dir, "merged/features/stocks_features.parquet") + + # Check if stocks_features.parquet exists before running + if not os.path.exists(stocks_file): + logger.error(f"Input file missing: {stocks_file}") + print(f"ERROR: Input file missing: {stocks_file}") + exit(1) + + # Run the merge with 30-minute tolerance (adjust as needed) + df_result = main( + stocks_file_path=stocks_file, + news_file_path=news_file, + output_file_path=output_file, + time_tolerance_minutes=60*24 # Adjust this based on your needs + ) \ No newline at end of file diff --git 
a/src/merge/merge_6.py b/src/merge/merge_6.py new file mode 100644 index 0000000000000000000000000000000000000000..15967cad40a4088c5d4217d2b0cecc1661102aca --- /dev/null +++ b/src/merge/merge_6.py @@ -0,0 +1,612 @@ +import os +import pandas as pd +import numpy as np +from pathlib import Path +# import #logging +from datetime import datetime + +# Resolve DATA_DIR from config (container-safe) with fallback +try: + from src.config import DATA_DIR as CFG_DATA_DIR # when run as module +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR # when run as script from src/ + except Exception: + CFG_DATA_DIR = "/data" + +class FixedTimestampHandler: + def __init__(self, base_path: str | os.PathLike | None = None): + # Prefer explicit argument, then DATA_DIR env, then config fallback + resolved_base = base_path or os.getenv("DATA_DIR") or CFG_DATA_DIR + self.base_path = Path(resolved_base) + self.finviz_path = self.base_path / "finviz" / "sentiment" + self.crypto_features_path = self.base_path / "merged" / "features" / "crypto_features.parquet" + self.stocks_features_path = self.base_path / "merged" / "features" / "stocks_features.parquet" + self.output_path = self.base_path / "merged" / "features" + self.output_path.mkdir(parents=True, exist_ok=True) + # Configure #logging + #logging.basicConfig(level=#logging.INFO, + # format='%(asctime)s - %(levelname)s - %(message)s') + # Define tickers and mappings + self.stock_tickers = ["AAPL", "TSLA", "GOOGL", "NVDA", "MSFT", "COIN"] + self.crypto_ticker_mapping = { + "BTC": "bitcoin", + "ETH": "ethereum", + "SOL": "solana", + "XRP": "ripple", + "ADA": "cardano" + } + # Reverse mapping: crypto name to ticker (all lowercase keys) + self.crypto_name_to_ticker = {v.lower(): k for k, v in self.crypto_ticker_mapping.items()} + + def crypto_name_to_symbol(self, name): + """Transform crypto name (e.g., 'bitcoin', 'Bitcoin', 'BITCOIN') to ticker symbol (e.g., 'BTC')""" + if not isinstance(name, str): + return None + name_lower = name.strip().lower() + # Try exact match + if name_lower in self.crypto_name_to_ticker: + return self.crypto_name_to_ticker[name_lower] + # Try to match ignoring spaces and underscores + for key in self.crypto_name_to_ticker: + if name_lower.replace(' ', '').replace('_', '') == key.replace(' ', '').replace('_', ''): + return self.crypto_name_to_ticker[key] + return None + + def is_timestamp_column(self, df, col_name): + """Determine if a column is likely a timestamp column""" + if pd.api.types.is_datetime64_any_dtype(df[col_name]): + return True + if pd.api.types.is_numeric_dtype(df[col_name]): + sample_vals = df[col_name].dropna() + if len(sample_vals) == 0: + return False + sample_val = sample_vals.iloc[0] + current_time = pd.Timestamp.now().timestamp() + units = [ + ('s', 1), + ('ms', 1000), + ('us', 1000000), + ('ns', 1000000000) + ] + for unit, divisor in units: + try: + if unit == 's': + ts_value = sample_val + else: + ts_value = sample_val / divisor + if abs(ts_value - current_time) < (10 * 365 * 24 * 3600): + return True + except: + continue + if df[col_name].dtype == 'object': + sample_val = df[col_name].dropna().iloc[0] if not df[col_name].empty else None + if sample_val and isinstance(sample_val, str): + try: + pd.to_datetime(sample_val) + return True + except (ValueError, TypeError): + pass + return False + + def get_timestamp_columns(self, df): + """Identify all timestamp columns in a dataframe""" + timestamp_cols = [] + potential_names = ['time', 'date', 'interval', 'timestamp', 'dt'] + for col in df.columns: + 
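+            # Name-based prefilter: only columns whose names hint at a time
+            # field are handed to is_timestamp_column(), which then checks the
+            # dtype, plausible epoch ranges (s/ms/us/ns), or string parsing.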
if any(keyword in col.lower() for keyword in potential_names): + if self.is_timestamp_column(df, col): + timestamp_cols.append(col) + return timestamp_cols + + def convert_timestamp_column(self, df, col_name, unit='auto'): + """Convert a timestamp column to datetime format with improved validation""" + if pd.api.types.is_datetime64_any_dtype(df[col_name]): + if df[col_name].dt.tz is not None: + df[col_name] = df[col_name].dt.tz_localize(None) + return df[col_name] + if pd.api.types.is_numeric_dtype(df[col_name]): + sample_vals = df[col_name].dropna() + if len(sample_vals) == 0: + print(f"[ERROR] No valid values in timestamp column {col_name}") + return None + + # Convert nullable Int64 to regular numeric if needed + if hasattr(sample_vals, 'dtype') and str(sample_vals.dtype).startswith('Int'): + sample_vals = sample_vals.astype('int64') + + if unit == 'auto': + current_time = pd.Timestamp.now().timestamp() + best_unit = None + best_distance = float('inf') + for test_unit in ['s', 'ms', 'us', 'ns']: + try: + # Additional safety check + if len(sample_vals) == 0: + continue + first_val = sample_vals.iloc[0] + if pd.isna(first_val): + continue + if test_unit == 's': + test_ts = pd.to_datetime(first_val, unit='s') + else: + divisor = {'ms': 1000, 'us': 1000000, 'ns': 1000000000}[test_unit] + test_ts = pd.to_datetime(first_val / divisor, unit='s') + distance = abs((pd.Timestamp.now() - test_ts).total_seconds()) + if distance < best_distance: + best_distance = distance + best_unit = test_unit + except Exception as e: + #logging.debug(f"Failed to test unit {test_unit} for column {col_name}: {e}") + continue + if best_unit is None: + #logging.error(f"Could not determine unit for column {col_name}") + return None + unit = best_unit + #logging.info(f"Auto-detected unit for {col_name}: {unit}") + try: + # Convert nullable Int64 to regular numeric if needed for the whole column + values_to_convert = df[col_name] + if hasattr(values_to_convert, 'dtype') and str(values_to_convert.dtype).startswith('Int'): + values_to_convert = values_to_convert.astype('int64') + + if unit == 's': + converted = pd.to_datetime(values_to_convert, unit='s') + else: + divisor = {'ms': 1000, 'us': 1000000, 'ns': 1000000000}[unit] + converted = pd.to_datetime(values_to_convert / divisor, unit='s') + if converted.dt.tz is not None: + converted = converted.dt.tz_localize(None) + if converted.min().year < 2000: + #logging.warning(f"Converted timestamps for {col_name} seem too old. 
Checking alternative units.") + for alt_unit in ['s', 'ms', 'us', 'ns']: + if alt_unit == unit: + continue + try: + if alt_unit == 's': + alt_converted = pd.to_datetime(df[col_name], unit='s') + else: + alt_divisor = {'ms': 1000, 'us': 1000000, 'ns': 1000000000}[alt_unit] + alt_converted = pd.to_datetime(df[col_name] / alt_divisor, unit='s') + if alt_converted.min().year > 2000: + #logging.info(f"Alternative unit {alt_unit} gives better results for {col_name}") + converted = alt_converted + break + except Exception as e: + #logging.debug(f"Failed to try alternative unit {alt_unit} for column {col_name}: {e}") + continue + #logging.info(f"Successfully converted {col_name} using unit '{unit}'") + #logging.info(f"Date range: {converted.min()} to {converted.max()}") + return converted + except Exception as e: + #logging.error(f"Failed to convert {col_name} using unit '{unit}': {e}") + return None + elif df[col_name].dtype == 'object': + try: + converted = pd.to_datetime(df[col_name]) + if converted.dt.tz is not None: + converted = converted.dt.tz_localize(None) + #logging.info(f"Successfully converted string timestamps in {col_name}") + return converted + except Exception as e: + #logging.error(f"Failed to convert string timestamps in {col_name}: {e}") + return None + else: + #logging.error(f"Unknown timestamp format in column {col_name}") + return None + + def select_best_timestamp_column(self, df, timestamp_columns): + """Select the best timestamp column from a list of potential columns""" + best_col = None + best_score = -1 + for col in timestamp_columns: + try: + if col not in df.columns: + print(f"[WARN] Column {col} not found in dataframe") + continue + if df[col].isnull().all(): + print(f"[WARN] Column {col} contains only null values") + continue + converted = self.convert_timestamp_column(df, col) + if converted is None: + print(f"[WARN] Could not convert column {col} to timestamp") + continue + non_null_count = converted.notna().sum() + recent_count = converted[converted > pd.Timestamp('2020-01-01')].count() + score = non_null_count + recent_count * 2 + print(f"[DEBUG] Column {col}: score={score}, non_null={non_null_count}, recent={recent_count}") + if score > best_score: + best_score = score + best_col = col + except Exception as e: + print(f"[WARN] Error evaluating timestamp column {col}: {e}") + continue + print(f"[INFO] Best timestamp column: {best_col} (score: {best_score})") + return best_col + + def load_sentiment_data(self, symbol): + """Load sentiment data with proper timestamp handling""" + sentiment_file = self.finviz_path / f"{symbol.upper()}_sentiment.parquet" + if not sentiment_file.exists(): + print(f"[WARN] Sentiment file not found: {sentiment_file}") + return None + try: + df = pd.read_parquet(sentiment_file) + print(f"[INFO] Loaded sentiment data for {symbol}: {len(df)} rows") + timestamp_cols = self.get_timestamp_columns(df) + if not timestamp_cols: + print(f"[ERROR] No timestamp columns found in {symbol} sentiment data") + return None + timestamp_col = timestamp_cols[0] + converted = self.convert_timestamp_column(df, timestamp_col) + if converted is None: + print(f"[ERROR] Could not convert timestamp column {timestamp_col} in {symbol}") + return None + df['sentiment_timestamp'] = converted + df['symbol'] = symbol.upper() + return df + except Exception as e: + print(f"[ERROR] Error loading sentiment data for {symbol}: {e}") + return None + + def load_features_data(self, data_type='stocks'): + """Load features data with improved timestamp handling""" + file_path = 
self.stocks_features_path if data_type == 'stocks' else self.crypto_features_path + if not file_path.exists(): + print(f"[ERROR] Features file not found: {file_path}") + return None + try: + df = pd.read_parquet(file_path) + print(f"[INFO] Loaded {data_type} features: {len(df)} rows") + potential_timestamp_cols = [col for col in df.columns if any(keyword in col.lower() for keyword in ['time', 'date', 'interval', 'timestamp', 'dt'])] + print(f"[INFO] Potential timestamp columns: {potential_timestamp_cols}") + + # Safer timestamp detection + timestamp_cols = [] + for col in potential_timestamp_cols: + try: + is_ts = self.is_timestamp_column(df, col) + if is_ts: + timestamp_cols.append(col) + print(f"[DEBUG] {col} confirmed as timestamp column") + else: + print(f"[DEBUG] {col} rejected as timestamp column") + except Exception as e: + print(f"[WARN] Error checking {col}: {e}") + continue + + print(f"[INFO] Confirmed timestamp columns: {timestamp_cols}") + if not timestamp_cols: + print(f"[ERROR] No valid timestamp columns found in {data_type} features") + return None + best_col = self.select_best_timestamp_column(df, timestamp_cols) + if best_col is None: + print(f"[ERROR] Could not select a valid timestamp column from {timestamp_cols}") + return None + converted = self.convert_timestamp_column(df, best_col) + if converted is None: + print(f"[ERROR] Failed to convert selected timestamp column {best_col}") + return None + df['feature_timestamp'] = converted + print(f"[INFO] Selected timestamp column: {best_col}") + print(f"[INFO] Date range: {converted.min()} to {converted.max()}") + return df + except Exception as e: + import traceback + print(f"[ERROR] Error loading {data_type} features: {e}") + print(f"[ERROR] Traceback: {traceback.format_exc()}") + return None + + def merge_sentiment_to_features(self, features_df, sentiment_df, tolerance_minutes=60*12): + """Merge sentiment data INTO features data based on closest timestamp, with tolerance window""" + features_sorted = features_df.sort_values(by='feature_timestamp') + sentiment_sorted = sentiment_df.sort_values(by='sentiment_timestamp') + + # Use a tolerance window for timestamp matching + tolerance = pd.Timedelta(minutes=tolerance_minutes) + merged_df = pd.merge_asof( + features_sorted, + sentiment_sorted, + left_on='feature_timestamp', + right_on='sentiment_timestamp', + direction='nearest', + tolerance=tolerance + ) + + # If no sentiment match within tolerance, sentiment_score will be NaN + if 'sentiment_score' in merged_df.columns: + unmatched = merged_df['sentiment_score'].isna().sum() + print(f"[INFO] Rows with no sentiment match (NaN sentiment_score): {unmatched}") + + print(f"[INFO] Merged {len(features_df)} feature rows with {len(sentiment_df)} sentiment rows using tolerance {tolerance_minutes} min") + print(f"[INFO] Result: {len(merged_df)} rows") + return merged_df + + def process_stocks_data(self): + """Process all stocks data by merging finviz sentiment into stock features""" + print("[INFO] Processing stocks data...") + + # Load stocks features first (this is the base dataset) + stocks_df = self.load_features_data('stocks') + if stocks_df is None: + print("[ERROR] Failed to load stocks features data") + return None + + # Check what columns are available and what symbols are in the data + if 'symbol' in stocks_df.columns: + unique_symbols = stocks_df['symbol'].unique() + elif 'ticker' in stocks_df.columns: + unique_symbols = stocks_df['ticker'].unique() + + print(f"[INFO] Available symbols in stocks features: {unique_symbols}") 
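+        # Note: if the finviz sentiment directory (or every per-ticker
+        # sentiment file) is missing, the checks below save the stock features
+        # unchanged so downstream steps still find stocks_features.parquet.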
+ + # Check if any sentiment files exist + if not self.finviz_path.exists(): + print(f"[WARN] Finviz sentiment directory does not exist: {self.finviz_path}") + print(f"[WARN] Proceeding without sentiment data merge for stocks") + # Save features as-is without sentiment merge + output_file = self.output_path / "stocks_features.parquet" + stocks_df.to_parquet(output_file) + print(f"[INFO] Stocks features saved without sentiment to: {output_file}") + return stocks_df + + # Check if any sentiment files exist for our tickers + sentiment_files_exist = any( + (self.finviz_path / f"{ticker.upper()}_sentiment.parquet").exists() + for ticker in self.stock_tickers + ) + + if not sentiment_files_exist: + print(f"[WARN] No sentiment files found for any stock tickers: {self.stock_tickers}") + print(f"[WARN] Proceeding without sentiment data merge for stocks") + # Save features as-is without sentiment merge + output_file = self.output_path / "stocks_features.parquet" + stocks_df.to_parquet(output_file) + print(f"[INFO] Stocks features saved without sentiment to: {output_file}") + return stocks_df + + merged_stocks_list = [] + + for ticker in self.stock_tickers: + print(f"[INFO] Processing stock ticker: {ticker}") + + # Load sentiment data for this ticker + sentiment_df = self.load_sentiment_data(ticker) + if sentiment_df is None: + print(f"[WARN] No sentiment data for {ticker}, skipping...") + continue + + # Filter stocks features for this ticker + ticker_stocks = None + if 'symbol' in stocks_df.columns: + ticker_stocks = stocks_df[stocks_df['symbol'] == ticker].copy() + elif 'ticker' in stocks_df.columns: + ticker_stocks = stocks_df[stocks_df['ticker'] == ticker].copy() + + if ticker_stocks is None or len(ticker_stocks) == 0: + print(f"[WARN] No feature data found for ticker {ticker} - skipping this ticker") + continue + + print(f"[INFO] Found {len(ticker_stocks)} feature rows for {ticker}") + + # Merge sentiment INTO features + merged_ticker = self.merge_sentiment_to_features(ticker_stocks, sentiment_df) + + # Remove symbol_y and replace symbol_x with symbol + if 'symbol_y' in merged_ticker.columns: + merged_ticker = merged_ticker.drop(columns=['symbol_y']) + if 'symbol_x' in merged_ticker.columns: + merged_ticker = merged_ticker.rename(columns={'symbol_x': 'symbol'}) + + # Re-order columns: symbol first, interval_timestamp second (if present) + cols = list(merged_ticker.columns) + if 'symbol' in cols: + cols.remove('symbol') + new_order = ['symbol'] + if 'interval_timestamp' in cols: + cols.remove('interval_timestamp') + new_order.append('interval_timestamp') + new_order += cols + merged_ticker = merged_ticker[new_order] + merged_stocks_list.append(merged_ticker) + + if not merged_stocks_list: + print("[WARN] No stocks data was successfully merged with sentiment") + print("[WARN] Saving original stocks features without sentiment") + output_file = self.output_path / "stocks_features.parquet" + stocks_df.to_parquet(output_file) + print(f"[INFO] Stocks features saved without sentiment to: {output_file}") + return stocks_df + + # Combine all merged stock data + final_stocks_df = pd.concat(merged_stocks_list, ignore_index=True) + + # Save the result + output_file = self.output_path / "stocks_features.parquet" + final_stocks_df.to_parquet(output_file) + print(f"[INFO] Stocks data with sentiment saved to: {output_file}") + + return final_stocks_df + + def process_crypto_data(self): + """Process all crypto data by merging finviz sentiment into crypto features""" + print("[INFO] Processing crypto data...") + + 
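+        # Crypto rows are matched to FinViz sentiment per ticker in
+        # self.crypto_ticker_mapping; the filtering below tries an exact ticker
+        # match, then a full-name match, then a case-insensitive partial match
+        # before skipping a coin.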
# Load crypto features first (this is the base dataset) + crypto_df = self.load_features_data('crypto') + if crypto_df is None: + print("[ERROR] Failed to load crypto features data") + return None + + # Check for various possible symbol/ticker columns + symbol_columns = [col for col in crypto_df.columns if any(keyword in col.lower() + for keyword in ['symbol', 'ticker', 'name', 'id', 'coin'])] + + print(f"[INFO] Available symbol columns in crypto: {symbol_columns}") + + # Try to identify unique values in potential symbol columns + for col in symbol_columns: + if crypto_df[col].dtype == 'object': + unique_values = crypto_df[col].unique()[:10] # Show first 10 unique values + print(f"[INFO] Sample values in {col}: {unique_values}") + + # Check if any sentiment files exist + if not self.finviz_path.exists(): + print(f"[WARN] Finviz sentiment directory does not exist: {self.finviz_path}") + print(f"[WARN] Proceeding without sentiment data merge for crypto") + # Save features as-is without sentiment merge + output_file = self.output_path / "crypto_features.parquet" + crypto_df.to_parquet(output_file) + print(f"[INFO] Crypto features saved without sentiment to: {output_file}") + return crypto_df + + # Check if any sentiment files exist for our crypto tickers + sentiment_files_exist = any( + (self.finviz_path / f"{ticker.upper()}_sentiment.parquet").exists() + for ticker in self.crypto_ticker_mapping.keys() + ) + + if not sentiment_files_exist: + print(f"[WARN] No sentiment files found for any crypto tickers: {list(self.crypto_ticker_mapping.keys())}") + print(f"[WARN] Proceeding without sentiment data merge for crypto") + # Save features as-is without sentiment merge + output_file = self.output_path / "crypto_features.parquet" + crypto_df.to_parquet(output_file) + print(f"[INFO] Crypto features saved without sentiment to: {output_file}") + return crypto_df + + merged_crypto_list = [] + + for crypto_ticker, crypto_name in self.crypto_ticker_mapping.items(): + print(f"[INFO] Processing crypto ticker: {crypto_ticker} (name: {crypto_name})") + + # Load sentiment data for this crypto ticker + sentiment_df = self.load_sentiment_data(crypto_ticker) + if sentiment_df is None: + print(f"[WARN] No sentiment data for {crypto_ticker}, skipping...") + continue + + # Try different approaches to filter crypto features + ticker_crypto = None + + # Approach 1: Try exact ticker match + for col in ['symbol', 'ticker', 'coin_id', 'id']: + if col in crypto_df.columns: + matches = crypto_df[crypto_df[col].str.upper() == crypto_ticker].copy() + if len(matches) > 0: + ticker_crypto = matches + print(f"[INFO] Found {len(matches)} rows matching {crypto_ticker} in column '{col}'") + break + + # Approach 2: Try crypto name match + if ticker_crypto is None or len(ticker_crypto) == 0: + for col in ['name', 'coin_name']: + if col in crypto_df.columns: + matches = crypto_df[crypto_df[col].str.lower() == crypto_name.lower()].copy() + if len(matches) > 0: + ticker_crypto = matches + print(f"[INFO] Found {len(matches)} rows matching {crypto_name} in column '{col}'") + break + + # Approach 3: Try partial matching (in case of different formats) + if ticker_crypto is None or len(ticker_crypto) == 0: + for col in symbol_columns: + if crypto_df[col].dtype == 'object': + # Try case-insensitive contains match + matches = crypto_df[crypto_df[col].str.contains(crypto_ticker, case=False, na=False)].copy() + if len(matches) > 0: + ticker_crypto = matches + print(f"[INFO] Found {len(matches)} rows with partial match for {crypto_ticker} in 
column '{col}'") + break + + # Try crypto name partial match + matches = crypto_df[crypto_df[col].str.contains(crypto_name, case=False, na=False)].copy() + if len(matches) > 0: + ticker_crypto = matches + print(f"[INFO] Found {len(matches)} rows with partial match for {crypto_name} in column '{col}'") + break + + if ticker_crypto is None or len(ticker_crypto) == 0: + print(f"[WARN] No feature data found for crypto {crypto_ticker} ({crypto_name}) - skipping this crypto") + continue + + # Merge sentiment INTO features + merged_ticker = self.merge_sentiment_to_features(ticker_crypto, sentiment_df) + + # Remove symbol_x and replace symbol_y with symbol + if 'symbol_x' in merged_ticker.columns: + merged_ticker = merged_ticker.drop(columns=['symbol_x']) + if 'symbol_y' in merged_ticker.columns: + merged_ticker = merged_ticker.rename(columns={'symbol_y': 'symbol'}) + + # Remove duplicate 'symbol' columns if any + symbol_cols = [col for col in merged_ticker.columns if col == 'symbol'] + if len(symbol_cols) > 1: + # Keep only the first 'symbol' column + # This will drop all but the first occurrence + merged_ticker = merged_ticker.loc[:, ~merged_ticker.columns.duplicated()] + + # Re-order columns: symbol first, interval_timestamp second (if present) + cols = list(merged_ticker.columns) + if 'symbol' in cols: + cols.remove('symbol') + new_order = ['symbol'] + if 'interval_timestamp' in cols: + cols.remove('interval_timestamp') + new_order.append('interval_timestamp') + new_order += cols + merged_ticker = merged_ticker[new_order] + merged_crypto_list.append(merged_ticker) + + if not merged_crypto_list: + print("[WARN] No crypto data was successfully merged with sentiment") + print("[WARN] Saving original crypto features without sentiment") + output_file = self.output_path / "crypto_features.parquet" + crypto_df.to_parquet(output_file) + print(f"[INFO] Crypto features saved without sentiment to: {output_file}") + return crypto_df + + # Combine all merged crypto data + final_crypto_df = pd.concat(merged_crypto_list, ignore_index=True) + + # Save the result + output_file = self.output_path / "crypto_features.parquet" + final_crypto_df.to_parquet(output_file) + print(f"[INFO] Crypto data with sentiment saved to: {output_file}") + + return final_crypto_df + + def process_all_data(self): + """Process both stocks and crypto data""" + #logging.info("Starting data processing for all assets...") + + stocks_result = self.process_stocks_data() + crypto_result = self.process_crypto_data() + + if stocks_result is not None: + print(f"[OK] Stocks processing completed: {len(stocks_result)} rows") + else: + print("[ERROR] Stocks processing failed") + + if crypto_result is not None: + print(f"[OK] Crypto processing completed: {len(crypto_result)} rows") + else: + print("[ERROR] Crypto processing failed") + + return stocks_result, crypto_result + +# Example usage +if __name__ == "__main__": + handler = FixedTimestampHandler() + + # Test individual components + #logging.info("Testing sentiment data loading...") + sentiment_df = handler.load_sentiment_data("AAPL") + + stocks_df = handler.load_features_data('stocks') + + # Test merge process + # handler.test_merge() + + # Process all data + handler.process_all_data() \ No newline at end of file diff --git a/src/merge/merge_7.py b/src/merge/merge_7.py new file mode 100644 index 0000000000000000000000000000000000000000..0f41d0e8bc3a3cc9e67e40018217dc083d4bcff4 --- /dev/null +++ b/src/merge/merge_7.py @@ -0,0 +1,28 @@ +import importlib.util +import os + +def 
run_module(module_path, module_name): + spec = importlib.util.spec_from_file_location(module_name, module_path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + if hasattr(mod, 'main'): + mod.main() + else: + print(f"[WARN] {module_name} has no main() function.") + +def main(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + finhub_dir = os.path.join(this_dir, 'finhub') + modules = [ + ('company_info.py', 'company_info'), + ('sentiment.py', 'sentiment'), + ('ratings.py', 'ratings'), + ('quote.py', 'quote'), + ] + for fname, mname in modules: + print(f"[INFO] Merging {mname.replace('_', ' ')}...") + run_module(os.path.join(finhub_dir, fname), mname) + print("[INFO] All merges complete.") + +if __name__ == "__main__": + main() diff --git a/src/merge/merge_sant.py b/src/merge/merge_sant.py new file mode 100644 index 0000000000000000000000000000000000000000..0782551feb6c2decf19598793fe68ac9636ed16b --- /dev/null +++ b/src/merge/merge_sant.py @@ -0,0 +1,909 @@ +""" +Santiment Data Merger +===================== + +This script merges all Santiment data files into a unified features dataset. +It reads all parquet files from data/santiment/, merges them by slug and datetime +with 1-hour interval tolerance, and creates merged_features.parquet. + +Features: +- Reads all Santiment parquet files automatically +- Merges by slug and datetime with 1-hour tolerance +- Handles different data formats (financial, ohlcv, prices, etc.) +- Creates comprehensive feature dataset +- Robust error handling and logging + +Author: AI Assistant +Date: August 2025 +""" + +import os +import sys +import pandas as pd +import numpy as np +from pathlib import Path +from datetime import datetime, timedelta +import logging +import glob +from typing import List, Dict, Optional, Tuple +import warnings + +# Resolve data directory base +try: + from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR + except Exception: + CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: + p = Path(path_like) + if p.is_absolute(): + return p + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return Path(CFG_DATA_DIR) / rel + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +class SantimentDataMerger: + """ + Comprehensive Santiment Data Merger + + Merges all Santiment parquet files into a unified features dataset + with proper handling of different data formats and time alignment. 
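+
+    Typical usage (a condensed version of main() at the bottom of this module):
+
+        merger = SantimentDataMerger(source_dir="data/santiment",
+                                     output_dir="data/santiment",
+                                     time_tolerance_hours=1)
+        if merger.process_all_files():
+            merger.save_merged_features("merged_features.parquet")
+            merger.print_summary()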
+ """ + + def __init__(self, + source_dir: str = "data/santiment", + output_dir: str = "data/santiment", + time_tolerance_hours: int = 1): + """ + Initialize the Santiment Data Merger + + Args: + source_dir: Directory containing Santiment parquet files + output_dir: Directory to save merged features + time_tolerance_hours: Tolerance for datetime matching (hours) + """ + # Resolve under DATA_DIR for portability + self.source_dir = _resolve_under_data(source_dir) + self.output_dir = _resolve_under_data(output_dir) + self.time_tolerance = timedelta(hours=time_tolerance_hours) + + # Ensure directories exist + self.source_dir.mkdir(parents=True, exist_ok=True) + self.output_dir.mkdir(parents=True, exist_ok=True) + + # Storage for processed data + self.dataframes: Dict[str, pd.DataFrame] = {} + self.merged_data: Optional[pd.DataFrame] = None + self.processing_stats = { + 'files_found': 0, + 'files_processed': 0, + 'files_failed': 0, + 'total_records': 0, + 'unique_slugs': set(), + 'date_range': {}, + 'categories': set() + } + + # Track placeholder mode (no input files) + self.placeholder_created = False + + # Initialize symbol normalizer + self.symbol_normalizer = self._setup_symbol_normalizer() + + def _setup_symbol_normalizer(self): + """ + Set up symbol normalization mapping for consistent asset identification + + Returns: + Dictionary mapping various symbol formats to canonical slugs + """ + # Canonical mapping for major crypto assets + # Maps various symbols/names to the official uppercase symbols + symbol_mapping = { + # Bitcoin variants + 'bitcoin': 'BTC', + 'btc': 'BTC', + 'Bitcoin': 'BTC', + 'BTC': 'BTC', + + # Ethereum variants + 'ethereum': 'ETH', + 'eth': 'ETH', + 'Ethereum': 'ETH', + 'ETH': 'ETH', + + # Ripple/XRP variants + 'ripple': 'XRP', + 'xrp': 'XRP', + 'Ripple': 'XRP', + 'XRP': 'XRP', + + # Solana variants + 'solana': 'SOL', + 'sol': 'SOL', + 'Solana': 'SOL', + 'SOL': 'SOL', + + # Cardano variants + 'cardano': 'ADA', + 'ada': 'ADA', + 'Cardano': 'ADA', + 'ADA': 'ADA', + + # Polkadot variants + 'polkadot': 'DOT', + 'dot': 'DOT', + 'Polkadot': 'DOT', + 'DOT': 'DOT', + + # Chainlink variants + 'chainlink': 'LINK', + 'link': 'LINK', + 'Chainlink': 'LINK', + 'LINK': 'LINK', + + # Litecoin variants + 'litecoin': 'LTC', + 'ltc': 'LTC', + 'Litecoin': 'LTC', + 'LTC': 'LTC', + + # Bitcoin Cash variants + 'bitcoin-cash': 'BCH', + 'bch': 'BCH', + 'Bitcoin Cash': 'BCH', + 'BCH': 'BCH', + + # Stellar variants + 'stellar': 'XLM', + 'xlm': 'XLM', + 'Stellar': 'XLM', + 'XLM': 'XLM', + + # Ethereum Classic variants + 'ethereum-classic': 'ETC', + 'etc': 'ETC', + 'Ethereum Classic': 'ETC', + 'ETC': 'ETC', + + # EOS variants + 'eos': 'EOS', + 'EOS': 'EOS', + } + + logger.info(f"Initialized symbol normalizer with {len(symbol_mapping)} mappings") + return symbol_mapping + + def normalize_symbol(self, symbol: str) -> str: + """ + Normalize a symbol to its canonical uppercase format + + Args: + symbol: Symbol to normalize + + Returns: + Canonical uppercase symbol (e.g., BTC, ETH, SOL) + """ + if symbol in self.symbol_normalizer: + canonical = self.symbol_normalizer[symbol] + if symbol != canonical: + logger.debug(f"Normalized '{symbol}' -> '{canonical}'") + return canonical + + # If not found in mapping, return uppercase version and log warning + logger.warning(f"Unknown symbol '{symbol}' not found in normalization mapping, using uppercase") + return symbol.upper() + + def find_parquet_files(self) -> List[Path]: + """ + Find all parquet files in the source directory + + Returns: + List of parquet 
file paths + """ + parquet_files = list(self.source_dir.glob("*.parquet")) + + # Filter out non-Santiment files and already merged files + santiment_files = [] + for file_path in parquet_files: + filename = file_path.name.lower() + # Include Santiment files but exclude already merged ones + if ('santiment_' in filename or 'ohlcv' in filename or 'prices' in filename) and 'merged' not in filename: + santiment_files.append(file_path) + + self.processing_stats['files_found'] = len(santiment_files) + logger.info(f"Found {len(santiment_files)} Santiment parquet files") + + return santiment_files + + def parse_filename(self, file_path: Path) -> Dict[str, str]: + """ + Parse filename to extract metadata + + Args: + file_path: Path to the parquet file + + Returns: + Dictionary with parsed metadata + """ + filename = file_path.stem + parts = filename.split('_') + + metadata = { + 'source': 'santiment', + 'category': 'unknown', + 'metric': 'unknown', + 'asset': 'unknown', + 'timestamp': 'unknown' + } + + try: + if filename.startswith('santiment_'): + # Format: santiment_category_metric_timestamp + if len(parts) >= 4: + metadata['category'] = parts[1] + metadata['metric'] = parts[2] + metadata['timestamp'] = '_'.join(parts[3:]) + elif 'ohlcv' in filename: + # Format: santiment_ohlcv_asset_timestamp + if len(parts) >= 4: + metadata['category'] = 'ohlcv' + metadata['metric'] = 'ohlcv' + metadata['asset'] = parts[2] + metadata['timestamp'] = '_'.join(parts[3:]) + elif 'prices' in filename: + # Format: santiment_prices_asset_timestamp + if len(parts) >= 4: + metadata['category'] = 'prices' + metadata['metric'] = 'prices_detailed' + metadata['asset'] = parts[2] + metadata['timestamp'] = '_'.join(parts[3:]) + + except Exception as e: + logger.warning(f"Failed to parse filename {filename}: {e}") + + return metadata + + def load_and_standardize_dataframe(self, file_path: Path) -> Optional[pd.DataFrame]: + """ + Load and standardize a parquet file + + Args: + file_path: Path to the parquet file + + Returns: + Standardized DataFrame or None if failed + """ + try: + df = pd.read_parquet(file_path) + + if df.empty: + logger.warning(f"Empty dataframe: {file_path.name}") + return None + + # Parse filename for metadata + metadata = self.parse_filename(file_path) + + # Standardize datetime index + if 'datetime' in df.columns: + df['datetime'] = pd.to_datetime(df['datetime']) + df.set_index('datetime', inplace=True) + elif df.index.name == 'datetime' or pd.api.types.is_datetime64_any_dtype(df.index): + df.index = pd.to_datetime(df.index) + df.index.name = 'datetime' + else: + # Try to find a datetime column + datetime_cols = [col for col in df.columns if 'date' in col.lower() or 'time' in col.lower()] + if datetime_cols: + df[datetime_cols[0]] = pd.to_datetime(df[datetime_cols[0]]) + df.set_index(datetime_cols[0], inplace=True) + df.index.name = 'datetime' + else: + logger.warning(f"No datetime column found in {file_path.name}") + return None + + # Ensure slug column exists + if 'slug' not in df.columns: + if metadata['asset'] != 'unknown': + # Normalize the asset symbol before assigning + normalized_asset = self.normalize_symbol(metadata['asset']) + df['slug'] = normalized_asset + if metadata['asset'] != normalized_asset: + logger.info(f"Normalized asset '{metadata['asset']}' -> '{normalized_asset}' in {file_path.name}") + else: + logger.warning(f"No slug information found in {file_path.name}") + return None + else: + # Normalize existing slug column + df['slug'] = df['slug'].apply(self.normalize_symbol) + 
logger.debug(f"Normalized existing slug column in {file_path.name}") + + # Add metadata columns + df['source_file'] = file_path.name + df['category'] = metadata['category'] + + # Rename columns to avoid conflicts and add prefixes + value_columns = [col for col in df.columns if col not in ['slug', 'metric', 'source_file', 'category']] + + # Add category prefix to value columns + category = metadata['category'] + metric = metadata['metric'] + + column_mapping = {} + for col in value_columns: + if col in ['slug', 'source_file', 'category']: + continue + + # Create meaningful column name + if col == 'value': + new_col = f"{category}_{metric}" + elif col in ['open', 'high', 'low', 'close', 'volume']: + new_col = f"{category}_{col}" + else: + new_col = f"{category}_{col}" + + column_mapping[col] = new_col + + df.rename(columns=column_mapping, inplace=True) + + # Update stats + self.processing_stats['unique_slugs'].update(df['slug'].unique()) + self.processing_stats['categories'].add(category) + + logger.info(f"Loaded {file_path.name}: {len(df)} records, {len(df.columns)} columns") + + return df + + except Exception as e: + logger.error(f"Failed to load {file_path.name}: {e}") + return None + + def merge_dataframes_by_slug_datetime(self, dataframes: List[pd.DataFrame]) -> pd.DataFrame: + """ + Merge multiple dataframes by slug and datetime with tolerance + + Args: + dataframes: List of DataFrames to merge + + Returns: + Merged DataFrame + """ + if not dataframes: + return pd.DataFrame() + + logger.info(f"Merging {len(dataframes)} dataframes...") + + # Start with the first dataframe + merged = dataframes[0].copy() + logger.info(f"Starting with base dataframe: {len(merged)} records") + + # Merge each subsequent dataframe + for i, df in enumerate(dataframes[1:], 1): + logger.info(f"Merging dataframe {i+1}/{len(dataframes)}: {len(df)} records") + + try: + # Merge on slug and datetime index with tolerance + merged = self._merge_with_time_tolerance(merged, df) + logger.info(f"After merge {i}: {len(merged)} records") + + except Exception as e: + logger.error(f"Failed to merge dataframe {i+1}: {e}") + continue + + return merged + + def _merge_with_time_tolerance(self, left_df: pd.DataFrame, right_df: pd.DataFrame) -> pd.DataFrame: + """ + Merge two dataframes with time tolerance + + Args: + left_df: Left DataFrame + right_df: Right DataFrame + + Returns: + Merged DataFrame + """ + # Reset index to make datetime a column for merging + left_reset = left_df.reset_index() + right_reset = right_df.reset_index() + + # Perform merge on slug first + common_slugs = set(left_reset['slug'].unique()) & set(right_reset['slug'].unique()) + + if not common_slugs: + # No common slugs, concatenate vertically + logger.warning("No common slugs found, concatenating dataframes") + combined = pd.concat([left_df, right_df], axis=0, sort=False) + return combined.sort_index() + + merged_parts = [] + + for slug in common_slugs: + left_slug = left_reset[left_reset['slug'] == slug].copy() + right_slug = right_reset[right_reset['slug'] == slug].copy() + + if left_slug.empty or right_slug.empty: + continue + + # Sort by datetime + left_slug = left_slug.sort_values('datetime') + right_slug = right_slug.sort_values('datetime') + + # Merge with time tolerance using pandas merge_asof + try: + merged_slug = pd.merge_asof( + left_slug, + right_slug, + on='datetime', + by='slug', + tolerance=self.time_tolerance, + direction='nearest', + suffixes=('', '_right') + ) + + # Remove duplicate columns + duplicate_cols = [col for col in 
merged_slug.columns if col.endswith('_right')] + for col in duplicate_cols: + base_col = col.replace('_right', '') + if base_col in merged_slug.columns: + # Keep non-null values, preferring left side + merged_slug[base_col] = merged_slug[base_col].fillna(merged_slug[col]) + else: + # Rename the right column + merged_slug[base_col] = merged_slug[col] + merged_slug.drop(columns=[col], inplace=True) + + merged_parts.append(merged_slug) + + except Exception as e: + logger.warning(f"Failed to merge slug {slug}: {e}") + # Fallback: simple concatenation for this slug + slug_combined = pd.concat([left_slug, right_slug], axis=0, sort=False) + merged_parts.append(slug_combined) + + # Handle slugs that exist in only one dataframe + left_only_slugs = set(left_reset['slug'].unique()) - common_slugs + right_only_slugs = set(right_reset['slug'].unique()) - common_slugs + + for slug in left_only_slugs: + merged_parts.append(left_reset[left_reset['slug'] == slug]) + + for slug in right_only_slugs: + merged_parts.append(right_reset[right_reset['slug'] == slug]) + + # Combine all parts + if merged_parts: + final_merged = pd.concat(merged_parts, axis=0, sort=False, ignore_index=True) + # Set datetime as index + final_merged.set_index('datetime', inplace=True) + return final_merged.sort_index() + else: + return left_df + + def fill_missing_values(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Comprehensive null filling strategy for the merged dataset + + Args: + df: DataFrame with potential null values + + Returns: + DataFrame with filled null values + """ + logger.info("Applying comprehensive null filling strategy...") + + filled_df = df.copy() + null_counts_before = filled_df.isnull().sum().sum() + + # Strategy 1: Forward fill within each asset (time-based continuity) + logger.info("Step 1: Forward filling within each asset...") + for slug in filled_df['slug'].unique(): + slug_mask = filled_df['slug'] == slug + filled_df.loc[slug_mask] = filled_df.loc[slug_mask].ffill() + + # Strategy 2: Backward fill within each asset (fill initial nulls) + logger.info("Step 2: Backward filling within each asset...") + for slug in filled_df['slug'].unique(): + slug_mask = filled_df['slug'] == slug + filled_df.loc[slug_mask] = filled_df.loc[slug_mask].bfill() + + # Strategy 3: Fill specific column types with appropriate defaults + logger.info("Step 3: Filling remaining nulls with type-specific defaults...") + + for col in filled_df.columns: + if filled_df[col].isnull().any(): + # Price and financial metrics: use median of the column + if any(keyword in col.lower() for keyword in ['price', 'usd', 'btc', 'eth', 'marketcap', 'volume']): + median_val = filled_df[col].median() + filled_df[col] = filled_df[col].fillna(median_val) + logger.debug(f"Filled {col} nulls with median: {median_val}") + + # Address and network metrics: use 0 (no activity) + elif any(keyword in col.lower() for keyword in ['address', 'network', 'active', 'transaction']): + filled_df[col] = filled_df[col].fillna(0) + logger.debug(f"Filled {col} nulls with 0") + + # Exchange metrics: use 0 (no flow) + elif any(keyword in col.lower() for keyword in ['exchange', 'inflow', 'outflow', 'balance']): + filled_df[col] = filled_df[col].fillna(0) + logger.debug(f"Filled {col} nulls with 0") + + # Supply metrics: forward fill or use mean + elif any(keyword in col.lower() for keyword in ['supply', 'circulation', 'velocity']): + mean_val = filled_df[col].mean() + filled_df[col] = filled_df[col].fillna(mean_val) + logger.debug(f"Filled {col} nulls with mean: {mean_val}") 
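+                # Note: these keyword groups are checked in order, so a column that matches an
+                # earlier group (e.g. anything containing 'volume', including social volume
+                # columns) is filled by that earlier rule and never reaches the later branches.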
+ + # Development metrics: use 0 (no activity) + elif any(keyword in col.lower() for keyword in ['dev', 'github', 'contributors']): + filled_df[col] = filled_df[col].fillna(0) + logger.debug(f"Filled {col} nulls with 0") + + # Social metrics: use 0 (no mentions) + elif any(keyword in col.lower() for keyword in ['social', 'sentiment', 'volume_4chan', 'volume_reddit']): + filled_df[col] = filled_df[col].fillna(0) + logger.debug(f"Filled {col} nulls with 0") + + # OHLCV metrics: use forward fill or interpolation + elif any(keyword in col.lower() for keyword in ['open', 'high', 'low', 'close', 'ohlcv']): + filled_df[col] = filled_df[col].ffill().bfill() + logger.debug(f"Filled {col} nulls with forward/backward fill") + + # Derivatives and whale metrics: use 0 + elif any(keyword in col.lower() for keyword in ['funding', 'interest', 'whale', 'holders']): + filled_df[col] = filled_df[col].fillna(0) + logger.debug(f"Filled {col} nulls with 0") + + # String columns: use 'unknown' or most frequent value + elif filled_df[col].dtype == 'object': + if col in ['slug', 'category', 'source_file', 'metric', 'development_alternative_slug_used']: + # Skip these columns as they will be removed or are handled separately + continue + else: + mode_val = filled_df[col].mode() + if len(mode_val) > 0: + filled_df[col] = filled_df[col].fillna(mode_val[0]) + else: + filled_df[col] = filled_df[col].fillna('unknown') + logger.debug(f"Filled {col} nulls with mode/unknown") + + # Any remaining numeric nulls: use median + elif pd.api.types.is_numeric_dtype(filled_df[col]): + median_val = filled_df[col].median() + if pd.notna(median_val): + filled_df[col] = filled_df[col].fillna(median_val) + logger.debug(f"Filled {col} nulls with median: {median_val}") + else: + filled_df[col] = filled_df[col].fillna(0) + logger.debug(f"Filled {col} nulls with 0 (median was NaN)") + + null_counts_after = filled_df.isnull().sum().sum() + nulls_filled = null_counts_before - null_counts_after + + logger.info(f"Null filling completed:") + logger.info(f" Nulls before: {null_counts_before:,}") + logger.info(f" Nulls after: {null_counts_after:,}") + logger.info(f" Nulls filled: {nulls_filled:,}") + + return filled_df + + def process_all_files(self) -> bool: + """ + Process all Santiment parquet files + + Returns: + True if successful, False otherwise + """ + try: + # Find all parquet files + parquet_files = self.find_parquet_files() + + if not parquet_files: + logger.warning("No Santiment parquet files found") + # Graceful fallback: create minimal placeholder merged file to unblock pipeline + try: + # Create an explicitly typed empty DF with expected columns + placeholder = pd.DataFrame({'slug': pd.Series(dtype='object')}) + # Set an empty datetime index (naive) with the expected name + placeholder.index = pd.DatetimeIndex([], name='datetime') + # Ensure output directory exists + self.output_dir.mkdir(parents=True, exist_ok=True) + out_path = self.output_dir / "merged_features.parquet" + # Save directly, bypassing save_merged_features constraints + placeholder.to_parquet(out_path, index=True) + # Mark placeholder state and keep merged_data None + self.placeholder_created = True + logger.info(f"Created placeholder Santiment merged_features.parquet with 0 rows at {out_path}") + return True + except Exception as e: + logger.error(f"Failed to create placeholder Santiment file: {e}") + return False + + # Load and standardize all dataframes + dataframes = [] + + for file_path in parquet_files: + try: + df = 
self.load_and_standardize_dataframe(file_path) + if df is not None: + dataframes.append(df) + self.processing_stats['files_processed'] += 1 + self.processing_stats['total_records'] += len(df) + else: + self.processing_stats['files_failed'] += 1 + + except Exception as e: + logger.error(f"Failed to process {file_path.name}: {e}") + self.processing_stats['files_failed'] += 1 + + if not dataframes: + logger.error("No dataframes were successfully loaded") + return False + + # Merge all dataframes + logger.info("Starting merge process...") + self.merged_data = self.merge_dataframes_by_slug_datetime(dataframes) + + if self.merged_data.empty: + logger.error("Merged dataframe is empty") + return False + + # Update final stats + self.processing_stats['date_range'] = { + 'start': str(self.merged_data.index.min()), + 'end': str(self.merged_data.index.max()), + 'total_days': (self.merged_data.index.max() - self.merged_data.index.min()).days + } + + logger.info("All files processed successfully") + return True + + except Exception as e: + logger.error(f"Failed to process files: {e}") + return False + + def save_merged_features(self, filename: str = "merged_features.parquet") -> bool: + """ + Save the merged features to a parquet file with comprehensive null filling + + Args: + filename: Output filename + + Returns: + True if successful, False otherwise + """ + if self.merged_data is None or self.merged_data.empty: + logger.error("No merged data to save") + return False + + try: + output_path = self.output_dir / filename + + # Clean up the dataframe before saving + cleaned_df = self.merged_data.copy() + + # Remove any completely null columns + null_columns = cleaned_df.columns[cleaned_df.isnull().all()].tolist() + if null_columns: + logger.info(f"Removing {len(null_columns)} completely null columns: {null_columns}") + cleaned_df = cleaned_df.dropna(axis=1, how='all') + + # Apply comprehensive null filling strategy + logger.info("Applying comprehensive null filling...") + cleaned_df = self.fill_missing_values(cleaned_df) + + # Remove unwanted columns + columns_to_remove = ['metric', 'source_file', 'category', 'development_alternative_slug_used'] + existing_cols_to_remove = [col for col in columns_to_remove if col in cleaned_df.columns] + if existing_cols_to_remove: + logger.info(f"Removing unwanted columns: {existing_cols_to_remove}") + cleaned_df = cleaned_df.drop(columns=existing_cols_to_remove) + + # Ensure all slugs are in uppercase format + logger.info("Ensuring all slugs are in uppercase format...") + cleaned_df['slug'] = cleaned_df['slug'].apply(lambda x: x.upper() if isinstance(x, str) else x) + + # Fix data type issues for parquet compatibility + logger.info("Fixing data types for parquet compatibility...") + for col in cleaned_df.columns: + if cleaned_df[col].dtype == 'object': + # Check if column contains mixed types + sample_values = cleaned_df[col].dropna().head(100) + if len(sample_values) > 0: + # If it looks like it should be numeric, convert it + try: + pd.to_numeric(sample_values, errors='raise') + # If no error, convert the entire column + cleaned_df[col] = pd.to_numeric(cleaned_df[col], errors='coerce') + logger.debug(f"Converted {col} to numeric") + except (ValueError, TypeError): + # If conversion fails, ensure it's all strings + cleaned_df[col] = cleaned_df[col].astype(str) + logger.debug(f"Converted {col} to string") + + # Sort by datetime and slug + cleaned_df = cleaned_df.sort_index() + cleaned_df = cleaned_df.sort_values(['slug'], kind='mergesort') + + # Final data quality 
check + remaining_nulls = cleaned_df.isnull().sum().sum() + if remaining_nulls > 0: + logger.warning(f"Warning: {remaining_nulls} null values remain after filling") + # Log columns with remaining nulls + null_cols = cleaned_df.columns[cleaned_df.isnull().any()].tolist() + logger.warning(f"Columns with remaining nulls: {null_cols}") + else: + logger.info("✓ All null values successfully filled") + + # Save to parquet with error handling + try: + cleaned_df.to_parquet(output_path, compression='snappy') + except Exception as parquet_error: + logger.error(f"Parquet save failed: {parquet_error}") + # Try to identify problematic columns + logger.info("Analyzing columns for parquet compatibility...") + for col in cleaned_df.columns: + try: + test_df = cleaned_df[[col]].copy() + test_df.to_parquet(output_path.with_suffix('.test.parquet')) + output_path.with_suffix('.test.parquet').unlink() # Clean up test file + except Exception as col_error: + logger.error(f"Column {col} causing issues: {col_error}") + # Force convert problematic column to string + cleaned_df[col] = cleaned_df[col].astype(str) + logger.info(f"Converted problematic column {col} to string") + + # Try saving again + cleaned_df.to_parquet(output_path, compression='snappy') + + logger.info(f"Merged features saved to {output_path}") + logger.info(f"Final dataset: {len(cleaned_df)} records, {len(cleaned_df.columns)} columns") + logger.info(f"Data completeness: {100 - (remaining_nulls / (len(cleaned_df) * len(cleaned_df.columns)) * 100):.2f}%") + + return True + + except Exception as e: + logger.error(f"Failed to save merged features: {e}") + return False + + def generate_summary_report(self) -> Dict: + """ + Generate a comprehensive summary report + + Returns: + Summary dictionary + """ + summary = { + 'processing_timestamp': datetime.now().isoformat(), + 'files_statistics': { + 'files_found': self.processing_stats['files_found'], + 'files_processed': self.processing_stats['files_processed'], + 'files_failed': self.processing_stats['files_failed'], + 'success_rate': f"{(self.processing_stats['files_processed'] / max(1, self.processing_stats['files_found'])) * 100:.1f}%" + }, + 'data_statistics': { + 'total_records': self.processing_stats['total_records'], + 'unique_slugs': list(self.processing_stats['unique_slugs']), + 'categories_found': list(self.processing_stats['categories']), + 'date_range': self.processing_stats['date_range'] + } + } + + if self.merged_data is not None: + summary['merged_statistics'] = { + 'final_records': len(self.merged_data), + 'final_columns': len(self.merged_data.columns), + 'memory_usage_mb': f"{self.merged_data.memory_usage(deep=True).sum() / 1024 / 1024:.2f}", + 'slug_distribution': self.merged_data['slug'].value_counts().to_dict(), + 'null_percentage': f"{(self.merged_data.isnull().sum().sum() / (len(self.merged_data) * len(self.merged_data.columns))) * 100:.2f}%" + } + + return summary + + def print_summary(self): + """Print a comprehensive summary of the merge process""" + summary = self.generate_summary_report() + + print("\n" + "="*60) + print("SANTIMENT DATA MERGER SUMMARY") + print("="*60) + + # File statistics + print(f"\nFile Processing:") + print(f" Files found: {summary['files_statistics']['files_found']}") + print(f" Files processed: {summary['files_statistics']['files_processed']}") + print(f" Files failed: {summary['files_statistics']['files_failed']}") + print(f" Success rate: {summary['files_statistics']['success_rate']}") + + # Data statistics + print(f"\nData Overview:") + print(f" Total 
records processed: {summary['data_statistics']['total_records']:,}") + print(f" Unique assets (slugs): {len(summary['data_statistics']['unique_slugs'])}") + print(f" Categories found: {', '.join(summary['data_statistics']['categories_found'])}") + + if summary['data_statistics']['date_range']: + print(f" Date range: {summary['data_statistics']['date_range']['start']} to {summary['data_statistics']['date_range']['end']}") + print(f" Total days: {summary['data_statistics']['date_range']['total_days']}") + + # Merged statistics + if 'merged_statistics' in summary: + print(f"\nMerged Dataset:") + print(f" Final records: {summary['merged_statistics']['final_records']:,}") + print(f" Final columns: {summary['merged_statistics']['final_columns']}") + print(f" Memory usage: {summary['merged_statistics']['memory_usage_mb']} MB") + print(f" Data completeness: {100 - float(summary['merged_statistics']['null_percentage'].rstrip('%')):.1f}%") + + # Show top assets by record count + print(f"\nTop Assets by Record Count:") + slug_dist = summary['merged_statistics']['slug_distribution'] + for slug, count in list(slug_dist.items())[:5]: + print(f" {slug}: {count:,} records") + + print("="*60) + + +def main(): + """Main function to run the Santiment data merger""" + logger.info("Starting Santiment Data Merger...") + + # Initialize the merger + merger = SantimentDataMerger( + source_dir="data/santiment", + output_dir="data/santiment", + time_tolerance_hours=1 + ) + + try: + # Process all files + success = merger.process_all_files() + + if not success: + logger.error("Failed to process Santiment files") + return False + + # If we only created a placeholder, treat as successful and skip saving/summary + if merger.placeholder_created: + logger.info("Placeholder Santiment dataset created; skipping save and summary.") + return True + + # Save merged features + save_success = merger.save_merged_features("merged_features.parquet") + + if not save_success: + logger.error("Failed to save merged features") + return False + + # Print summary + merger.print_summary() + + # Save summary report + summary = merger.generate_summary_report() + summary_path = Path("data/santiment") / "merge_summary.json" + + import json + with open(summary_path, 'w') as f: + json.dump(summary, f, indent=2, default=str) + + logger.info(f"Summary report saved to {summary_path}") + logger.info("Santiment data merge completed successfully!") + + return True + + except Exception as e: + logger.error(f"Santiment data merge failed: {e}") + return False + + +if __name__ == "__main__": + main() diff --git a/src/merge/merge_santiment_time_shifted.py b/src/merge/merge_santiment_time_shifted.py new file mode 100644 index 0000000000000000000000000000000000000000..bc2ca0e82db3e526825ce4a42d3c382b111e485e --- /dev/null +++ b/src/merge/merge_santiment_time_shifted.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +""" +Time-Shifted Santiment-Crypto Merger +=================================== + +This script handles the case where Santiment data and crypto data have different date ranges +due to API limitations. It performs a time-shifted merge using pattern matching. + +Approaches: +1. Offset-based: Map August crypto data to July Santiment data with consistent offset +2. Day-of-week matching: Match same weekdays/times across different months +3. 
Pattern-based: Use similar market patterns from different time periods +""" + +import pandas as pd +import numpy as np +from datetime import datetime, timedelta +import os +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +def load_data(): + """Load crypto and Santiment data""" + logger.info("Loading data files...") + + # Load crypto features + crypto_file = 'data/merged/features/crypto_features.parquet' + crypto_df = pd.read_parquet(crypto_file) + crypto_df['datetime'] = pd.to_datetime(crypto_df['interval_timestamp'], unit='ms', utc=True) + + # Load Santiment features + santiment_file = 'data/santiment/merged_features.parquet' + santiment_df = pd.read_parquet(santiment_file) + + logger.info(f"Crypto: {len(crypto_df)} records from {crypto_df['datetime'].min()} to {crypto_df['datetime'].max()}") + logger.info(f"Santiment: {len(santiment_df)} records from {santiment_df.index.min()} to {santiment_df.index.max()}") + + return crypto_df, santiment_df + +def calculate_time_offset(crypto_df, santiment_df): + """Calculate the time offset between datasets""" + crypto_start = crypto_df['datetime'].min() + santiment_start = santiment_df.index.min() + + offset = crypto_start - santiment_start + logger.info(f"Time offset: {offset.days} days") + + return offset + +def merge_with_time_shift(crypto_df, santiment_df, method='offset'): + """ + Merge crypto and Santiment data using time-shift techniques + + Args: + crypto_df: Crypto features DataFrame + santiment_df: Santiment features DataFrame + method: 'offset', 'day_of_week', or 'pattern' + """ + logger.info(f"Starting time-shifted merge using method: {method}") + + merged_results = [] + symbol_mapping = {'BTC': 'BTC', 'ETH': 'ETH', 'ADA': 'ADA', 'SOL': 'SOL', 'XRP': 'XRP'} + + if method == 'offset': + # Calculate consistent time offset + offset = calculate_time_offset(crypto_df, santiment_df) + + for symbol, slug in symbol_mapping.items(): + logger.info(f"Processing {symbol} → {slug} with offset method") + + crypto_symbol = crypto_df[crypto_df['symbol'] == symbol].copy() + santiment_slug = santiment_df[santiment_df['slug'] == slug].copy() + + if crypto_symbol.empty or santiment_slug.empty: + logger.warning(f"Skipping {symbol} - missing data") + continue + + # Apply offset to match timeframes + merged_symbol = merge_with_offset(crypto_symbol, santiment_slug, offset) + merged_results.append(merged_symbol) + + elif method == 'day_of_week': + # Match same day-of-week and time patterns + for symbol, slug in symbol_mapping.items(): + logger.info(f"Processing {symbol} → {slug} with day-of-week method") + + crypto_symbol = crypto_df[crypto_df['symbol'] == symbol].copy() + santiment_slug = santiment_df[santiment_df['slug'] == slug].copy() + + if crypto_symbol.empty or santiment_slug.empty: + logger.warning(f"Skipping {symbol} - missing data") + continue + + merged_symbol = merge_by_day_pattern(crypto_symbol, santiment_slug) + merged_results.append(merged_symbol) + + # Combine results + if merged_results: + merged_df = pd.concat(merged_results, ignore_index=True) + logger.info(f"Merge completed: {len(merged_df)} records") + return merged_df + else: + logger.error("No data could be merged!") + return None + +def merge_with_offset(crypto_symbol, santiment_slug, offset): + """Merge using consistent time offset""" + merged_records = [] + + for _, crypto_row in crypto_symbol.iterrows(): + # Shift crypto timestamp back by offset to match Santiment 
timeframe + shifted_time = crypto_row['datetime'] - offset + + # Find closest Santiment record + time_diffs = np.abs(santiment_slug.index - shifted_time) + closest_idx = time_diffs.argmin() + closest_idx = santiment_slug.index[closest_idx] + + # Check if match is reasonable (within 1 hour) + if time_diffs.min() <= pd.Timedelta(hours=1): + santiment_row = santiment_slug.loc[closest_idx] + + # Combine data + combined_row = crypto_row.copy() + for col in santiment_slug.columns: + if col != 'slug': + combined_row[f'santiment_{col}'] = santiment_row[col] + + merged_records.append(combined_row) + + return pd.DataFrame(merged_records) + +def merge_by_day_pattern(crypto_symbol, santiment_slug): + """Merge by matching day-of-week and time patterns""" + merged_records = [] + + for _, crypto_row in crypto_symbol.iterrows(): + crypto_time = crypto_row['datetime'] + + # Find Santiment records with same day-of-week and similar time + santiment_same_weekday = santiment_slug[ + santiment_slug.index.dayofweek == crypto_time.dayofweek + ] + + if not santiment_same_weekday.empty: + # Find closest time-of-day match + crypto_time_of_day = crypto_time.time() + + time_diffs = santiment_same_weekday.index.map( + lambda x: abs((x.time().hour * 60 + x.time().minute) - + (crypto_time_of_day.hour * 60 + crypto_time_of_day.minute)) + ) + + closest_idx = time_diffs.argmin() + closest_idx = santiment_same_weekday.index[closest_idx] + santiment_row = santiment_same_weekday.loc[closest_idx] + + # Combine data + combined_row = crypto_row.copy() + for col in santiment_slug.columns: + if col != 'slug': + combined_row[f'santiment_{col}'] = santiment_row[col] + + merged_records.append(combined_row) + + return pd.DataFrame(merged_records) + +def analyze_merge_quality(merged_df, method): + """Analyze merge quality and provide statistics""" + if merged_df is None or merged_df.empty: + return {"error": "No merged data"} + + santiment_cols = [col for col in merged_df.columns if col.startswith('santiment_')] + + analysis = { + 'method_used': method, + 'total_records': len(merged_df), + 'santiment_features_added': len(santiment_cols), + 'symbols_processed': sorted(merged_df['symbol'].unique()), + 'completeness_by_symbol': {} + } + + # Calculate completeness by symbol + for symbol in analysis['symbols_processed']: + symbol_data = merged_df[merged_df['symbol'] == symbol] + non_null_counts = symbol_data[santiment_cols].notna().sum(axis=1) + records_with_santiment = (non_null_counts > 0).sum() + + analysis['completeness_by_symbol'][symbol] = { + 'total_records': len(symbol_data), + 'records_with_santiment': records_with_santiment, + 'completeness_pct': records_with_santiment / len(symbol_data) * 100 + } + + return analysis + +def save_results(merged_df, analysis, method): + """Save merged results with method identifier""" + if merged_df is None: + logger.error("Cannot save - no merged data") + return None, None + + logger.info("Saving time-shifted merge results...") + + # Create output directory + output_dir = 'data/merged/features' + os.makedirs(output_dir, exist_ok=True) + + # Save with method identifier + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_file = os.path.join(output_dir, f'crypto_with_santiment_{method}_{timestamp}.parquet') + + merged_df.to_parquet(output_file, index=False) + logger.info(f"Merged features saved to: {output_file}") + + # Save analysis + analysis_file = os.path.join(output_dir, f'santiment_merge_analysis_{method}_{timestamp}.json') + import json + with open(analysis_file, 'w') as f: + 
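+        # default=str lets json serialize values the analysis dict may contain that the json
+        # module cannot handle natively (e.g. numpy integers or pandas timestamps)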
json.dump(analysis, f, indent=2, default=str) + + logger.info(f"Analysis saved to: {analysis_file}") + + return output_file, analysis_file + +def main(): + """Main time-shifted merge process""" + logger.info("Starting time-shifted Santiment-Crypto merge...") + + try: + # Load data + crypto_df, santiment_df = load_data() + + # Try different merge methods + methods = ['offset', 'day_of_week'] + results = {} + + for method in methods: + logger.info(f"\n{'='*50}") + logger.info(f"TRYING METHOD: {method.upper()}") + logger.info(f"{'='*50}") + + merged_df = merge_with_time_shift(crypto_df, santiment_df, method=method) + analysis = analyze_merge_quality(merged_df, method) + + if merged_df is not None: + output_file, analysis_file = save_results(merged_df, analysis, method) + results[method] = { + 'success': True, + 'records': len(merged_df), + 'completeness': analysis.get('completeness_by_symbol', {}), + 'output_file': output_file + } + else: + results[method] = {'success': False} + + # Print summary + print("\n" + "="*60) + print("TIME-SHIFTED MERGE SUMMARY") + print("="*60) + + for method, result in results.items(): + print(f"\n{method.upper()} METHOD:") + if result['success']: + print(f" ✅ Success: {result['records']} records merged") + print(f" 📁 File: {result['output_file']}") + for symbol, stats in result['completeness'].items(): + print(f" {symbol}: {stats['completeness_pct']:.1f}% complete") + else: + print(f" ❌ Failed") + + print("="*60) + + except Exception as e: + logger.error(f"Time-shifted merge failed: {e}") + raise + +if __name__ == "__main__": + main() diff --git a/src/merge/merge_santiment_to_crypto.py b/src/merge/merge_santiment_to_crypto.py new file mode 100644 index 0000000000000000000000000000000000000000..58b896be3a68ae3b8f6d93478465f69ad8b82cd6 --- /dev/null +++ b/src/merge/merge_santiment_to_crypto.py @@ -0,0 +1,422 @@ +#!/usr/bin/env python3 +""" +Merge Santiment Features with Crypto Features +============================================ + +This script merges Santiment data with existing crypto features by matching: +- symbol (crypto) = slug (santiment) +- interval_timestamp (crypto) = datetime (santiment) with ±1 hour tolerance + +The result includes all original crypto features plus all Santiment features. 
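+
+Run it as a script (see main() below): it reads data/merged/features/crypto_features.parquet
+and data/santiment/merged_features.parquet (both resolved under the configured DATA_DIR), and
+writes a timestamped crypto_with_santiment_features_<YYYYMMDD_HHMMSS>.parquet plus a JSON
+merge-quality report under data/merged/features/.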
+""" + +import pandas as pd +import numpy as np +from datetime import datetime, timedelta +import os +from pathlib import Path +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Resolve data directory base +try: + from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR + except Exception: + CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: + p = Path(path_like) + if p.is_absolute(): + return p + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return Path(CFG_DATA_DIR) / rel + +def convert_timestamp_to_datetime(timestamp_ms): + """ + Convert millisecond timestamp to datetime + + Args: + timestamp_ms: Timestamp in milliseconds + + Returns: + Datetime object + """ + return pd.to_datetime(timestamp_ms, unit='ms', utc=True) + +def normalize_symbol_mapping(): + """ + Create symbol mapping between crypto symbols and Santiment slugs + + Returns: + Dictionary mapping crypto symbols to Santiment slugs + """ + # Both crypto and Santiment use the same symbol names + return { + 'BTC': 'BTC', + 'ETH': 'ETH', + 'ADA': 'ADA', + 'SOL': 'SOL', + 'XRP': 'XRP' + } + +def load_data(): + """ + Load crypto features and Santiment features + + Returns: + Tuple of (crypto_df, santiment_df) + """ + logger.info("Loading data files...") + + # Load crypto features + crypto_file = _resolve_under_data('data/merged/features/crypto_features.parquet') + if not os.path.exists(crypto_file): + raise FileNotFoundError(f"Crypto features file not found: {crypto_file}") + + crypto_df = pd.read_parquet(crypto_file) + logger.info(f"Loaded crypto features: {crypto_df.shape[0]} rows, {crypto_df.shape[1]} columns") + + # Load Santiment features + santiment_file = _resolve_under_data('data/santiment/merged_features.parquet') + if not os.path.exists(santiment_file): + logger.warning(f"Santiment features file not found: {santiment_file}") + logger.warning("Proceeding without Santiment features (crypto-only output)") + return crypto_df, None + + santiment_df = pd.read_parquet(santiment_file) + logger.info(f"Loaded Santiment features: {santiment_df.shape[0]} rows, {santiment_df.shape[1]} columns") + + return crypto_df, santiment_df + +def prepare_crypto_data(crypto_df): + """ + Prepare crypto data for merging + + Args: + crypto_df: Crypto features DataFrame + + Returns: + Prepared crypto DataFrame + """ + logger.info("Preparing crypto data...") + + # Convert interval_timestamp to datetime + crypto_df = crypto_df.copy() + crypto_df['datetime'] = convert_timestamp_to_datetime(crypto_df['interval_timestamp']) + + # Set datetime as index for easier merging + crypto_df.set_index('datetime', inplace=True) + + logger.info(f"Crypto date range: {crypto_df.index.min()} to {crypto_df.index.max()}") + logger.info(f"Crypto symbols: {sorted(crypto_df['symbol'].unique())}") + + return crypto_df + +def prepare_santiment_data(santiment_df): + """ + Prepare Santiment data for merging + + Args: + santiment_df: Santiment features DataFrame + + Returns: + Prepared Santiment DataFrame + """ + logger.info("Preparing Santiment data...") + + santiment_df = santiment_df.copy() + + # Ensure datetime index is timezone-aware (convert to UTC if needed) + if santiment_df.index.tz is None: + santiment_df.index = pd.to_datetime(santiment_df.index, utc=True) + elif 
str(santiment_df.index.tz) != 'UTC': +        santiment_df.index = santiment_df.index.tz_convert('UTC') + +    logger.info(f"Santiment date range: {santiment_df.index.min()} to {santiment_df.index.max()}") +    logger.info(f"Santiment slugs: {sorted(santiment_df['slug'].unique())}") + +    return santiment_df + +def merge_with_tolerance(crypto_df, santiment_df, symbol_mapping, tolerance_hours=1): +    """ +    Merge crypto and Santiment data with time tolerance + +    Args: +        crypto_df: Prepared crypto DataFrame +        santiment_df: Prepared Santiment DataFrame +        symbol_mapping: Dict mapping crypto symbols to Santiment slugs +        tolerance_hours: Time tolerance in hours for matching + +    Returns: +        Merged DataFrame +    """ +    logger.info(f"Starting merge with ±{tolerance_hours} hour tolerance...") + +    merged_results = [] +    tolerance = pd.Timedelta(hours=tolerance_hours) + +    # Track merge statistics +    total_crypto_records = len(crypto_df) +    successful_matches = 0 + +    for symbol, slug in symbol_mapping.items(): +        logger.info(f"Processing {symbol} → {slug}") + +        # Filter data for current symbol/slug +        crypto_symbol = crypto_df[crypto_df['symbol'] == symbol].copy() +        santiment_slug = santiment_df[santiment_df['slug'] == slug].copy() + +        if crypto_symbol.empty: +            logger.warning(f"No crypto data found for symbol: {symbol}") +            continue + +        if santiment_slug.empty: +            logger.warning(f"No Santiment data found for slug: {slug}") +            # Add crypto data with null Santiment features +            crypto_symbol_with_nulls = add_null_santiment_features(crypto_symbol, santiment_df.columns) +            merged_results.append(crypto_symbol_with_nulls) +            continue + +        # Perform time-tolerance merge (all crypto rows are kept; unmatched rows get NaN Santiment features) +        merged_symbol = merge_by_time_tolerance(crypto_symbol, santiment_slug, tolerance) +        merged_results.append(merged_symbol) + +        matches = len(merged_symbol) +        successful_matches += matches +        logger.info(f"  Merged {matches} records for {symbol}") + +    # Combine all results +    if merged_results: +        merged_df = pd.concat(merged_results, ignore_index=False) +        logger.info(f"Merge completed: {successful_matches}/{total_crypto_records} crypto records carried through ({successful_matches/total_crypto_records*100:.1f}%)") +    else: +        logger.error("No data could be merged!") +        return None + +    return merged_df + +def merge_by_time_tolerance(crypto_symbol, santiment_slug, tolerance): +    """ +    Merge crypto and Santiment data for a single symbol with time tolerance + +    Args: +        crypto_symbol: Crypto data for one symbol +        santiment_slug: Santiment data for one slug +        tolerance: Time tolerance as Timedelta + +    Returns: +        Merged DataFrame for this symbol +    """ +    merged_records = [] + +    for crypto_time, crypto_row in crypto_symbol.iterrows(): +        # Find Santiment records within tolerance +        time_diff = np.abs(santiment_slug.index - crypto_time) +        within_tolerance = time_diff <= tolerance + +        if within_tolerance.any(): +            # Get the closest match within tolerance (argmin returns a position, not an index label) +            closest_pos = time_diff.argmin() +            santiment_row = santiment_slug.iloc[closest_pos] + +            # Combine crypto and Santiment features +            combined_row = crypto_row.copy() + +            # Add Santiment features (excluding 'slug' to avoid duplication) +            for col in santiment_slug.columns: +                if col != 'slug':  # Don't overwrite symbol with slug +                    combined_row[f'santiment_{col}'] = santiment_row[col] + +            merged_records.append(combined_row) +        else: +            # No match found - add with null Santiment features +            combined_row = crypto_row.copy() +            for col in santiment_slug.columns: +                if col != 'slug': +                    combined_row[f'santiment_{col}'] = np.nan + 
merged_records.append(combined_row) + + return pd.DataFrame(merged_records, index=crypto_symbol.index) + +def add_null_santiment_features(crypto_df, santiment_columns): + """ + Add null Santiment features to crypto data when no Santiment data exists + + Args: + crypto_df: Crypto DataFrame + santiment_columns: Santiment column names + + Returns: + Crypto DataFrame with null Santiment features + """ + crypto_with_nulls = crypto_df.copy() + + for col in santiment_columns: + if col != 'slug': # Don't add slug column + crypto_with_nulls[f'santiment_{col}'] = np.nan + + return crypto_with_nulls + +def analyze_merge_quality(merged_df): + """ + Analyze the quality of the merge + + Args: + merged_df: Merged DataFrame + + Returns: + Dictionary with merge quality metrics + """ + logger.info("Analyzing merge quality...") + + # Count Santiment features (exclude slug) + santiment_cols = [col for col in merged_df.columns if col.startswith('santiment_')] + + analysis = { + 'total_records': len(merged_df), + 'santiment_features_added': len(santiment_cols), + 'symbols_processed': sorted(merged_df['symbol'].unique()), + 'completeness_by_symbol': {}, + 'overall_completeness': 0.0 + } + + # Analyze completeness by symbol + for symbol in analysis['symbols_processed']: + symbol_data = merged_df[merged_df['symbol'] == symbol] + + # Calculate how many records have non-null Santiment data + non_null_counts = symbol_data[santiment_cols].notna().sum(axis=1) + records_with_santiment = (non_null_counts > 0).sum() + + completeness = records_with_santiment / len(symbol_data) * 100 + analysis['completeness_by_symbol'][symbol] = { + 'total_records': len(symbol_data), + 'records_with_santiment': records_with_santiment, + 'completeness_pct': completeness + } + + # Overall completeness + all_santiment_data = merged_df[santiment_cols].notna().sum(axis=1) + records_with_any_santiment = (all_santiment_data > 0).sum() + analysis['overall_completeness'] = records_with_any_santiment / len(merged_df) * 100 + + return analysis + +def save_results(merged_df, analysis): + """ + Save merged results and analysis + + Args: + merged_df: Merged DataFrame + analysis: Merge quality analysis + """ + logger.info("Saving results...") + + # Create output directory + output_dir = 'data/merged/features' + os.makedirs(output_dir, exist_ok=True) + + # Save merged features + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_file = os.path.join(output_dir, f'crypto_with_santiment_features_{timestamp}.parquet') + + # Reset index to include datetime as column + merged_df_export = merged_df.reset_index() + merged_df_export.to_parquet(output_file, index=False) + + logger.info(f"Merged features saved to: {output_file}") + + # Save analysis report + analysis_file = os.path.join(output_dir, f'santiment_merge_analysis_{timestamp}.json') + import json + with open(analysis_file, 'w') as f: + json.dump(analysis, f, indent=2, default=str) + + logger.info(f"Analysis saved to: {analysis_file}") + + return output_file, analysis_file + +def main(): + """ + Main merge process + """ + logger.info("Starting Santiment-Crypto merge process...") + + try: + # Load data + crypto_df, santiment_df = load_data() + + # Prepare data + crypto_prepared = prepare_crypto_data(crypto_df) + if santiment_df is None: + logger.warning("No Santiment data available; exporting crypto-only dataset") + # Export crypto-only with datetime included + output_dir = 'data/merged/features' + os.makedirs(output_dir, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + 
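+            # Fallback path: with no Santiment data the crypto features are exported unchanged
+            # (plus the datetime column from reset_index), under the same timestamped file
+            # naming scheme as the merged output.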
output_file = os.path.join(output_dir, f'crypto_with_santiment_features_{timestamp}.parquet') +            crypto_prepared.reset_index().to_parquet(output_file, index=False) +            logger.info(f"Crypto-only features saved to: {output_file}") +            return +        santiment_prepared = prepare_santiment_data(santiment_df) + +        # Define symbol mapping +        symbol_mapping = normalize_symbol_mapping() +        logger.info(f"Symbol mapping: {symbol_mapping}") + +        # Perform merge +        merged_df = merge_with_tolerance( +            crypto_prepared, +            santiment_prepared, +            symbol_mapping, +            tolerance_hours=1 +        ) + +        if merged_df is None: +            logger.error("Merge failed!") +            return + +        # Analyze results +        analysis = analyze_merge_quality(merged_df) + +        # Print summary +        print("\n" + "="*60) +        print("SANTIMENT-CRYPTO MERGE SUMMARY") +        print("="*60) +        print(f"Total records: {analysis['total_records']}") +        print(f"Santiment features added: {analysis['santiment_features_added']}") +        print(f"Overall completeness: {analysis['overall_completeness']:.1f}%") +        print(f"Symbols processed: {analysis['symbols_processed']}") + +        print(f"\nCompleteness by symbol:") +        for symbol, stats in analysis['completeness_by_symbol'].items(): +            print(f"  {symbol}: {stats['records_with_santiment']}/{stats['total_records']} " +                  f"({stats['completeness_pct']:.1f}%)") + +        # Save results +        output_file, analysis_file = save_results(merged_df, analysis) + +        print(f"\nFiles saved:") +        print(f"  Merged data: {output_file}") +        print(f"  Analysis: {analysis_file}") +        print("="*60) + +        logger.info("Merge process completed successfully!") + +    except Exception as e: +        logger.error(f"Merge process failed: {e}") +        raise + +if __name__ == "__main__": +    main() diff --git a/src/merge/merge_santiment_with_crypto.py b/src/merge/merge_santiment_with_crypto.py new file mode 100644 index 0000000000000000000000000000000000000000..766dbf5c4c47d6588c32cd36162c6ae8817c1e20 --- /dev/null +++ b/src/merge/merge_santiment_with_crypto.py @@ -0,0 +1,586 @@ +""" +Santiment-Crypto Features Merger +=============================== + +This script merges the Santiment merged features with the existing crypto features. +It reads data/santiment/merged_features.parquet and data/merged/features/crypto_features.parquet, +aligns them by symbol and datetime, and creates a unified feature set. 
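+The merged result is written back to data/merged/features/crypto_features.parquet (replacing
+the existing crypto features file in place), with a ±1 hour tolerance when matching timestamps.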
+ +Features: +- Loads Santiment merged features (parquet) +- Loads existing crypto features (parquet) +- Symbol alignment and normalization +- Time-based merging with tolerance +- Feature name conflict resolution +- Creates a unified feature set + +Author: AI Assistant +Date: August 2025 +""" + +import os +import sys +import pandas as pd +import numpy as np +import pickle +from pathlib import Path +from datetime import datetime, timedelta +import logging +from typing import List, Dict, Optional, Tuple, Union + +# Resolve data directory base +try: +    from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: +    try: +        from config import DATA_DIR as CFG_DATA_DIR +    except Exception: +        CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: +    p = Path(path_like) +    if p.is_absolute(): +        return p +    parts = p.parts +    if parts and parts[0].lower() == "data": +        rel = Path(*parts[1:]) if len(parts) > 1 else Path() +    else: +        rel = p +    return Path(CFG_DATA_DIR) / rel + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +class SantimentCryptoMerger: +    """ +    Merger for combining Santiment features with existing crypto features +    """ + +    def __init__(self, +                 santiment_file: str = "data/santiment/merged_features.parquet", +                 crypto_file: str = "data/merged/features/crypto_features.parquet", +                 output_file: str = "data/merged/features/crypto_features.parquet", +                 time_tolerance_hours: int = 1): +        """ +        Initialize the merger + +        Args: +            santiment_file: Path to original Santiment merged features parquet file +            crypto_file: Path to original crypto features file (crypto_features.parquet) +            output_file: Path for the final merged output file (will replace crypto_features.parquet) +            time_tolerance_hours: Time tolerance for merging (hours) +        """ +        self.santiment_file = _resolve_under_data(santiment_file) +        self.crypto_file = _resolve_under_data(crypto_file) +        self.output_file = _resolve_under_data(output_file) +        self.time_tolerance = timedelta(hours=time_tolerance_hours) + +        # Ensure output directory exists +        self.output_file.parent.mkdir(parents=True, exist_ok=True) + +        # Data storage +        self.santiment_data: Optional[pd.DataFrame] = None +        self.crypto_data: Optional[pd.DataFrame] = None +        self.merged_data: Optional[pd.DataFrame] = None + +        # Processing stats +        self.stats = { +            'santiment_records': 0, +            'crypto_records': 0, +            'common_symbols': 0, +            'merged_records': 0, +            'santiment_features': 0, +            'crypto_features': 0, +            'total_features': 0, +            'time_range': {} +        } + +        # Symbol normalizer +        self.symbol_normalizer = self._setup_symbol_normalizer() + +    def _setup_symbol_normalizer(self): +        """Set up the symbol normalization mapping""" +        return { +            # Common crypto symbols +            'bitcoin': 'BTC', 'btc': 'BTC', 'Bitcoin': 'BTC', 'BTC': 'BTC', +            'ethereum': 'ETH', 'eth': 'ETH', 'Ethereum': 'ETH', 'ETH': 'ETH', +            'ripple': 'XRP', 'xrp': 'XRP', 'Ripple': 'XRP', 'XRP': 'XRP', +            'solana': 'SOL', 'sol': 'SOL', 'Solana': 'SOL', 'SOL': 'SOL', +            'cardano': 'ADA', 'ada': 'ADA', 'Cardano': 'ADA', 'ADA': 'ADA', +            'polkadot': 'DOT', 'dot': 'DOT', 'Polkadot': 'DOT', 'DOT': 'DOT', +            'chainlink': 'LINK', 'link': 'LINK', 'Chainlink': 'LINK', 'LINK': 'LINK', +            'litecoin': 'LTC', 'ltc': 'LTC', 'Litecoin': 'LTC', 'LTC': 'LTC', +            'bitcoin-cash': 'BCH', 'bch': 'BCH', 'Bitcoin Cash': 'BCH', 'BCH': 'BCH', +            'stellar': 'XLM', 'xlm': 'XLM', 'Stellar': 'XLM', 'XLM': 'XLM', +            'ethereum-classic': 'ETC', 'etc':
'ETC', 'Ethereum Classic': 'ETC', 'ETC': 'ETC', + 'eos': 'EOS', 'EOS': 'EOS' + } + + def normalize_symbol(self, symbol: str) -> str: + """Normalize a symbol to canonical format""" + if symbol in self.symbol_normalizer: + return self.symbol_normalizer[symbol] + return symbol.upper() + + def load_santiment_data(self) -> bool: + """ + Load original Santiment merged features and apply time-shift logic + + Returns: + True if successful, False otherwise + """ + try: + if not self.santiment_file.exists(): + logger.error(f"Santiment file not found: {self.santiment_file}") + return False + + logger.info(f"Loading Santiment data from {self.santiment_file}") + self.santiment_data = pd.read_parquet(self.santiment_file) + + # Ensure datetime index + if not isinstance(self.santiment_data.index, pd.DatetimeIndex): + if 'datetime' in self.santiment_data.columns: + self.santiment_data.set_index('datetime', inplace=True) + else: + logger.error("No datetime index found in Santiment data") + return False + + # Ensure timezone consistency (convert to UTC) + if self.santiment_data.index.tz is None: + self.santiment_data.index = self.santiment_data.index.tz_localize('UTC') + else: + self.santiment_data.index = self.santiment_data.index.tz_convert('UTC') + + # Normalize symbol column + if 'slug' in self.santiment_data.columns: + self.santiment_data['symbol'] = self.santiment_data['slug'].apply(self.normalize_symbol) + self.santiment_data.drop(columns=['slug'], inplace=True) + elif 'symbol' in self.santiment_data.columns: + self.santiment_data['symbol'] = self.santiment_data['symbol'].apply(self.normalize_symbol) + else: + logger.error("No symbol/slug column found in Santiment data") + return False + + # Add feature prefix to avoid conflicts + feature_cols = [col for col in self.santiment_data.columns if col != 'symbol'] + rename_dict = {col: f"santiment_{col}" for col in feature_cols} + self.santiment_data.rename(columns=rename_dict, inplace=True) + + self.stats['santiment_records'] = len(self.santiment_data) + self.stats['santiment_features'] = len([col for col in self.santiment_data.columns if col != 'symbol']) + + logger.info(f"Loaded Santiment data: {len(self.santiment_data)} records, {len(self.santiment_data.columns)} columns") + logger.info(f"Santiment symbols: {sorted(self.santiment_data['symbol'].unique())}") + logger.info(f"Santiment date range: {self.santiment_data.index.min()} to {self.santiment_data.index.max()}") + + return True + + except Exception as e: + logger.error(f"Failed to load Santiment data: {e}") + return False + + def load_crypto_data(self) -> bool: + """ + Load existing crypto features + + Returns: + True if successful, False otherwise + """ + try: + if not self.crypto_file.exists(): + logger.error(f"Crypto file not found: {self.crypto_file}") + return False + + logger.info(f"Loading crypto data from {self.crypto_file}") + + # Load parquet file + self.crypto_data = pd.read_parquet(self.crypto_file) + + # Don't modify the index - work with interval_timestamp column directly + # The data is already clean and properly formatted from previous pipeline steps + if 'interval_timestamp' not in self.crypto_data.columns: + logger.error("No interval_timestamp column found in crypto data") + return False + + # Check for symbol column + symbol_col = None + for col in ['symbol', 'Symbol', 'ticker', 'asset', 'slug']: + if col in self.crypto_data.columns: + symbol_col = col + break + + if symbol_col is None: + logger.error("No symbol column found in crypto data") + logger.info(f"Available columns: 
{list(self.crypto_data.columns)}") + return False + + # Normalize symbol column + if symbol_col != 'symbol': + self.crypto_data['symbol'] = self.crypto_data[symbol_col] + self.crypto_data.drop(columns=[symbol_col], inplace=True) + + self.crypto_data['symbol'] = self.crypto_data['symbol'].apply(self.normalize_symbol) + + self.stats['crypto_records'] = len(self.crypto_data) + self.stats['crypto_features'] = len([col for col in self.crypto_data.columns if col != 'symbol']) + + logger.info(f"Loaded crypto data: {len(self.crypto_data)} records, {len(self.crypto_data.columns)} columns") + logger.info(f"Crypto symbols: {sorted(self.crypto_data['symbol'].unique())}") + logger.info(f"Crypto date range: {self.crypto_data['interval_timestamp'].min()} to {self.crypto_data['interval_timestamp'].max()}") + + return True + + except Exception as e: + logger.error(f"Failed to load crypto data: {e}") + return False + + def apply_time_shift_merge(self, crypto_df, santiment_df, symbol): + """ + Apply time-shifted merge for a specific symbol using day-of-week matching + This function preserves ALL crypto records and adds Santiment features where possible + + Args: + crypto_df: Crypto data for one symbol + santiment_df: Santiment data for one symbol + symbol: Symbol being processed + + Returns: + Merged DataFrame with ALL crypto records plus Santiment features + """ + logger.info(f" Time-shift merging {len(crypto_df)} crypto records for {symbol}") + + # Start with all crypto records + result_df = crypto_df.copy() + + # Initialize all Santiment columns with NaN + for col in santiment_df.columns: + if col != 'symbol': + result_df[col] = np.nan + + # For each crypto record, try to find a matching Santiment record + for crypto_idx, crypto_row in crypto_df.iterrows(): + # Convert crypto timestamp to datetime for comparison + crypto_timestamp_ms = crypto_row['interval_timestamp'] + crypto_time = pd.to_datetime(crypto_timestamp_ms, unit='ms', utc=True) + + # Find Santiment records with same day-of-week and similar time + santiment_same_weekday = santiment_df[ + santiment_df.index.dayofweek == crypto_time.dayofweek + ] + + if not santiment_same_weekday.empty: + # Find closest time-of-day match + crypto_time_of_day = crypto_time.time() + + time_diffs = santiment_same_weekday.index.map( + lambda x: abs((x.time().hour * 60 + x.time().minute) - + (crypto_time_of_day.hour * 60 + crypto_time_of_day.minute)) + ) + + closest_idx = time_diffs.argmin() + closest_idx = santiment_same_weekday.index[closest_idx] + santiment_row = santiment_same_weekday.loc[closest_idx] + + # Update the result DataFrame with Santiment features for this record + for col in santiment_df.columns: + if col != 'symbol': + result_df.loc[crypto_idx, col] = santiment_row[col] + + logger.info(f" Preserved all {len(result_df)} crypto records for {symbol}") + + # Count how many got Santiment data + santiment_cols = [col for col in santiment_df.columns if col != 'symbol'] + if santiment_cols: + non_null_count = result_df[santiment_cols[0]].notna().sum() + logger.info(f" Added Santiment features to {non_null_count}/{len(result_df)} records ({non_null_count/len(result_df)*100:.1f}%)") + + return result_df + def merge_datasets(self) -> bool: + """ + Merge Santiment and crypto datasets using time-shift logic + + Returns: + True if successful, False otherwise + """ + try: + if self.santiment_data is None or self.crypto_data is None: + logger.error("Both datasets must be loaded before merging") + return False + + logger.info("Starting time-shifted merge process...") 
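+
+            # Descriptive note: the time-shift merge below keeps every crypto row and
+            # attaches the Santiment snapshot that falls on the same day of the week
+            # with the closest time of day, even if it comes from a different calendar
+            # week (see apply_time_shift_merge). For example, a crypto bar stamped
+            # Tuesday 14:05 UTC is paired with the Tuesday Santiment record nearest to
+            # 14:05; rows with no same-weekday match keep NaN Santiment features.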
+ + # Check date ranges + # Convert crypto interval_timestamp to datetime for comparison + try: + crypto_timestamps = pd.to_datetime(self.crypto_data['interval_timestamp'], unit='ms', utc=True) + crypto_start, crypto_end = crypto_timestamps.min(), crypto_timestamps.max() + sant_start, sant_end = self.santiment_data.index.min(), self.santiment_data.index.max() + + logger.info(f"Crypto date range: {crypto_start} to {crypto_end}") + logger.info(f"Santiment date range: {sant_start} to {sant_end}") + except Exception as e: + logger.warning(f"Could not calculate date ranges for comparison: {e}") + # Use simple range instead + crypto_start = crypto_end = None + sant_start, sant_end = self.santiment_data.index.min(), self.santiment_data.index.max() + logger.info(f"Santiment date range: {sant_start} to {sant_end}") + + # Check for overlap + if crypto_start and crypto_end: + overlap = (crypto_start <= sant_end) and (sant_start <= crypto_end) + if not overlap: + logger.warning("No date overlap detected - using time-shift merge strategy") + else: + logger.warning("Using time-shift merge strategy (date comparison skipped)") + + # Find common symbols + santiment_symbols = set(self.santiment_data['symbol'].unique()) + crypto_symbols = set(self.crypto_data['symbol'].unique()) + common_symbols = santiment_symbols & crypto_symbols + + self.stats['common_symbols'] = len(common_symbols) + + logger.info(f"Common symbols found: {len(common_symbols)} - {sorted(common_symbols)}") + + if not common_symbols: + logger.error("No common symbols found between datasets") + # Fallback: produce crypto-only dataset with santiment_* columns as NaN + logger.info("Falling back to crypto-only merged output with empty Santiment features") + crypto_only = self.crypto_data.copy() + # If santiment_data is present but symbols mismatch, create placeholder santiment columns + sant_cols = [] + if self.santiment_data is not None: + sant_cols = [col for col in self.santiment_data.columns if col != 'symbol'] + # Prefix and add NaN columns + for col in sant_cols: + crypto_only[col] = np.nan + # Ensure we keep interval_timestamp and symbol ordering + self.merged_data = crypto_only.reset_index(drop=True) + self.stats['merged_records'] = len(self.merged_data) + self.stats['total_features'] = len([c for c in self.merged_data.columns if c != 'symbol']) + start_time = pd.to_datetime(self.merged_data['interval_timestamp'].min(), unit='ms', utc=True) + end_time = pd.to_datetime(self.merged_data['interval_timestamp'].max(), unit='ms', utc=True) + self.stats['time_range'] = { + 'start': str(start_time), + 'end': str(end_time), + 'total_days': (end_time - start_time).days + } + return True + + # Process each common symbol with time-shift merge + merged_parts = [] + total_merged_records = 0 + + for symbol in common_symbols: + logger.info(f"Processing {symbol} with time-shift merge...") + + sant_symbol = self.santiment_data[self.santiment_data['symbol'] == symbol].copy() + crypto_symbol = self.crypto_data[self.crypto_data['symbol'] == symbol].copy() + + if crypto_symbol.empty: + logger.warning(f"Skipping {symbol} - no crypto data") + continue + + if sant_symbol.empty: + logger.warning(f"No Santiment data for {symbol} - adding with null Santiment features") + # Add null Santiment columns to crypto data + sant_cols = [col for col in self.santiment_data.columns if col != 'symbol'] + for col in sant_cols: + crypto_symbol[col] = np.nan + # Reset index to avoid conflicts + crypto_symbol = crypto_symbol.reset_index(drop=True) + 
merged_parts.append(crypto_symbol) + total_merged_records += len(crypto_symbol) + else: + # Apply time-shift merge + merged_symbol = self.apply_time_shift_merge(crypto_symbol, sant_symbol, symbol) + # Reset index to avoid conflicts + merged_symbol = merged_symbol.reset_index(drop=True) + merged_parts.append(merged_symbol) + total_merged_records += len(merged_symbol) + + logger.info(f" Processed {len(crypto_symbol)} crypto records for {symbol}") + + # Add crypto-only symbols (without Santiment features) + crypto_only_symbols = crypto_symbols - common_symbols + for symbol in crypto_only_symbols: + logger.info(f"Adding crypto-only symbol: {symbol}") + crypto_only = self.crypto_data[self.crypto_data['symbol'] == symbol].copy() + + # Add null Santiment columns + sant_cols = [col for col in self.santiment_data.columns if col != 'symbol'] + for col in sant_cols: + crypto_only[col] = np.nan + + # Reset index to avoid conflicts + crypto_only = crypto_only.reset_index(drop=True) + merged_parts.append(crypto_only) + total_merged_records += len(crypto_only) + + # Combine all parts + if merged_parts: + self.merged_data = pd.concat(merged_parts, axis=0, ignore_index=True) + # Sort by interval_timestamp instead of index + self.merged_data = self.merged_data.sort_values('interval_timestamp') + + self.stats['merged_records'] = len(self.merged_data) + self.stats['total_features'] = len([col for col in self.merged_data.columns if col != 'symbol']) + + # Update time range using interval_timestamp + start_time = pd.to_datetime(self.merged_data['interval_timestamp'].min(), unit='ms', utc=True) + end_time = pd.to_datetime(self.merged_data['interval_timestamp'].max(), unit='ms', utc=True) + self.stats['time_range'] = { + 'start': str(start_time), + 'end': str(end_time), + 'total_days': (end_time - start_time).days + } + + logger.info(f"Total crypto records processed: {total_merged_records}") + logger.info("Time-shifted merge completed successfully!") + return True + else: + logger.error("No data to merge") + return False + + except Exception as e: + logger.error(f"Failed to merge datasets: {e}") + return False + + def save_merged_data(self) -> bool: + """ + Save the merged dataset, backing up the original crypto file + + Returns: + True if successful, False otherwise + """ + try: + if self.merged_data is None or self.merged_data.empty: + logger.error("No merged data to save") + return False + + # Backup original crypto file if it exists and is different from output + if self.crypto_file != self.output_file and self.crypto_file.exists(): + backup_file = self.crypto_file.with_suffix('.backup.parquet') + import shutil + shutil.copy2(self.crypto_file, backup_file) + logger.info(f"Backed up original crypto file to: {backup_file}") + + logger.info(f"Saving merged data to {self.output_file}") + + # Save with regular index since we're using interval_timestamp column + # Save as parquet (primary format) - this will replace crypto_features.parquet + self.merged_data.to_parquet(self.output_file, index=False, compression='snappy') + + # Don't create pickle file to avoid clutter + # pickle_file = self.output_file.with_suffix('.pkl') + # with open(pickle_file, 'wb') as f: + # pickle.dump(self.merged_data, f) + + logger.info(f"Merged data saved successfully!") + logger.info(f"Enhanced crypto file: {self.output_file}") + # logger.info(f"Pickle file: {pickle_file}") + + return True + + except Exception as e: + logger.error(f"Failed to save merged data: {e}") + return False + + def print_summary(self): + """Print merge summary""" 
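+        # Note: the figures printed here come from self.stats, which is populated by
+        # load_santiment_data(), load_crypto_data() and merge_datasets(); calling this
+        # before a merge has run simply reports zero counts and an empty time range.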
+ print("\n" + "="*70) + print("SANTIMENT-CRYPTO MERGER SUMMARY") + print("="*70) + + print(f"\nInput Data:") + print(f" Santiment records: {self.stats['santiment_records']:,}") + print(f" Santiment features: {self.stats['santiment_features']}") + print(f" Crypto records: {self.stats['crypto_records']:,}") + print(f" Crypto features: {self.stats['crypto_features']}") + + print(f"\nMerge Results:") + print(f" Common symbols: {self.stats['common_symbols']}") + print(f" Final records: {self.stats['merged_records']:,}") + print(f" Total features: {self.stats['total_features']}") + + if self.stats['time_range']: + print(f"\nTime Range:") + print(f" Start: {self.stats['time_range']['start']}") + print(f" End: {self.stats['time_range']['end']}") + print(f" Total days: {self.stats['time_range']['total_days']}") + + if self.merged_data is not None: + print(f"\nFinal Dataset:") + print(f" Memory usage: {self.merged_data.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB") + print(f" Null percentage: {(self.merged_data.isnull().sum().sum() / (len(self.merged_data) * len(self.merged_data.columns))) * 100:.2f}%") + + # Show symbol distribution + symbol_dist = self.merged_data['symbol'].value_counts() + print(f"\nSymbol Distribution:") + for symbol, count in symbol_dist.head(10).items(): + print(f" {symbol}: {count:,} records") + + print("="*70) + + def run_merge(self) -> bool: + """ + Run the complete merge process + + Returns: + True if successful, False otherwise + """ + try: + logger.info("Starting Santiment-Crypto merge process...") + + # Load data + sant_ok = self.load_santiment_data() + crypto_ok = self.load_crypto_data() + + if not crypto_ok: + return False + if not sant_ok: + logger.warning("Proceeding without Santiment data; emitting crypto-only output") + self.merged_data = self.crypto_data.copy() + # Save results immediately + if not self.save_merged_data(): + return False + self.print_summary() + logger.info("Santiment-Crypto merge completed successfully with crypto-only output") + return True + + # Merge datasets + if not self.merge_datasets(): + return False + + # Save results + if not self.save_merged_data(): + return False + + # Print summary + self.print_summary() + + logger.info("Santiment-Crypto merge completed successfully!") + return True + + except Exception as e: + logger.error(f"Merge process failed: {e}") + return False + + +def main(): + """Main function""" + merger = SantimentCryptoMerger( + santiment_file="data/santiment/merged_features.parquet", + # crypto_file="data/merged/features/crypto_features.parquet", + output_file="data/merged/features/crypto_features.parquet", # Replace original file + time_tolerance_hours=1 + ) + + success = merger.run_merge() + return success + + +if __name__ == "__main__": + main() diff --git a/src/merge/merge_temp.py b/src/merge/merge_temp.py new file mode 100644 index 0000000000000000000000000000000000000000..91f0c927385fa797635d6a631a11dee96b297871 --- /dev/null +++ b/src/merge/merge_temp.py @@ -0,0 +1,444 @@ +import pandas as pd +import os +import numpy as np +from datetime import datetime, timedelta + +DAYS_OLD = 7 +MERGED_DIR = "data/merged/features" +TEMP_DIR = "data/merged/temp" + +# Helper: safely cast a value to match a target column's dtype (e.g., drop tz on datetimes) +def _cast_value_for_column(target_series: pd.Series, value): + try: + # If target is datetime64[ns], ensure assigned value is tz-naive + if pd.api.types.is_datetime64_any_dtype(target_series.dtype): + v = pd.to_datetime(value, errors='coerce', utc=True) + if 
isinstance(v, pd.Timestamp): + return v.tz_localize(None) + return v + return value + except Exception: + return value + +def fill_nulls_from_temp(df_merged, df_temp): + """ + Fill null values in df_merged using non-null values from df_temp + for the same symbol + interval_timestamp combination. + Returns the number of null values filled. + """ + nulls_filled = 0 + + if df_merged.empty or df_temp.empty: + return nulls_filled + + # Create lookup key for efficient matching + key_cols = ["symbol", "interval_timestamp"] + + # Check if key columns exist in both dataframes + if not all(col in df_merged.columns for col in key_cols): + print("[WARN] Key columns missing in merged data, skipping null filling") + return nulls_filled + if not all(col in df_temp.columns for col in key_cols): + print("[WARN] Key columns missing in temp data, skipping null filling") + return nulls_filled + + # Create a lookup dictionary from temp data + # Format: {(symbol, timestamp): {column: value, ...}} + temp_lookup = {} + for _, row in df_temp.iterrows(): + key = (row['symbol'], row['interval_timestamp']) + temp_lookup[key] = row.to_dict() + + # Find common columns between merged and temp (excluding keys) + common_cols = [col for col in df_merged.columns + if col in df_temp.columns and col not in key_cols] + + if not common_cols: + print("[WARN] No common columns found for null filling") + return nulls_filled + + # Track columns with null values before processing + null_cols_before = [] + for col in common_cols: + if df_merged[col].isnull().any(): + null_cols_before.append(col) + + if not null_cols_before: + print("[INFO] No null values found in common columns") + return nulls_filled + + print(f"[INFO] Attempting to fill nulls in {len(null_cols_before)} columns: {null_cols_before}") + + # Fill null values row by row + for idx, row in df_merged.iterrows(): + key = (row['symbol'], row['interval_timestamp']) + + # Check if we have corresponding temp data for this key + if key in temp_lookup: + temp_row = temp_lookup[key] + + # Fill null values for each column + for col in null_cols_before: + try: + # Use more robust null checking to handle arrays/scalars + row_val = row[col] + temp_val = temp_row.get(col) + + # Check if row value is null (handle both scalar and array cases) + row_is_null = pd.isnull(row_val) + if hasattr(row_is_null, '__len__') and len(row_is_null) > 1: + row_is_null = row_is_null.any() # For arrays, check if any are null + + # Check if temp value is not null + temp_is_not_null = not pd.isnull(temp_val) + if hasattr(temp_is_not_null, '__len__') and len(temp_is_not_null) > 1: + temp_is_not_null = temp_is_not_null.all() # For arrays, check if all are not null + + if row_is_null and temp_is_not_null: + # Fill the null value with dtype-compatible casting + df_merged.at[idx, col] = _cast_value_for_column(df_merged[col], temp_val) + nulls_filled += 1 + except Exception as e: + # Skip problematic columns with a warning + print(f"[WARN] Could not process column '{col}' for null filling: {e}") + continue + + if nulls_filled > 0: + print(f"[INFO] Successfully filled {nulls_filled} null values from temp data") + + # Report which columns were improved + for col in null_cols_before: + nulls_remaining = df_merged[col].isnull().sum() + print(f"[INFO] Column '{col}': {nulls_remaining} nulls remaining") + + return nulls_filled + +# Helper to filter new records (DISABLED - now keeps ALL data for accumulative merging) +def filter_new(df): + # IMPORTANT: Return ALL data instead of filtering by days + # This ensures 
accumulative merging from day one + return df.copy() + +def merge_temp_to_merged(temp_name, merged_name): + temp_path = os.path.join(TEMP_DIR, temp_name) + merged_path = os.path.join(MERGED_DIR, merged_name) + if not os.path.exists(temp_path): + print(f"[WARN] Temp file missing: {temp_path}") + return + if not os.path.exists(merged_path): + print(f"[WARN] Merged file missing: {merged_path}") + return + + df_temp = pd.read_parquet(temp_path) + df_merged = pd.read_parquet(merged_path) + + # Check if required columns exist + required_cols = ["symbol", "interval_timestamp"] + missing_cols_temp = [col for col in required_cols if col not in df_temp.columns] + missing_cols_merged = [col for col in required_cols if col not in df_merged.columns] + + if missing_cols_temp: + print(f"[ERROR] Missing columns in temp file {temp_name}: {missing_cols_temp}") + print(f"[INFO] Available columns in temp: {list(df_temp.columns)}") + return + + if missing_cols_merged: + print(f"[ERROR] Missing columns in merged file {merged_name}: {missing_cols_merged}") + print(f"[INFO] Available columns in merged: {list(df_merged.columns)}") + return + + new_temp = filter_new(df_temp) + + # Step 1: Fill null values in merged data using temp data for same symbol+timestamp + nulls_filled = fill_nulls_from_temp(df_merged, df_temp) + + # Step 2: Only add truly new rows (not already in merged) + key_cols = ["symbol", "interval_timestamp"] + merged_keys = set(tuple(row) for row in df_merged[key_cols].values) + new_rows = new_temp[~new_temp[key_cols].apply(tuple, axis=1).isin(merged_keys)] + + if new_rows.empty and nulls_filled == 0: + print(f"[INFO] No new records to add from {temp_name} and no nulls filled") + return + + df_final = pd.concat([df_merged, new_rows], ignore_index=True) + df_final.to_parquet(merged_path, index=False) + print(f"[OK] Added {len(new_rows)} new records from {temp_name} to {merged_name}, filled {nulls_filled} null values") + +def merge_all_to_train(archive_name, features_name, temp_name, train_name): + """ + Merge archive, features, and temp files into a deduplicated train file under merge/train/. + Uniqueness is enforced on (symbol, interval_timestamp). + Also performs null filling between different sources. + """ + ARCHIVE_DIR = os.path.join(MERGED_DIR, "archive") + TRAIN_DIR = os.path.join("data", "merged", "train") + os.makedirs(TRAIN_DIR, exist_ok=True) + features_path = os.path.join(MERGED_DIR, features_name) + temp_path = os.path.join(TEMP_DIR, temp_name) + train_path = os.path.join(TRAIN_DIR, train_name) + + dfs = [] + df_sources = {} # Track which dataframe came from which source + + # 1. Read all relevant archive files (recursively) + archive_dfs = [] + if os.path.isdir(ARCHIVE_DIR): + for root, dirs, files in os.walk(ARCHIVE_DIR): + for fname in files: + # Only include files matching the asset (e.g., crypto_features_archived_*.parquet) + if fname.startswith(archive_name.replace('.parquet', '_archived_')) and fname.endswith('.parquet'): + fpath = os.path.join(root, fname) + try: + archive_dfs.append(pd.read_parquet(fpath)) + except Exception as e: + print(f"[WARN] Could not read archive file {fpath}: {e}") + if archive_dfs: + df_archive = pd.concat(archive_dfs, ignore_index=True) + dfs.append(df_archive) + df_sources['archive'] = df_archive + else: + print(f"[WARN] No archive files found for {archive_name}") + + # 2. 
Read features and temp + if os.path.exists(features_path): + df_features = pd.read_parquet(features_path) + dfs.append(df_features) + df_sources['features'] = df_features + else: + print(f"[WARN] Missing: {features_path}") + + if os.path.exists(temp_path): + df_temp = pd.read_parquet(temp_path) + dfs.append(df_temp) + df_sources['temp'] = df_temp + else: + print(f"[WARN] Missing: {temp_path}") + + if not dfs: + print("[ERROR] No input files found.") + return + + # 3. Merge all data + df_all = pd.concat(dfs, ignore_index=True) + + # 4. Before deduplication, try to fill nulls using data from different sources + total_nulls_filled = 0 + if len(df_sources) > 1: + print(f"[INFO] Attempting cross-source null filling for {train_name}") + + # Create a comprehensive lookup from all sources + all_data_lookup = {} + for source_name, df_source in df_sources.items(): + for _, row in df_source.iterrows(): + key = (row['symbol'], row['interval_timestamp']) + if key not in all_data_lookup: + all_data_lookup[key] = {} + + # Add non-null values from this source + for col in df_source.columns: + try: + # Use more robust null checking to handle arrays/scalars + col_val = row[col] + + # Check if value is not null (handle both scalar and array cases) + is_not_null = not pd.isnull(col_val) + if hasattr(is_not_null, '__len__') and len(is_not_null) > 1: + is_not_null = is_not_null.all() # For arrays, check if all are not null + + if is_not_null: + all_data_lookup[key][col] = col_val + except Exception as e: + # Skip problematic columns with a warning + print(f"[WARN] Could not process column '{col}' for train lookup: {e}") + continue + + # Fill nulls in the combined dataframe + for idx, row in df_all.iterrows(): + key = (row['symbol'], row['interval_timestamp']) + if key in all_data_lookup: + lookup_row = all_data_lookup[key] + for col in df_all.columns: + try: + # Use more robust null checking + row_val = row[col] + + # Check if row value is null (handle both scalar and array cases) + row_is_null = pd.isnull(row_val) + if hasattr(row_is_null, '__len__') and len(row_is_null) > 1: + row_is_null = row_is_null.any() # For arrays, check if any are null + + if row_is_null and col in lookup_row: + df_all.at[idx, col] = _cast_value_for_column(df_all[col], lookup_row[col]) + total_nulls_filled += 1 + except Exception as e: + # Skip problematic columns with a warning + print(f"[WARN] Could not process column '{col}' for train null filling: {e}") + continue + + # 5. Deduplicate by symbol+interval_timestamp, keeping the last occurrence + df_all = df_all.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last") + + # 6. 
Handle problematic columns that can't be serialized to parquet + problematic_cols = [] + for col in df_all.columns: + try: + # Test if column can be converted to parquet-compatible format + sample = df_all[col].iloc[0] if len(df_all) > 0 else None + if sample is not None and hasattr(sample, '__len__') and not isinstance(sample, str): + # If it's an array-like object (but not string), it might cause issues + if len(sample) > 1: # Multi-dimensional array + problematic_cols.append(col) + except: + # If we can't even check the sample, it's definitely problematic + problematic_cols.append(col) + + if problematic_cols: + print(f"[WARN] Dropping problematic columns that can't be serialized: {problematic_cols}") + df_all = df_all.drop(columns=problematic_cols) + + # Save to parquet + df_all.to_parquet(train_path, index=False) + + if total_nulls_filled > 0: + print(f"[OK] Created train file: {train_path} with {len(df_all)} records, filled {total_nulls_filled} nulls") + else: + print(f"[OK] Created train file: {train_path} with {len(df_all)} records.") + +def create_merged_features(): + """ + Create the main merged_features.parquet file by combining crypto and stock features + with intelligent null filling between the two datasets. + """ + crypto_path = os.path.join(MERGED_DIR, "crypto_features.parquet") + stocks_path = os.path.join(MERGED_DIR, "stocks_features.parquet") + merged_path = os.path.join(MERGED_DIR, "merged_features.parquet") + + dfs_to_merge = [] + + # Read crypto features + if os.path.exists(crypto_path): + df_crypto = pd.read_parquet(crypto_path) + dfs_to_merge.append(('crypto', df_crypto)) + print(f"[INFO] Loaded crypto features: {len(df_crypto)} rows, {len(df_crypto.columns)} columns") + else: + print(f"[WARN] Crypto features not found: {crypto_path}") + + # Read stock features + if os.path.exists(stocks_path): + df_stocks = pd.read_parquet(stocks_path) + dfs_to_merge.append(('stocks', df_stocks)) + print(f"[INFO] Loaded stock features: {len(df_stocks)} rows, {len(df_stocks.columns)} columns") + else: + print(f"[WARN] Stock features not found: {stocks_path}") + + if not dfs_to_merge: + print("[ERROR] No feature files found to merge") + return + + if len(dfs_to_merge) == 1: + # Only one dataset available, just copy it + df_merged = dfs_to_merge[0][1].copy() + print(f"[INFO] Only {dfs_to_merge[0][0]} features available") + else: + # Multiple datasets - merge with null filling + print("[INFO] Merging crypto and stock features with cross-dataset null filling") + + # Combine all dataframes + all_dfs = [df for _, df in dfs_to_merge] + df_merged = pd.concat(all_dfs, ignore_index=True, sort=False) + + # Perform cross-dataset null filling + # Create lookup from all datasets for same symbol+timestamp + lookup_data = {} + for dataset_name, df in dfs_to_merge: + for _, row in df.iterrows(): + key = (row['symbol'], row['interval_timestamp']) + if key not in lookup_data: + lookup_data[key] = {} + + # Add non-null values from this dataset + for col in df.columns: + try: + # Use more robust null checking to handle arrays/scalars + col_val = row[col] + + # Check if value is not null (handle both scalar and array cases) + is_not_null = not pd.isnull(col_val) + if hasattr(is_not_null, '__len__') and len(is_not_null) > 1: + is_not_null = is_not_null.all() # For arrays, check if all are not null + + if is_not_null: + lookup_data[key][col] = col_val + except Exception as e: + # Skip problematic columns with a warning + print(f"[WARN] Could not process column '{col}' for lookup: {e}") + continue + + # 
Fill nulls using the comprehensive lookup + nulls_filled = 0 + for idx, row in df_merged.iterrows(): + key = (row['symbol'], row['interval_timestamp']) + if key in lookup_data: + lookup_row = lookup_data[key] + for col in df_merged.columns: + try: + # Use more robust null checking + row_val = row[col] + + # Check if row value is null (handle both scalar and array cases) + row_is_null = pd.isnull(row_val) + if hasattr(row_is_null, '__len__') and len(row_is_null) > 1: + row_is_null = row_is_null.any() # For arrays, check if any are null + + if row_is_null and col in lookup_row: + df_merged.at[idx, col] = _cast_value_for_column(df_merged[col], lookup_row[col]) + nulls_filled += 1 + except Exception as e: + # Skip problematic columns with a warning + print(f"[WARN] Could not process column '{col}' for null filling: {e}") + continue + + if nulls_filled > 0: + print(f"[INFO] Cross-dataset null filling: {nulls_filled} values filled") + + # Remove duplicates if any (keeping last occurrence) + initial_len = len(df_merged) + df_merged = df_merged.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last") + final_len = len(df_merged) + + if initial_len != final_len: + print(f"[INFO] Removed {initial_len - final_len} duplicate records") + + # Save merged features + df_merged.to_parquet(merged_path, index=False) + print(f"[OK] Created merged features: {merged_path} with {len(df_merged)} rows, {len(df_merged.columns)} columns") + + # Report statistics + nulls_remaining = df_merged.isnull().sum().sum() + print(f"[INFO] Merged features null count: {nulls_remaining}") + + # Report symbol breakdown + if 'symbol' in df_merged.columns: + symbol_counts = df_merged['symbol'].value_counts() + print(f"[INFO] Top symbols: {dict(symbol_counts.head(10))}") + +def main(): + import sys + + # Check if this is being run as a test + if len(sys.argv) > 1 and sys.argv[1] == '--test-null-filling': + from test_null_filling_merge import main as test_main + sys.exit(test_main()) + + merge_temp_to_merged("crypto_features.parquet", "crypto_features.parquet") + merge_temp_to_merged("stocks_features.parquet", "stocks_features.parquet") + + # Create the main merged features file + create_merged_features() + + merge_all_to_train("crypto_features.parquet", "crypto_features.parquet", "crypto_features.parquet", "crypto_features_train.parquet") + merge_all_to_train("stocks_features.parquet", "stocks_features.parquet", "stocks_features.parquet", "stocks_features_train.parquet") + +if __name__ == "__main__": + main() diff --git a/src/merge/norm/crypto.py b/src/merge/norm/crypto.py new file mode 100644 index 0000000000000000000000000000000000000000..6f24497b9423a52ea8b77b32e1e47489e6e7ca86 --- /dev/null +++ b/src/merge/norm/crypto.py @@ -0,0 +1,618 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, PowerTransformer +import json +import pickle +from datetime import datetime +import warnings +warnings.filterwarnings('ignore') +import os + +class CryptoDataNormalizer: + """ + Enhanced normalization pipeline for cryptocurrency features data with crypto-specific handling + """ + + def __init__(self, preserve_symbol=True, handle_outliers=True, feature_engineering=True): + self.scalers = {} + self.encoders = {} + self.feature_info = {} + self.is_fitted = False + self.preserve_symbol = preserve_symbol + self.handle_outliers = handle_outliers + self.feature_engineering = feature_engineering + self.outlier_bounds = {} + + def _detect_outliers(self, df, 
column): + """Detect outliers using IQR method""" + Q1 = df[column].quantile(0.25) + Q3 = df[column].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + return lower_bound, upper_bound + + def _handle_outliers(self, df, column, method='clip'): + """Handle outliers in numerical data""" + if column not in self.outlier_bounds: + lower_bound, upper_bound = self._detect_outliers(df, column) + self.outlier_bounds[column] = (lower_bound, upper_bound) + else: + lower_bound, upper_bound = self.outlier_bounds[column] + + if method == 'clip': + return df[column].clip(lower_bound, upper_bound) + elif method == 'remove': + return df[column].where((df[column] >= lower_bound) & (df[column] <= upper_bound)) + return df[column] + + def _categorize_features(self, df): + """Enhanced feature categorization for crypto data""" + # Core identification features + id_features = ['symbol', 'backup_id', '__index_level_0__', 'cg_id'] + + # Timestamp features + timestamp_features = [col for col in df.columns if 'timestamp' in col.lower()] + + # Binary features (0/1, True/False, or boolean-like) + binary_features = [] + for col in df.columns: + if col not in id_features + timestamp_features: + unique_vals = set(df[col].dropna().unique()) + if (df[col].dtype == bool or + (len(unique_vals) <= 2 and unique_vals.issubset({0, 1, True, False, np.nan})) or + col in ['stable']): + binary_features.append(col) + + # Categorical features (strings, objects, or low cardinality integers) + categorical_features = [] + for col in df.columns: + if (col not in id_features + binary_features + timestamp_features and + (df[col].dtype == 'object' or + df[col].dtype.name == 'category' or + (df[col].nunique() < 20 and df[col].dtype in ['int64', 'int32']))): + categorical_features.append(col) + + # Crypto-specific features + crypto_specific_features = [] + crypto_keywords = ['dominance', 'rank'] + for col in df.columns: + if any(keyword in col.lower() for keyword in crypto_keywords): + if col not in id_features + timestamp_features + binary_features + categorical_features: + crypto_specific_features.append(col) + + # Price/volume/market features + price_volume_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['price', 'volume', 'marketcap', 'open']): + if col not in id_features + timestamp_features + binary_features + categorical_features + crypto_specific_features: + price_volume_features.append(col) + + # Exchange price features + exchange_features = [] + for col in df.columns: + if col.startswith('exchangePrices.'): + exchange_features.append(col) + + # Performance features + performance_features = [] + for col in df.columns: + if col.startswith('performance.'): + performance_features.append(col) + + # Rank difference features + rank_diff_features = [] + for col in df.columns: + if col.startswith('rankDiffs.'): + rank_diff_features.append(col) + + # Technical indicator features + technical_features = [] + tech_keywords = ['rsi', 'macd', 'ema', 'sma', 'bb_', 'cci', 'mfi', 'atr', 'stoch', 'roc'] + for col in df.columns: + if any(keyword in col.lower() for keyword in tech_keywords): + if col not in (id_features + timestamp_features + binary_features + categorical_features + + crypto_specific_features + price_volume_features + exchange_features + + performance_features + rank_diff_features): + technical_features.append(col) + + # Social sentiment features + social_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['social', 
'sentiment', 'confidence', 'pos', 'neg', 'neu']): + if col not in (id_features + timestamp_features + binary_features + categorical_features + + crypto_specific_features + price_volume_features + exchange_features + + performance_features + rank_diff_features + technical_features): + social_features.append(col) + + # Transaction/blockchain features + transaction_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['transaction', 'tx_', 'gas', 'fees']): + if col not in (id_features + timestamp_features + binary_features + categorical_features + + crypto_specific_features + price_volume_features + exchange_features + + performance_features + rank_diff_features + technical_features + social_features): + transaction_features.append(col) + + # Data quality features + quality_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['completeness', 'quality', 'correlation']): + if col not in (id_features + timestamp_features + binary_features + categorical_features + + crypto_specific_features + price_volume_features + exchange_features + + performance_features + rank_diff_features + technical_features + + social_features + transaction_features): + quality_features.append(col) + + # Remaining numerical features + numerical_features = [] + all_categorized = (id_features + timestamp_features + binary_features + categorical_features + + crypto_specific_features + price_volume_features + exchange_features + + performance_features + rank_diff_features + technical_features + + social_features + transaction_features + quality_features) + + for col in df.columns: + if (col not in all_categorized and + pd.api.types.is_numeric_dtype(df[col])): + numerical_features.append(col) + + return { + 'id_features': id_features, + 'timestamp_features': timestamp_features, + 'binary_features': binary_features, + 'categorical_features': categorical_features, + 'crypto_specific_features': crypto_specific_features, + 'price_volume_features': price_volume_features, + 'exchange_features': exchange_features, + 'performance_features': performance_features, + 'rank_diff_features': rank_diff_features, + 'technical_features': technical_features, + 'social_features': social_features, + 'transaction_features': transaction_features, + 'quality_features': quality_features, + 'numerical_features': numerical_features + } + + def _engineer_crypto_features(self, df, normalized_df): + """Create crypto-specific engineered features""" + if not self.feature_engineering: + return normalized_df + + # Exchange price spread analysis + exchange_cols = [col for col in df.columns if col.startswith('exchangePrices.')] + if len(exchange_cols) > 1: + exchange_prices = df[exchange_cols].replace([np.inf, -np.inf], np.nan) + if not exchange_prices.empty and exchange_prices.notna().any().any(): + price_mean = exchange_prices.mean(axis=1) + price_max = exchange_prices.max(axis=1) + price_min = exchange_prices.min(axis=1) + price_std = exchange_prices.std(axis=1) + + # Only calculate if we have valid data + valid_mask = (price_mean > 0) & price_mean.notna() + if valid_mask.any(): + normalized_df['exchange_price_spread'] = ((price_max - price_min) / price_mean).fillna(0) + normalized_df['exchange_price_std'] = (price_std / price_mean).fillna(0) + + # Performance momentum + perf_short_cols = [col for col in df.columns if col.startswith('performance.') and any(timeframe in col for timeframe in ['min1', 'min5', 'min15', 'hour'])] + perf_long_cols = [col for col in df.columns if 
col.startswith('performance.') and any(timeframe in col for timeframe in ['day', 'week', 'month'])] + + if perf_short_cols: + short_perf = df[perf_short_cols].replace([np.inf, -np.inf], np.nan) + normalized_df['short_term_momentum'] = short_perf.mean(axis=1).fillna(0) + if perf_long_cols: + long_perf = df[perf_long_cols].replace([np.inf, -np.inf], np.nan) + normalized_df['long_term_momentum'] = long_perf.mean(axis=1).fillna(0) + + # Rank stability + rank_diff_cols = [col for col in df.columns if col.startswith('rankDiffs.')] + if rank_diff_cols: + rank_diffs = df[rank_diff_cols].replace([np.inf, -np.inf], np.nan).fillna(0) + normalized_df['rank_stability'] = 1 / (1 + rank_diffs.abs().sum(axis=1) + 1e-8) # Add small epsilon to avoid division by zero + + # Social sentiment aggregation + social_sentiment_cols = [col for col in df.columns if 'social_sentiment' in col.lower()] + if social_sentiment_cols: + social_data = df[social_sentiment_cols].replace([np.inf, -np.inf], np.nan) + normalized_df['avg_social_sentiment'] = social_data.mean(axis=1).fillna(0.5) # Neutral sentiment + + # Technical strength (similar to stocks but crypto-focused) + tech_cols = [col for col in df.columns if any(tech in col.lower() for tech in ['rsi', 'macd', 'cci'])] + if tech_cols: + tech_data = df[tech_cols].replace([np.inf, -np.inf], np.nan) + normalized_df['technical_strength'] = tech_data.mean(axis=1).fillna(0) + + # Volume-price relationship + if 'volume' in df.columns and 'price' in df.columns: + volume = df['volume'].replace([np.inf, -np.inf], np.nan) + price = df['price'].replace([np.inf, -np.inf], np.nan) + valid_mask = (price > 0) & price.notna() & volume.notna() + if valid_mask.any(): + ratio = volume / price + normalized_df['volume_price_ratio'] = ratio.fillna(0) + + # Market dominance relative to rank + if 'dominance' in df.columns and 'rank' in df.columns: + dominance = df['dominance'].replace([np.inf, -np.inf], np.nan).fillna(0) + rank = df['rank'].replace([np.inf, -np.inf], np.nan).fillna(1000) # High rank for unknown + # Avoid division by zero + rank_reciprocal = 1 / (rank + 1e-8) + normalized_df['dominance_rank_ratio'] = (dominance / rank_reciprocal).fillna(0) + + return normalized_df + + def fit(self, df): + """Fit the normalizer on training data with crypto-specific preprocessing""" + if isinstance(df, dict): + df = pd.DataFrame([df]) + + self.feature_info = self._categorize_features(df) + + # Fit scalers for different feature types + feature_types = { + 'crypto_specific_features': RobustScaler(), # Rank and dominance can have outliers + 'price_volume_features': RobustScaler(), # Price and volume data often has outliers + 'exchange_features': StandardScaler(), # Exchange prices should be similar + 'performance_features': StandardScaler(), # Performance percentages + 'rank_diff_features': StandardScaler(), # Rank differences are usually small integers + 'technical_features': StandardScaler(), # Technical indicators are usually normalized + 'social_features': StandardScaler(), # Sentiment scores + 'transaction_features': PowerTransformer(), # Transaction data can be very skewed + 'quality_features': MinMaxScaler(), # Quality scores are usually 0-1 + 'numerical_features': PowerTransformer() # General numerical features + } + + for feature_type, scaler in feature_types.items(): + features = self.feature_info[feature_type] + if features: + # Filter existing columns + existing_features = [col for col in features if col in df.columns] + if existing_features: + # Handle outliers if enabled + if 
self.handle_outliers and feature_type in ['crypto_specific_features', 'price_volume_features']: + df_clean = df.copy() + for col in existing_features: + df_clean[col] = self._handle_outliers(df_clean, col) + else: + df_clean = df.copy() + + # Comprehensive data cleaning for fitting + try: + # Replace inf/-inf with NaN + df_clean[existing_features] = df_clean[existing_features].replace([np.inf, -np.inf], np.nan) + + # Fill NaN with appropriate strategy based on feature type + if feature_type in ['crypto_specific_features', 'price_volume_features']: + # For price/volume data, use forward fill then median + for col in existing_features: + df_clean[col] = df_clean[col].fillna(method='ffill').fillna(df_clean[col].median()).fillna(0) + elif feature_type in ['performance_features', 'rank_diff_features']: + # Performance and rank diffs can be 0 when no change + df_clean[existing_features] = df_clean[existing_features].fillna(0) + elif feature_type == 'quality_features': + # Quality features should default to reasonable values + df_clean[existing_features] = df_clean[existing_features].fillna(0.5) + else: + # General strategy: median then 0 + for col in existing_features: + df_clean[col] = df_clean[col].fillna(df_clean[col].median()).fillna(0) + + # Ensure no infinite values remain + df_clean[existing_features] = df_clean[existing_features].replace([np.inf, -np.inf], 0) + + # Fit the scaler + scaler.fit(df_clean[existing_features]) + self.scalers[feature_type] = scaler + self.feature_info[f'{feature_type}_existing'] = existing_features + + except Exception as e: + print(f"Warning: Could not fit scaler for {feature_type}: {e}") + # Skip this feature type if fitting fails + continue + + # Fit encoders for categorical features + for col in self.feature_info['categorical_features']: + if col in df.columns: + self.encoders[col] = LabelEncoder() + self.encoders[col].fit(df[col].astype(str).fillna('unknown')) + + self.is_fitted = True + return self + + def transform(self, data): + """Transform data using fitted normalizers with crypto-specific handling""" + if not self.is_fitted: + raise ValueError("Normalizer must be fitted before transform") + + if isinstance(data, dict): + df = pd.DataFrame([data]) + else: + df = data.copy() + + normalized_df = pd.DataFrame(index=df.index) + + # 1. Preserve symbol if requested + if self.preserve_symbol and 'symbol' in df.columns: + normalized_df['symbol'] = df['symbol'] + + # 2. Enhanced timestamp features + for col in self.feature_info['timestamp_features']: + if col in df.columns: + ts = pd.to_datetime(df[col], unit='ms', errors='coerce') + # Crypto markets are 24/7, so different time features + normalized_df[f'{col}_hour'] = ts.dt.hour / 23.0 + normalized_df[f'{col}_day_of_week'] = ts.dt.dayofweek / 6.0 + normalized_df[f'{col}_month'] = (ts.dt.month - 1) / 11.0 + normalized_df[f'{col}_quarter'] = (ts.dt.quarter - 1) / 3.0 + normalized_df[f'{col}_is_weekend'] = (ts.dt.dayofweek >= 5).astype(int) + # For crypto, we might want to track different time patterns + normalized_df[f'{col}_is_asian_hours'] = ((ts.dt.hour >= 0) & (ts.dt.hour <= 8)).astype(int) + normalized_df[f'{col}_is_european_hours'] = ((ts.dt.hour >= 8) & (ts.dt.hour <= 16)).astype(int) + normalized_df[f'{col}_is_american_hours'] = ((ts.dt.hour >= 16) & (ts.dt.hour <= 24)).astype(int) + + # 3. Binary features (keep as is, fill NaN with 0) + for col in self.feature_info['binary_features']: + if col in df.columns: + normalized_df[col] = df[col].fillna(0).astype(int) + + # 4. 
Categorical features with better encoding + for col in self.feature_info['categorical_features']: + if col in df.columns and col in self.encoders: + try: + # Handle unknown categories + values = df[col].astype(str).fillna('unknown') + encoded_values = [] + for val in values: + try: + encoded_values.append(self.encoders[col].transform([val])[0]) + except ValueError: + # Unknown category, assign most frequent class + encoded_values.append(0) + normalized_df[f'{col}_encoded'] = encoded_values + except Exception: + normalized_df[f'{col}_encoded'] = 0 + + # 5. Scale different feature types with appropriate scalers + feature_types = ['crypto_specific_features', 'price_volume_features', 'exchange_features', + 'performance_features', 'rank_diff_features', 'technical_features', + 'social_features', 'transaction_features', 'quality_features', 'numerical_features'] + + for feature_type in feature_types: + if feature_type in self.scalers: + existing_features = self.feature_info.get(f'{feature_type}_existing', []) + available_features = [col for col in existing_features if col in df.columns] + if available_features: + try: + # Handle outliers if enabled + if (self.handle_outliers and + feature_type in ['crypto_specific_features', 'price_volume_features']): + df_clean = df.copy() + for col in available_features: + if col in self.outlier_bounds: + lower_bound, upper_bound = self.outlier_bounds[col] + df_clean[col] = df_clean[col].clip(lower_bound, upper_bound) + else: + df_clean = df.copy() + + # Comprehensive data cleaning for transform + # Replace inf/-inf with NaN + df_clean[available_features] = df_clean[available_features].replace([np.inf, -np.inf], np.nan) + + # Fill NaN with appropriate strategy based on feature type + if feature_type in ['crypto_specific_features', 'price_volume_features']: + # For price/volume data, use forward fill then median from training + for col in available_features: + df_clean[col] = df_clean[col].fillna(method='ffill').fillna(method='bfill').fillna(0) + elif feature_type in ['performance_features', 'rank_diff_features']: + # Performance and rank diffs can be 0 when no change + df_clean[available_features] = df_clean[available_features].fillna(0) + elif feature_type == 'quality_features': + # Quality features should default to reasonable values + df_clean[available_features] = df_clean[available_features].fillna(0.5) + else: + # General strategy: 0 (since we don't have training medians in transform) + df_clean[available_features] = df_clean[available_features].fillna(0) + + # Ensure no infinite values remain + df_clean[available_features] = df_clean[available_features].replace([np.inf, -np.inf], 0) + + # Transform the data + scaled_data = self.scalers[feature_type].transform(df_clean[available_features]) + + # Add scaled features with descriptive names + scaler_name = type(self.scalers[feature_type]).__name__.lower().replace('scaler', '').replace('transformer', '') + for i, col in enumerate(available_features): + normalized_df[f'{col}_{scaler_name}_scaled'] = scaled_data[:, i] + + except Exception as e: + print(f"Warning: Could not transform {feature_type}: {e}") + # If transformation fails, add original features with minimal processing + for col in available_features: + if col in df.columns: + clean_col = df[col].replace([np.inf, -np.inf], np.nan).fillna(0) + normalized_df[f'{col}_raw'] = clean_col + + # 6. Crypto-specific feature engineering + normalized_df = self._engineer_crypto_features(df, normalized_df) + + # 7. 
Final comprehensive cleanup of any remaining issues + # Replace any infinite values that might have been created + normalized_df = normalized_df.replace([np.inf, -np.inf], np.nan) + + # Fill remaining NaN values with appropriate defaults + for col in normalized_df.columns: + if normalized_df[col].isna().any(): + if col == 'symbol': + continue # Don't fill symbol + elif 'sentiment' in col.lower(): + normalized_df[col] = normalized_df[col].fillna(0.5) # Neutral sentiment + elif 'ratio' in col.lower() or 'momentum' in col.lower(): + normalized_df[col] = normalized_df[col].fillna(0) # No change/neutral + elif 'hour' in col or 'day_of_week' in col or 'month' in col or 'quarter' in col: + normalized_df[col] = normalized_df[col].fillna(0) # Time features + elif col.endswith('_encoded'): + normalized_df[col] = normalized_df[col].fillna(0) # Encoded categories + else: + normalized_df[col] = normalized_df[col].fillna(0) # General fallback + + # Final validation - ensure no NaN or infinite values remain + try: + assert not normalized_df.isnull().any().any(), "Still contains NaN values after cleanup" + assert not np.isinf(normalized_df.select_dtypes(include=[np.number])).any().any(), "Still contains infinite values after cleanup" + except AssertionError as e: + print(f"Warning: {e}") + # Emergency cleanup + normalized_df = normalized_df.fillna(0).replace([np.inf, -np.inf], 0) + + return normalized_df + + def fit_transform(self, data): + """Fit and transform in one step""" + return self.fit(data).transform(data) + + def get_feature_importance_info(self): + """Return information about feature categories for model interpretation""" + return { + 'feature_categories': self.feature_info, + 'scalers_used': {k: type(v).__name__ for k, v in self.scalers.items()}, + 'total_features': sum(len(features) for features in self.feature_info.values() if isinstance(features, list)) + } + + def save(self, filepath): + """Save the fitted normalizer""" + with open(filepath, 'wb') as f: + pickle.dump({ + 'scalers': self.scalers, + 'encoders': self.encoders, + 'feature_info': self.feature_info, + 'is_fitted': self.is_fitted, + 'preserve_symbol': self.preserve_symbol, + 'handle_outliers': self.handle_outliers, + 'feature_engineering': self.feature_engineering, + 'outlier_bounds': self.outlier_bounds + }, f) + + def load(self, filepath): + """Load a fitted normalizer""" + with open(filepath, 'rb') as f: + data = pickle.load(f) + self.scalers = data['scalers'] + self.encoders = data['encoders'] + self.feature_info = data['feature_info'] + self.is_fitted = data['is_fitted'] + self.preserve_symbol = data.get('preserve_symbol', True) + self.handle_outliers = data.get('handle_outliers', True) + self.feature_engineering = data.get('feature_engineering', True) + self.outlier_bounds = data.get('outlier_bounds', {}) + return self + +def normalize_crypto_data_file(input_file, output_file, save_normalizer=True, **kwargs): + """ + Enhanced normalization function for crypto data + """ + # Load data + if input_file.endswith('.parquet'): + df = pd.read_parquet(input_file) + print(f"Loaded {len(df)} records with {len(df.columns)} features from parquet") + else: + data = [] + with open(input_file, 'r') as f: + for line in f: + data.append(json.loads(line.strip())) + df = pd.DataFrame(data) + print(f"Loaded {len(df)} records with {len(df.columns)} features from jsonl") + + # Initialize crypto normalizer + normalizer = CryptoDataNormalizer(**kwargs) + + # Show feature categorization + feature_info = normalizer._categorize_features(df) + 
print("\nCrypto Feature Categorization:") + for category, features in feature_info.items(): + if features: + print(f" {category}: {len(features)} features") + + # Fit and transform + normalized_df = normalizer.fit_transform(df) + + print(f"\nNormalized to {len(normalized_df.columns)} features") + print(f"Data shape: {normalized_df.shape}") + + # Show feature importance info + importance_info = normalizer.get_feature_importance_info() + print(f"\nScalers used: {importance_info['scalers_used']}") + + # Ensure output directory exists + import os + output_dir = os.path.dirname(output_file) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir, exist_ok=True) + + # Save normalized data as pickle instead of CSV + pkl_output_file = output_file.replace('.csv', '.pkl') + normalized_df.to_pickle(pkl_output_file) + print(f"Saved normalized data to {pkl_output_file}") + + # Save normalizer + if save_normalizer: + normalizer_file = output_file.replace('.csv', '_crypto_normalizer.pkl') + normalizer.save(normalizer_file) + print(f"Saved normalizer to {normalizer_file}") + + return normalized_df, normalizer + +# CLI function +import argparse + +def main(): + parser = argparse.ArgumentParser( + description="Enhanced normalization for cryptocurrency features with crypto-specific handling" + ) + parser.add_argument('input', nargs='?', default='data/merged/features/crypto_features.parquet', + help='Input file (.parquet or .jsonl)') + parser.add_argument('output', nargs='?', default='data/merged/features/norm/crypto_features_normalized.pkl', + help='Output PKL file for normalized features') + parser.add_argument('--no-save-normalizer', action='store_true', + help='Do not save the normalizer pickle') + parser.add_argument('--no-preserve-symbol', action='store_true', + help='Do not preserve symbol column') + parser.add_argument('--no-handle-outliers', action='store_true', + help='Do not handle outliers') + parser.add_argument('--no-feature-engineering', action='store_true', + help='Do not create engineered features') + parser.add_argument('--train', action='store_true', + help='Normalize the train file and save under train/norm/') + + args = parser.parse_args() + + kwargs = { + 'preserve_symbol': not args.no_preserve_symbol, + 'handle_outliers': not args.no_handle_outliers, + 'feature_engineering': not args.no_feature_engineering + } + + if args.train: + train_input = 'data/merged/train/crypto_features_train.parquet' + train_norm_dir = 'data/merged/train/norm' + os.makedirs(train_norm_dir, exist_ok=True) + train_output = os.path.join(train_norm_dir, 'crypto_features_train_normalized.pkl') + print(f"[INFO] Normalizing train file: {train_input} -> {train_output}") + normalize_crypto_data_file( + train_input, + train_output, + save_normalizer=not args.no_save_normalizer, + **kwargs + ) + else: + print(f"[INFO] Enhanced crypto normalizing: {args.input} -> {args.output}") + print(f"[INFO] Options: {kwargs}") + normalize_crypto_data_file( + args.input, + args.output, + save_normalizer=not args.no_save_normalizer, + **kwargs + ) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/norm/stocks.py b/src/merge/norm/stocks.py new file mode 100644 index 0000000000000000000000000000000000000000..dc76264f8046c3dd567b0b90db0c5b080559a617 --- /dev/null +++ b/src/merge/norm/stocks.py @@ -0,0 +1,600 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, PowerTransformer +import json +import 
pickle +from datetime import datetime +import warnings +warnings.filterwarnings('ignore') + +class ImprovedStockDataNormalizer: + """ + Enhanced normalization pipeline for stock features data with better feature handling + """ + + def __init__(self, preserve_symbol=True, handle_outliers=True, feature_engineering=True): + self.scalers = {} + self.encoders = {} + self.feature_info = {} + self.is_fitted = False + self.preserve_symbol = preserve_symbol + self.handle_outliers = handle_outliers + self.feature_engineering = feature_engineering + self.outlier_bounds = {} + + def _detect_outliers(self, df, column): + """Detect outliers using IQR method""" + Q1 = df[column].quantile(0.25) + Q3 = df[column].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + return lower_bound, upper_bound + + def _handle_outliers(self, df, column, method='clip'): + """Handle outliers in numerical data""" + if column not in self.outlier_bounds: + lower_bound, upper_bound = self._detect_outliers(df, column) + self.outlier_bounds[column] = (lower_bound, upper_bound) + else: + lower_bound, upper_bound = self.outlier_bounds[column] + + if method == 'clip': + return df[column].clip(lower_bound, upper_bound) + elif method == 'remove': + return df[column].where((df[column] >= lower_bound) & (df[column] <= upper_bound)) + return df[column] + + def _categorize_features(self, df): + """Enhanced feature categorization with better detection""" + # Core identification features + id_features = ['symbol', 'backup_id', '__index_level_0__'] + + # Timestamp features + timestamp_features = [col for col in df.columns if 'timestamp' in col.lower()] + + # Binary features (0/1, True/False, or boolean-like) + binary_features = [] + for col in df.columns: + if col not in id_features + timestamp_features: + # Skip columns with array-like values (unhashable) + try: + vals = df[col].dropna().unique() + # If any value is a list/array, skip this column + if any(isinstance(v, (list, np.ndarray)) for v in vals): + continue + unique_vals = set(vals) + except TypeError: + continue + if (df[col].dtype == bool or + (len(unique_vals) <= 2 and unique_vals.issubset({0, 1, True, False, np.nan})) or + col.startswith('is_')): + binary_features.append(col) + + # Categorical features (strings, objects, or low cardinality integers) + categorical_features = [] + for col in df.columns: + if (col not in id_features + binary_features + timestamp_features and + (df[col].dtype == 'object' or + df[col].dtype.name == 'category' or + (df[col].nunique() < 20 and df[col].dtype in ['int64', 'int32']))): + categorical_features.append(col) + + # Price/volume features (need special handling) + price_volume_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['price', 'volume', 'vwap', 'market', 'cap']): + if col not in id_features + timestamp_features + binary_features + categorical_features: + price_volume_features.append(col) + + # Technical indicator features + technical_features = [] + tech_keywords = ['rsi', 'macd', 'ema', 'sma', 'bb_', 'cci', 'mfi', 'atr', 'stoch', 'roc'] + for col in df.columns: + if any(keyword in col.lower() for keyword in tech_keywords): + if col not in id_features + timestamp_features + binary_features + categorical_features + price_volume_features: + technical_features.append(col) + + # News/sentiment features + news_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['news', 'sentiment', 'pos', 'neg', 'neu']): + if col not in 
id_features + timestamp_features + binary_features + categorical_features + price_volume_features + technical_features: + news_features.append(col) + + # Count/ratio features + count_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['count', 'size', 'ratio', 'change']): + if col not in id_features + timestamp_features + binary_features + categorical_features + price_volume_features + technical_features + news_features: + count_features.append(col) + + # Remaining numerical features + numerical_features = [] + all_categorized = (id_features + timestamp_features + binary_features + + categorical_features + price_volume_features + + technical_features + news_features + count_features) + + for col in df.columns: + if (col not in all_categorized and + pd.api.types.is_numeric_dtype(df[col])): + numerical_features.append(col) + + return { + 'id_features': id_features, + 'timestamp_features': timestamp_features, + 'binary_features': binary_features, + 'categorical_features': categorical_features, + 'price_volume_features': price_volume_features, + 'technical_features': technical_features, + 'news_features': news_features, + 'count_features': count_features, + 'numerical_features': numerical_features + } + + def _engineer_features(self, df, normalized_df): + """Create additional engineered features""" + if not self.feature_engineering: + return normalized_df + + # Price momentum features + if 'close' in df.columns and 'prev_close' in df.columns: + close = df['close'].replace([np.inf, -np.inf], np.nan) + prev_close = df['prev_close'].replace([np.inf, -np.inf], np.nan) + valid_mask = (prev_close > 0) & prev_close.notna() & close.notna() + if valid_mask.any(): + momentum = (close - prev_close) / prev_close + normalized_df['price_momentum'] = momentum.fillna(0) + + # Volume-price relationship + if 'volume' in df.columns and 'close' in df.columns: + volume = df['volume'].replace([np.inf, -np.inf], np.nan) + close = df['close'].replace([np.inf, -np.inf], np.nan) + valid_mask = (close > 0) & close.notna() & volume.notna() + if valid_mask.any(): + ratio = volume / close + normalized_df['volume_price_ratio'] = ratio.fillna(0) + + # Volatility features + if 'high' in df.columns and 'low' in df.columns and 'close' in df.columns: + high = df['high'].replace([np.inf, -np.inf], np.nan) + low = df['low'].replace([np.inf, -np.inf], np.nan) + close = df['close'].replace([np.inf, -np.inf], np.nan) + valid_mask = (close > 0) & close.notna() & high.notna() & low.notna() + if valid_mask.any(): + daily_range = (high - low) / close + normalized_df['daily_range'] = daily_range.fillna(0) + + # News sentiment aggregation + sentiment_cols = [col for col in df.columns if 'sentiment' in col.lower() and 'mean' in col.lower()] + if sentiment_cols: + sentiment_data = df[sentiment_cols].replace([np.inf, -np.inf], np.nan) + normalized_df['avg_sentiment'] = sentiment_data.mean(axis=1).fillna(0.5) # Neutral sentiment + + # Technical indicator strength + tech_cols = [col for col in df.columns if any(tech in col.lower() for tech in ['rsi', 'macd', 'cci'])] + if tech_cols: + tech_data = df[tech_cols].replace([np.inf, -np.inf], np.nan) + normalized_df['technical_strength'] = tech_data.mean(axis=1).fillna(0) + + return normalized_df + + def fit(self, df): + """Fit the normalizer on training data with enhanced preprocessing""" + if isinstance(df, dict): + df = pd.DataFrame([df]) + + self.feature_info = self._categorize_features(df) + + # Fit scalers for different feature types + feature_types = 
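# --- Illustrative sketch (not part of the patch): the three engineered ratios built in
# _engineer_features above, computed on a single toy row. The input values are hypothetical.
import pandas as pd

row = pd.DataFrame({"close": [102.0], "prev_close": [100.0],
                    "volume": [2_040_000.0], "high": [104.0], "low": [99.0]})
print(((row["close"] - row["prev_close"]) / row["prev_close"]).iloc[0])  # price_momentum = 0.02
print((row["volume"] / row["close"]).iloc[0])                            # volume_price_ratio = 20000.0
print(((row["high"] - row["low"]) / row["close"]).iloc[0])               # daily_range ~= 0.049
# --- end sketch ---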
['price_volume_features', 'technical_features', 'news_features', + 'count_features', 'numerical_features'] + + for feature_type in feature_types: + features = self.feature_info[feature_type] + if features: + # Filter existing columns + existing_features = [col for col in features if col in df.columns] + if existing_features: + # Choose appropriate scaler based on feature type + if feature_type == 'price_volume_features': + scaler = RobustScaler() # Robust to outliers + elif feature_type == 'technical_features': + scaler = StandardScaler() # Most technical indicators are already normalized + elif feature_type in ['count_features', 'numerical_features']: + scaler = PowerTransformer(method='yeo-johnson') # Handle skewed distributions + else: + scaler = StandardScaler() + + try: + # Handle outliers if enabled + if self.handle_outliers: + df_clean = df.copy() + for col in existing_features: + df_clean[col] = self._handle_outliers(df_clean, col) + else: + df_clean = df.copy() + + # Comprehensive data cleaning for fitting + # Replace inf/-inf with NaN + df_clean[existing_features] = df_clean[existing_features].replace([np.inf, -np.inf], np.nan) + + # Fill NaN with appropriate strategy based on feature type + if feature_type == 'price_volume_features': + # For price/volume data, use forward fill then median + for col in existing_features: + df_clean[col] = df_clean[col].fillna(method='ffill').fillna(df_clean[col].median()).fillna(0) + elif feature_type == 'technical_features': + # Technical indicators: use median for each column + for col in existing_features: + median_val = df_clean[col].median() + df_clean[col] = df_clean[col].fillna(median_val if not pd.isna(median_val) else 0) + elif feature_type == 'news_features': + # News features: neutral values + for col in existing_features: + if 'sentiment' in col.lower(): + df_clean[col] = df_clean[col].fillna(0.5) # Neutral sentiment + elif 'count' in col.lower(): + df_clean[col] = df_clean[col].fillna(0) # No news + else: + df_clean[col] = df_clean[col].fillna(df_clean[col].median()).fillna(0) + else: + # General strategy: median then 0 + for col in existing_features: + df_clean[col] = df_clean[col].fillna(df_clean[col].median()).fillna(0) + + # Ensure no infinite values remain + df_clean[existing_features] = df_clean[existing_features].replace([np.inf, -np.inf], 0) + + # Fit the scaler + scaler.fit(df_clean[existing_features]) + self.scalers[feature_type] = scaler + self.feature_info[f'{feature_type}_existing'] = existing_features + + except Exception as e: + print(f"Warning: Could not fit scaler for {feature_type}: {e}") + # Skip this feature type if fitting fails + continue + + # Fit encoders for categorical features + for col in self.feature_info['categorical_features']: + if col in df.columns: + self.encoders[col] = LabelEncoder() + self.encoders[col].fit(df[col].astype(str).fillna('unknown')) + + self.is_fitted = True + return self + + def transform(self, data): + """Transform data using fitted normalizers with enhanced feature handling""" + if not self.is_fitted: + raise ValueError("Normalizer must be fitted before transform") + + if isinstance(data, dict): + df = pd.DataFrame([data]) + else: + df = data.copy() + + normalized_df = pd.DataFrame(index=df.index) + + # 1. Preserve symbol if requested + if self.preserve_symbol and 'symbol' in df.columns: + normalized_df['symbol'] = df['symbol'] + + # 2. 
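# --- Illustrative sketch (not part of the patch): the per-family scaler choice made in fit()
# (RobustScaler for price/volume, StandardScaler for technical indicators, Yeo-Johnson
# PowerTransformer for skewed counts). Column names are hypothetical; assumes scikit-learn.
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler, PowerTransformer

toy = pd.DataFrame({"close": [10.0, 11.0, 10.5, 250.0],    # heavy-tailed price series
                    "rsi": [35.0, 50.0, 65.0, 70.0],       # roughly bounded indicator
                    "news_count": [0.0, 1.0, 1.0, 30.0]})  # skewed count
scaler_by_family = {"close": RobustScaler(),
                    "rsi": StandardScaler(),
                    "news_count": PowerTransformer(method="yeo-johnson")}
for col, scaler in scaler_by_family.items():
    toy[f"{col}_scaled"] = scaler.fit_transform(toy[[col]]).ravel()
print(toy.round(2))
# --- end sketch ---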
Enhanced timestamp features + for col in self.feature_info['timestamp_features']: + if col in df.columns: + ts = pd.to_datetime(df[col], unit='ms', errors='coerce') + # More comprehensive time features + normalized_df[f'{col}_hour'] = ts.dt.hour / 23.0 + normalized_df[f'{col}_day_of_week'] = ts.dt.dayofweek / 6.0 + normalized_df[f'{col}_month'] = (ts.dt.month - 1) / 11.0 + normalized_df[f'{col}_quarter'] = (ts.dt.quarter - 1) / 3.0 + normalized_df[f'{col}_is_weekend'] = (ts.dt.dayofweek >= 5).astype(int) + normalized_df[f'{col}_is_market_hours'] = ((ts.dt.hour >= 9) & (ts.dt.hour <= 16) & (ts.dt.dayofweek < 5)).astype(int) + + # 3. Binary features (keep as is, fill NaN with 0) + for col in self.feature_info['binary_features']: + if col in df.columns: + normalized_df[col] = df[col].fillna(0).astype(int) + + # 4. Categorical features with better encoding + for col in self.feature_info['categorical_features']: + if col in df.columns and col in self.encoders: + try: + # Handle unknown categories + values = df[col].astype(str).fillna('unknown') + encoded_values = [] + for val in values: + try: + encoded_values.append(self.encoders[col].transform([val])[0]) + except ValueError: + # Unknown category, assign most frequent class + encoded_values.append(0) + normalized_df[f'{col}_encoded'] = encoded_values + except Exception: + normalized_df[f'{col}_encoded'] = 0 + + # 5. Scale different feature types with appropriate scalers + feature_types = ['price_volume_features', 'technical_features', 'news_features', + 'count_features', 'numerical_features'] + + for feature_type in feature_types: + if feature_type in self.scalers: + existing_features = self.feature_info.get(f'{feature_type}_existing', []) + available_features = [col for col in existing_features if col in df.columns] + + if available_features: + try: + # Handle outliers if enabled + if self.handle_outliers: + df_clean = df.copy() + for col in available_features: + if col in self.outlier_bounds: + lower_bound, upper_bound = self.outlier_bounds[col] + df_clean[col] = df_clean[col].clip(lower_bound, upper_bound) + else: + df_clean = df.copy() + + # Comprehensive data cleaning for transform + # Replace inf/-inf with NaN + df_clean[available_features] = df_clean[available_features].replace([np.inf, -np.inf], np.nan) + + # Fill NaN with appropriate strategy based on feature type + if feature_type == 'price_volume_features': + # For price/volume data, use forward fill then back fill + for col in available_features: + df_clean[col] = df_clean[col].fillna(method='ffill').fillna(method='bfill').fillna(0) + elif feature_type == 'technical_features': + # Technical indicators: use neutral values + for col in available_features: + if 'rsi' in col.lower(): + df_clean[col] = df_clean[col].fillna(50) # Neutral RSI + elif any(indicator in col.lower() for indicator in ['macd', 'cci']): + df_clean[col] = df_clean[col].fillna(0) # Neutral MACD/CCI + else: + df_clean[col] = df_clean[col].fillna(0) + elif feature_type == 'news_features': + # News features: neutral values + for col in available_features: + if 'sentiment' in col.lower(): + df_clean[col] = df_clean[col].fillna(0.5) # Neutral sentiment + elif 'count' in col.lower(): + df_clean[col] = df_clean[col].fillna(0) # No news + else: + df_clean[col] = df_clean[col].fillna(0) + else: + # General strategy: 0 (since we don't have training medians in transform) + df_clean[available_features] = df_clean[available_features].fillna(0) + + # Ensure no infinite values remain + df_clean[available_features] = 
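# --- Illustrative sketch (not part of the patch): the scaled calendar features that
# transform() derives from an epoch-millisecond timestamp column.
import pandas as pd

ts = pd.to_datetime(pd.Series([1640995200000]), unit="ms", errors="coerce")  # 2022-01-01 00:00 UTC
print((ts.dt.hour / 23.0).iloc[0])                  # 0.0
print((ts.dt.dayofweek / 6.0).iloc[0])              # Saturday -> 5/6 ~= 0.833
print(int((ts.dt.dayofweek >= 5).iloc[0]))          # is_weekend -> 1
print(int(((ts.dt.hour >= 9) & (ts.dt.hour <= 16) & (ts.dt.dayofweek < 5)).iloc[0]))  # is_market_hours -> 0
# --- end sketch ---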
df_clean[available_features].replace([np.inf, -np.inf], 0) + + # Transform the data + scaled_data = self.scalers[feature_type].transform(df_clean[available_features]) + + # Add scaled features with descriptive names + scaler_name = type(self.scalers[feature_type]).__name__.lower().replace('scaler', '').replace('transformer', '') + for i, col in enumerate(available_features): + normalized_df[f'{col}_{scaler_name}_scaled'] = scaled_data[:, i] + + except Exception as e: + print(f"Warning: Could not transform {feature_type}: {e}") + # If transformation fails, add original features with minimal processing + for col in available_features: + if col in df.columns: + clean_col = df[col].replace([np.inf, -np.inf], np.nan).fillna(0) + normalized_df[f'{col}_raw'] = clean_col + + # 6. Feature engineering + normalized_df = self._engineer_features(df, normalized_df) + + # 7. Final comprehensive cleanup of any remaining issues + # Replace any infinite values that might have been created + normalized_df = normalized_df.replace([np.inf, -np.inf], np.nan) + + # Fill remaining NaN values with appropriate defaults + for col in normalized_df.columns: + if normalized_df[col].isna().any(): + if col == 'symbol': + continue # Don't fill symbol + elif 'sentiment' in col.lower(): + normalized_df[col] = normalized_df[col].fillna(0.5) # Neutral sentiment + elif 'ratio' in col.lower() or 'momentum' in col.lower(): + normalized_df[col] = normalized_df[col].fillna(0) # No change/neutral + elif 'hour' in col or 'day_of_week' in col or 'month' in col or 'quarter' in col: + normalized_df[col] = normalized_df[col].fillna(0) # Time features + elif col.endswith('_encoded'): + normalized_df[col] = normalized_df[col].fillna(0) # Encoded categories + else: + normalized_df[col] = normalized_df[col].fillna(0) # General fallback + + # Final validation - ensure no NaN or infinite values remain + try: + assert not normalized_df.isnull().any().any(), "Still contains NaN values after cleanup" + assert not np.isinf(normalized_df.select_dtypes(include=[np.number])).any().any(), "Still contains infinite values after cleanup" + except AssertionError as e: + print(f"Warning: {e}") + # Emergency cleanup + normalized_df = normalized_df.fillna(0).replace([np.inf, -np.inf], 0) + + return normalized_df + + def fit_transform(self, data): + """Fit and transform in one step""" + return self.fit(data).transform(data) + + def get_feature_importance_info(self): + """Return information about feature categories for model interpretation""" + return { + 'feature_categories': self.feature_info, + 'scalers_used': {k: type(v).__name__ for k, v in self.scalers.items()}, + 'total_features': sum(len(features) for features in self.feature_info.values() if isinstance(features, list)) + } + + def save(self, filepath): + """Save the fitted normalizer""" + with open(filepath, 'wb') as f: + pickle.dump({ + 'scalers': self.scalers, + 'encoders': self.encoders, + 'feature_info': self.feature_info, + 'is_fitted': self.is_fitted, + 'preserve_symbol': self.preserve_symbol, + 'handle_outliers': self.handle_outliers, + 'feature_engineering': self.feature_engineering, + 'outlier_bounds': self.outlier_bounds + }, f) + + def load(self, filepath): + """Load a fitted normalizer""" + with open(filepath, 'rb') as f: + data = pickle.load(f) + self.scalers = data['scalers'] + self.encoders = data['encoders'] + self.feature_info = data['feature_info'] + self.is_fitted = data['is_fitted'] + self.preserve_symbol = data.get('preserve_symbol', True) + self.handle_outliers = 
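# --- Illustrative sketch (not part of the patch): how the class above is intended to be
# driven end to end. The toy frame and its values are hypothetical.
import numpy as np
import pandas as pd

train_df = pd.DataFrame({"symbol": ["AAPL", "MSFT", "AAPL", "MSFT"],
                         "close": [150.0, 300.0, 152.0, np.nan],
                         "volume": [1e6, 5e5, 1.1e6, 4e5],
                         "rsi": [65.0, 45.0, np.nan, 50.0],
                         "interval_timestamp": [1640995200000] * 4})
normalizer = ImprovedStockDataNormalizer(preserve_symbol=True,
                                         handle_outliers=True,
                                         feature_engineering=True)
train_norm = normalizer.fit_transform(train_df)                   # fit scalers/encoders, then transform
print(normalizer.get_feature_importance_info()["scalers_used"])   # e.g. which scaler served each family
# --- end sketch ---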
data.get('handle_outliers', True) + self.feature_engineering = data.get('feature_engineering', True) + self.outlier_bounds = data.get('outlier_bounds', {}) + return self + +def cap_outliers(df, features=None, method='iqr', factor=1.5): + """ + Cap outliers in the DataFrame for the given features using the IQR method. + If features is None, all numeric columns are used. + """ + capped_df = df.copy() + if features is None: + features = capped_df.select_dtypes(include=[np.number]).columns + for col in features: + if col not in capped_df.columns: + continue + Q1 = capped_df[col].quantile(0.25) + Q3 = capped_df[col].quantile(0.75) + IQR = Q3 - Q1 + lower = Q1 - factor * IQR + upper = Q3 + factor * IQR + capped_df[col] = np.clip(capped_df[col], lower, upper) + print(f"Capped outliers in {col}: [{lower:.3g}, {upper:.3g}]") + return capped_df + +# Example usage after normalization: +# normalized_df = cap_outliers(normalized_df, features=['price_momentum', 'volume_price_ratio', 'daily_range', 'technical_strength']) +# (You can call this function in your pipeline after normalization, before saving or modeling.) + +def normalize_stock_data_file_improved(input_file, output_file, save_normalizer=True, **kwargs): + """ + Enhanced normalization function with better defaults + """ + # Load data + if input_file.endswith('.parquet'): + df = pd.read_parquet(input_file) + print(f"Loaded {len(df)} records with {len(df.columns)} features from parquet") + else: + data = [] + with open(input_file, 'r') as f: + for line in f: + data.append(json.loads(line.strip())) + df = pd.DataFrame(data) + print(f"Loaded {len(df)} records with {len(df.columns)} features from jsonl") + + # Initialize improved normalizer + normalizer = ImprovedStockDataNormalizer(**kwargs) + + # Show feature categorization + feature_info = normalizer._categorize_features(df) + print("\nFeature Categorization:") + for category, features in feature_info.items(): + if features: + print(f" {category}: {len(features)} features") + + # Fit and transform + normalized_df = normalizer.fit_transform(df) + + print(f"\nNormalized to {len(normalized_df.columns)} features") + print(f"Data shape: {normalized_df.shape}") + + # Cap outliers in engineered features + engineered_features = ['price_momentum', 'volume_price_ratio', 'daily_range', 'technical_strength'] + normalized_df = cap_outliers(normalized_df, features=[f for f in engineered_features if f in normalized_df.columns]) + + # Show feature importance info + importance_info = normalizer.get_feature_importance_info() + print(f"\nScalers used: {importance_info['scalers_used']}") + + # Ensure output directory exists + import os + output_dir = os.path.dirname(output_file) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir, exist_ok=True) + + # Save normalized data as pickle + pkl_output_file = output_file.replace('.csv', '.pkl') + normalized_df.to_pickle(pkl_output_file) + print(f"Saved normalized data to {pkl_output_file}") + + # Save normalizer + if save_normalizer: + normalizer_file = pkl_output_file.replace('.pkl', '_improved_normalizer.pkl') + normalizer.save(normalizer_file) + print(f"Saved normalizer to {normalizer_file}") + + return normalized_df, normalizer + +# CLI function +import argparse + +def main(): + parser = argparse.ArgumentParser( + description="Enhanced normalization for stock/crypto features with better handling of different feature types" + ) + parser.add_argument('input', nargs='?', default='data/merged/features/stocks_features.parquet', + help='Input file 
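# --- Illustrative sketch (not part of the patch): cap_outliers() applied to one engineered
# column, mirroring the post-normalization capping step in normalize_stock_data_file_improved().
import pandas as pd

toy = pd.DataFrame({"price_momentum": [0.01, -0.02, 0.015, 5.0]})   # 5.0 is a spurious spike
capped = cap_outliers(toy, features=["price_momentum"])             # prints the clipping bounds it used
print(capped["price_momentum"].tolist())                            # the spike is pulled back inside the IQR band
# --- end sketch ---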
(.parquet or .jsonl)') + parser.add_argument('output', nargs='?', default='data/merged/features/norm/stocks_features_improved_normalized.pkl', + help='Output pickle file for normalized features') + parser.add_argument('--no-save-normalizer', action='store_true', + help='Do not save the normalizer pickle') + parser.add_argument('--no-preserve-symbol', action='store_true', + help='Do not preserve symbol column') + parser.add_argument('--no-handle-outliers', action='store_true', + help='Do not handle outliers') + parser.add_argument('--no-feature-engineering', action='store_true', + help='Do not create engineered features') + parser.add_argument('--train', action='store_true', + help='Normalize the train file and save under train/norm/') + + args = parser.parse_args() + + kwargs = { + 'preserve_symbol': not args.no_preserve_symbol, + 'handle_outliers': not args.no_handle_outliers, + 'feature_engineering': not args.no_feature_engineering + } + + if args.train: + train_input = 'data/merged/train/stocks_features_train.parquet' + train_norm_dir = 'data/merged/train/norm' + import os + os.makedirs(train_norm_dir, exist_ok=True) + train_output = os.path.join(train_norm_dir, 'stocks_features_train_normalized.pkl') + print(f"[INFO] Normalizing train file: {train_input} -> {train_output}") + normalize_stock_data_file_improved( + train_input, + train_output, + save_normalizer=not args.no_save_normalizer, + **kwargs + ) + else: + print(f"[INFO] Enhanced normalizing: {args.input} -> {args.output}") + print(f"[INFO] Options: {kwargs}") + normalize_stock_data_file_improved( + args.input, + args.output, + save_normalizer=not args.no_save_normalizer, + **kwargs + ) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/norm/test_null_handling.py b/src/merge/norm/test_null_handling.py new file mode 100644 index 0000000000000000000000000000000000000000..e98a837324b7dc4ec4f242f062ef576f647233f5 --- /dev/null +++ b/src/merge/norm/test_null_handling.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Test script to verify null handling improvements in normalization +""" + +import pandas as pd +import numpy as np +import sys +from pathlib import Path + +# Add the norm directory to path +sys.path.append(str(Path(__file__).parent)) + +# Import the normalizers +from crypto import CryptoDataNormalizer +from stocks import ImprovedStockDataNormalizer + +def create_test_crypto_data(): + """Create test crypto data with various null scenarios""" + data = { + 'symbol': ['bitcoin', 'ethereum', 'cardano'] * 10, + 'price': [50000, np.nan, 2000] * 10, + 'volume': [1000000, 2000000, np.inf] * 10, + 'dominance': [0.4, 0.15, np.nan] * 10, + 'rank': [1, 2, 8] * 10, + 'performance.day': [2.5, -1.2, np.nan] * 10, + 'performance.week': [-5.0, np.inf, 1.5] * 10, + 'exchangePrices.binance': [50001, 1601, np.nan] * 10, + 'exchangePrices.coinbase': [49999, np.nan, 2001] * 10, + 'rsi': [65, np.nan, 45] * 10, + 'macd': [100, -50, np.nan] * 10, + 'interval_timestamp': [1640995200000] * 30, + 'stable': [False, False, False] * 10, + 'transaction_count': [1000, np.nan, 500] * 10 + } + return pd.DataFrame(data) + +def create_test_stock_data(): + """Create test stock data with various null scenarios""" + data = { + 'symbol': ['AAPL', 'GOOGL', 'MSFT'] * 10, + 'close': [150, np.nan, 300] * 10, + 'prev_close': [148, 2850, np.inf] * 10, + 'volume': [1000000, 500000, np.nan] * 10, + 'high': [152, 2870, 305] * 10, + 'low': [147, np.nan, 295] * 10, + 'rsi': [65, 45, np.nan] * 10, + 'macd': [1.5, -0.8, np.nan] * 10, + 
'news_sentiment_mean_x': [0.7, np.nan, 0.3] * 10, + 'news_articles_count_x': [5, 0, np.nan] * 10, + 'marketCapitalization': [2500000000000, np.inf, 2000000000000] * 10, + 'interval_timestamp': [1640995200000] * 30 + } + return pd.DataFrame(data) + +def test_crypto_normalizer(): + """Test crypto normalizer with null handling""" + print("Testing Crypto Normalizer...") + + # Create test data + df = create_test_crypto_data() + print(f"Original data shape: {df.shape}") + print(f"Original nulls: {df.isnull().sum().sum()}") + print(f"Original infinite values: {np.isinf(df.select_dtypes(include=[np.number])).sum().sum()}") + + # Initialize and test normalizer + try: + normalizer = CryptoDataNormalizer() + normalized = normalizer.fit_transform(df) + + print(f"Normalized data shape: {normalized.shape}") + print(f"Remaining nulls: {normalized.isnull().sum().sum()}") + print(f"Remaining infinite values: {np.isinf(normalized.select_dtypes(include=[np.number])).sum().sum()}") + + if normalized.isnull().sum().sum() == 0 and np.isinf(normalized.select_dtypes(include=[np.number])).sum().sum() == 0: + print("✅ Crypto normalizer passed null handling test!") + return True + else: + print("❌ Crypto normalizer failed null handling test!") + return False + + except Exception as e: + print(f"❌ Crypto normalizer failed with error: {e}") + return False + +def test_stock_normalizer(): + """Test stock normalizer with null handling""" + print("\nTesting Stock Normalizer...") + + # Create test data + df = create_test_stock_data() + print(f"Original data shape: {df.shape}") + print(f"Original nulls: {df.isnull().sum().sum()}") + print(f"Original infinite values: {np.isinf(df.select_dtypes(include=[np.number])).sum().sum()}") + + # Initialize and test normalizer + try: + normalizer = ImprovedStockDataNormalizer() + normalized = normalizer.fit_transform(df) + + print(f"Normalized data shape: {normalized.shape}") + print(f"Remaining nulls: {normalized.isnull().sum().sum()}") + print(f"Remaining infinite values: {np.isinf(normalized.select_dtypes(include=[np.number])).sum().sum()}") + + if normalized.isnull().sum().sum() == 0 and np.isinf(normalized.select_dtypes(include=[np.number])).sum().sum() == 0: + print("✅ Stock normalizer passed null handling test!") + return True + else: + print("❌ Stock normalizer failed null handling test!") + return False + + except Exception as e: + print(f"❌ Stock normalizer failed with error: {e}") + return False + +def main(): + """Run all tests""" + print("="*60) + print("TESTING NULL HANDLING IMPROVEMENTS") + print("="*60) + + crypto_passed = test_crypto_normalizer() + stock_passed = test_stock_normalizer() + + print("\n" + "="*60) + print("TEST RESULTS SUMMARY") + print("="*60) + + if crypto_passed and stock_passed: + print("🎉 All tests passed! Null handling improvements are working correctly.") + return 0 + else: + print("❌ Some tests failed. 
Review the output above for details.") + return 1 + +if __name__ == "__main__": + exit_code = main() + sys.exit(exit_code) diff --git a/src/merge/normalize.py b/src/merge/normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..54f954e3226e5326cb8cf202eed5eb99ddd90ee6 --- /dev/null +++ b/src/merge/normalize.py @@ -0,0 +1,23 @@ +# This script runs both the stock and crypto normalization pipelines from the norm/ directory +import sys +import os + +# Add norm directory to sys.path for imports +norm_dir = os.path.join(os.path.dirname(__file__), 'norm') +sys.path.insert(0, norm_dir) + +# Import and run stock normalization +try: + from norm import stocks + print("\n--- Running Stock Normalization ---") + stocks.main() +except Exception as e: + print(f"[ERROR] Stock normalization failed: {e}") + +# Import and run crypto normalization +try: + from norm import crypto + print("\n--- Running Crypto Normalization ---") + crypto.main() +except Exception as e: + print(f"[ERROR] Crypto normalization failed: {e}") diff --git a/src/merge/remove_null_symbols.py b/src/merge/remove_null_symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..90085c50cfabfe6216d4619dfcee95cac06cb9cf --- /dev/null +++ b/src/merge/remove_null_symbols.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +""" +Remove rows with null symbols from crypto and stock features. +This script ensures that all records have valid symbols for downstream processing. +""" + +import pandas as pd +from pathlib import Path + +def remove_null_symbols(): + """Remove rows with null symbols from crypto and stock features.""" + + # Process crypto features + crypto_path = Path("data/merged/features/crypto_features.parquet") + if crypto_path.exists(): + df_crypto = pd.read_parquet(crypto_path) + + initial_count = len(df_crypto) + null_count = df_crypto['symbol'].isnull().sum() + + if null_count > 0: + # Remove null symbol rows + df_crypto_clean = df_crypto[df_crypto['symbol'].notnull()].copy() + + final_count = len(df_crypto_clean) + removed_count = initial_count - final_count + + print(f"[CRYPTO] Removed {removed_count} rows with null symbols ({final_count} remaining)") + + # Save cleaned data + df_crypto_clean.to_parquet(crypto_path, index=False) + + # Verify no null symbols remain + remaining_nulls = df_crypto_clean['symbol'].isnull().sum() + if remaining_nulls > 0: + print(f"⚠️ Warning: {remaining_nulls} null symbols still remain") + + # Process stock features + stocks_path = Path("data/merged/features/stocks_features.parquet") + if stocks_path.exists(): + df_stocks = pd.read_parquet(stocks_path) + + initial_count = len(df_stocks) + null_count = df_stocks['symbol'].isnull().sum() + + if null_count > 0: + # Remove null symbol rows + df_stocks_clean = df_stocks[df_stocks['symbol'].notnull()].copy() + + final_count = len(df_stocks_clean) + removed_count = initial_count - final_count + + print(f"[STOCKS] Removed {removed_count} rows with null symbols ({final_count} remaining)") + + # Save cleaned data + df_stocks_clean.to_parquet(stocks_path, index=False) + + # Verify no null symbols remain + remaining_nulls = df_stocks_clean['symbol'].isnull().sum() + if remaining_nulls > 0: + print(f"⚠️ Warning: {remaining_nulls} null symbols still remain") + +if __name__ == "__main__": + remove_null_symbols() diff --git a/src/merge/run_final_null_handling.py b/src/merge/run_final_null_handling.py new file mode 100644 index 0000000000000000000000000000000000000000..2876dea12d002e0f34c2708f5570ec2528e3ab4c --- /dev/null +++ 
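# --- Illustrative sketch (not part of the patch): the row filter remove_null_symbols() applies,
# shown on a toy frame instead of the real parquet files.
import pandas as pd

toy = pd.DataFrame({"symbol": ["AAPL", None, "bitcoin"], "close": [150.0, 10.0, 50000.0]})
clean = toy[toy["symbol"].notnull()].copy()
print(len(toy) - len(clean), "rows with null symbols removed")   # 1
# --- end sketch ---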
b/src/merge/run_final_null_handling.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +Final Null Handler Integration Script +Integrates the final null value handler into the existing merge pipeline. +""" + +import sys +import subprocess +from pathlib import Path +import numpy as np +import pandas as pd +from final_null_handler import FinalNullValueHandler, process_crypto_features_file, process_stock_features_file + +def run_final_null_handling(): + """Run the final null value handling on all feature files""" + + print("="*60) + print("STARTING FINAL NULL VALUE HANDLING") + print("="*60) + + base_path = Path("data/merged/features") + + files_to_process = [ + ("crypto_features.parquet", "crypto"), + ("stocks_features.parquet", "stock"), + ("merged_features.parquet", "merged") + ] + + results = {} + + for filename, file_type in files_to_process: + file_path = base_path / filename + + if not file_path.exists(): + print(f"[WARNING] {filename} not found, skipping...") + continue + + print(f"\n[INFO] Processing {filename}...") + + try: + if file_type == "crypto": + df_processed, report = process_crypto_features_file(file_path) + elif file_type == "stock": + df_processed, report = process_stock_features_file(file_path) + elif file_type == "merged": + # For merged file, determine type by content + df_processed, report = process_merged_features_file(file_path) + + results[file_type] = { + 'success': True, + 'file_path': file_path, + 'report': report, + 'rows': len(df_processed), + 'nulls_filled': report['total_nulls_filled'] + } + + print(f"[SUCCESS] {filename} processed successfully!") + print(f" - Rows: {len(df_processed):,}") + print(f" - Nulls filled: {report['total_nulls_filled']:,}") + + except Exception as e: + print(f"[ERROR] Error processing {filename}: {str(e)}") + results[file_type] = { + 'success': False, + 'error': str(e), + 'file_path': file_path + } + + return results + +def process_merged_features_file(file_path): + """Process merged features file (contains both crypto and stock data)""" + print(f"Loading merged features from {file_path}...") + df = pd.read_parquet(file_path) + + print(f"Loaded {len(df)} rows with {len(df.columns)} columns") + print(f"Null values before processing: {df.isnull().sum().sum()}") + + handler = FinalNullValueHandler() + + # Separate crypto and stock data if possible + if 'symbol' in df.columns: + # Detect crypto vs stock based on available columns + crypto_indicators = ['rank', 'dominance', 'performance.day', 'exchangePrices.binance'] + stock_indicators = ['news_activity_score_x', 'strongBuy', 'marketCapitalization'] + + has_crypto_cols = any(col in df.columns for col in crypto_indicators) + has_stock_cols = any(col in df.columns for col in stock_indicators) + + if has_crypto_cols and has_stock_cols: + # Mixed data - process intelligently + print("Detected mixed crypto/stock data - processing intelligently...") + + # Try to separate by symbol patterns or available data + crypto_mask = df['rank'].notna() | df['dominance'].notna() + if crypto_mask.any(): + print(f"Processing {crypto_mask.sum()} rows as crypto data...") + df_crypto = df[crypto_mask].copy() + df_crypto_processed = handler.process_crypto_features(df_crypto) + df.loc[crypto_mask] = df_crypto_processed + + stock_mask = ~crypto_mask + if stock_mask.any(): + print(f"Processing {stock_mask.sum()} rows as stock data...") + df_stock = df[stock_mask].copy() + df_stock_processed = handler.process_stock_features(df_stock) + df.loc[stock_mask] = df_stock_processed + + df_processed = df + + elif 
has_crypto_cols: + print("Detected crypto-only data...") + df_processed = handler.process_crypto_features(df) + elif has_stock_cols: + print("Detected stock-only data...") + df_processed = handler.process_stock_features(df) + else: + print("Could not determine data type, applying generic processing...") + df_processed = handler.process_stock_features(df) # Default to stock processing + else: + print("No symbol column found, applying generic processing...") + df_processed = handler.process_stock_features(df) + + print(f"Null values after processing: {df_processed.isnull().sum().sum()}") + + # Generate report + report = handler.generate_report(df, df_processed, 'merged') + + # Save processed data + df_processed.to_parquet(file_path, index=False) + print(f"Saved processed merged features to {file_path}") + + return df_processed, report + +def validate_data_quality(results): + """Validate that the data quality is maintained after null handling""" + print("\n" + "="*60) + print("DATA QUALITY VALIDATION") + print("="*60) + + validation_results = {} + + for file_type, result in results.items(): + if not result.get('success', False): + continue + + file_path = result['file_path'] + + try: + df = pd.read_parquet(file_path) + + # Basic validation checks + validation = { + 'total_rows': len(df), + 'total_columns': len(df.columns), + 'remaining_nulls': df.isnull().sum().sum(), + 'duplicate_rows': df.duplicated().sum(), + 'infinite_values': np.isinf(df.select_dtypes(include=[np.number])).sum().sum(), + 'data_types_consistent': True, # Could add more sophisticated checks + } + + # Check for unrealistic values + numeric_cols = df.select_dtypes(include=[np.number]).columns + extreme_values = {} + + for col in numeric_cols: + if col in df.columns: + col_data = df[col].dropna() + if len(col_data) > 0: + q1, q99 = col_data.quantile([0.01, 0.99]) + extreme_count = ((col_data < q1 - 10 * (q99 - q1)) | + (col_data > q99 + 10 * (q99 - q1))).sum() + if extreme_count > 0: + extreme_values[col] = extreme_count + + validation['extreme_values'] = extreme_values + validation['quality_score'] = calculate_quality_score(validation) + + validation_results[file_type] = validation + + print(f"\n{file_type.upper()} VALIDATION:") + print(f" ✓ Rows: {validation['total_rows']:,}") + print(f" ✓ Columns: {validation['total_columns']}") + print(f" ✓ Remaining nulls: {validation['remaining_nulls']}") + print(f" ✓ Duplicate rows: {validation['duplicate_rows']}") + print(f" ✓ Infinite values: {validation['infinite_values']}") + print(f" ✓ Quality score: {validation['quality_score']:.2%}") + + if extreme_values: + print(f" [WARNING] Extreme values detected in {len(extreme_values)} columns") + + except Exception as e: + print(f"[ERROR] Validation failed for {file_type}: {str(e)}") + validation_results[file_type] = {'error': str(e)} + + return validation_results + +def calculate_quality_score(validation): + """Calculate a simple quality score""" + score = 1.0 + + # Penalize remaining nulls + if validation['total_rows'] > 0: + null_ratio = validation['remaining_nulls'] / (validation['total_rows'] * validation['total_columns']) + score -= null_ratio * 0.5 + + # Penalize duplicates + if validation['total_rows'] > 0: + dup_ratio = validation['duplicate_rows'] / validation['total_rows'] + score -= dup_ratio * 0.3 + + # Penalize infinite values + if validation['infinite_values'] > 0: + score -= 0.1 + + # Penalize extreme values + extreme_columns = len(validation.get('extreme_values', {})) + if extreme_columns > 0: + score -= (extreme_columns 
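# --- Illustrative sketch (not part of the patch): the "extreme value" screen used in
# validate_data_quality(): anything beyond 10x the 1st-to-99th percentile span is flagged.
import numpy as np
import pandas as pd

col = pd.Series(np.concatenate([np.linspace(0.0, 1.0, 99), [1e6]]))   # one absurd value
q1, q99 = col.quantile(0.01), col.quantile(0.99)
span = q99 - q1
extreme = ((col < q1 - 10 * span) | (col > q99 + 10 * span)).sum()
print(extreme)   # 1 -> the 1e6 entry is counted as extreme
# --- end sketch ---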
/ validation['total_columns']) * 0.2 + + return max(0.0, score) + +def print_final_summary(results, validation_results): + """Print final summary of the null handling process""" + print("\n" + "="*60) + print("FINAL NULL HANDLING SUMMARY") + print("="*60) + + total_nulls_filled = sum(r.get('nulls_filled', 0) for r in results.values() if r.get('success')) + successful_files = sum(1 for r in results.values() if r.get('success')) + total_files = len(results) + + print(f"\n[INFO] PROCESSING RESULTS:") + print(f" Files processed: {successful_files}/{total_files}") + print(f" Total nulls filled: {total_nulls_filled:,}") + + print(f"\n[METRICS] QUALITY METRICS:") + for file_type, validation in validation_results.items(): + if 'error' not in validation: + print(f" {file_type}: {validation['quality_score']:.1%} quality score") + + if successful_files == total_files: + print(f"\n[SUCCESS] ALL FILES PROCESSED SUCCESSFULLY!") + else: + failed_files = total_files - successful_files + print(f"\n[WARNING] {failed_files} files failed to process") + + print("\n[TIPS] RECOMMENDATIONS:") + print(" - Review any remaining null columns in the reports") + print(" - Monitor data quality scores in production") + print(" - Consider additional validation rules if needed") + + print("\n" + "="*60) + +def main(): + """Main function""" + try: + # Import numpy for validation + import numpy as np + globals()['np'] = np + + # Run the null handling process + results = run_final_null_handling() + + # Validate data quality + validation_results = validate_data_quality(results) + + # Print final summary + print_final_summary(results, validation_results) + + # Return success if all files processed successfully + success_count = sum(1 for r in results.values() if r.get('success')) + return 0 if success_count == len(results) else 1 + + except Exception as e: + print(f"[ERROR] Fatal error in null handling process: {str(e)}") + return 1 + +if __name__ == "__main__": + exit_code = main() + sys.exit(exit_code) diff --git a/src/merge/separator.py b/src/merge/separator.py new file mode 100644 index 0000000000000000000000000000000000000000..0bb125c533beb27785944e609bc009d853898ba8 --- /dev/null +++ b/src/merge/separator.py @@ -0,0 +1,57 @@ +import pandas as pd +from pathlib import Path + +def separate_features(merged_path, crypto_path, stocks_path): + """ + Split merged_features.parquet into crypto_features and stocks_features using is_crypto attribute, + then drop any columns that are entirely null. 
+ """ + merged_path = Path(merged_path) + if not merged_path.exists(): + print(f"File not found: {merged_path}") + return + + df = pd.read_parquet(merged_path) + + # Ensure COIN and XRP are marked as crypto + if 'symbol' in df.columns: + xrp_mask = df['symbol'].str.upper() == 'RIPPLE' + df.loc[xrp_mask, 'is_crypto'] = 1 + + # Separate by is_crypto + crypto_df = df[df['is_crypto'] == 1].copy() + stocks_df = df[df['is_crypto'] == 0].copy() + + # Drop columns that are entirely null + def drop_all_null(df, name): + null_cols = df.columns[df.isna().all()] + if len(null_cols): + print(f"Dropping {len(null_cols)} all-null columns from {name}:") + # for c in null_cols: + # print(f" • {c}") + df.drop(columns=null_cols, inplace=True) + else: + print(f"No all-null columns in {name}.") + return df + + crypto_df = drop_all_null(crypto_df, "crypto_features") + stocks_df = drop_all_null(stocks_df, "stocks_features") + + # Save to parquet + crypto_df.to_parquet(crypto_path) + stocks_df.to_parquet(stocks_path) + print(f"Saved {len(crypto_df)} crypto features to {crypto_path}") + print(f"Saved {len(stocks_df)} stocks features to {stocks_path}") + + +if __name__ == "__main__": + try: + from src import config as app_config + base = Path(app_config.DATA_DIR) + except Exception: + from os import getenv + base = Path(getenv("DATA_DIR", "/data")) + merged_path = base / "merged" / "features" / "merged_features.parquet" + crypto_path = base / "merged" / "features" / "crypto_features.parquet" + stocks_path = base / "merged" / "features" / "stocks_features.parquet" + separate_features(merged_path, crypto_path, stocks_path) diff --git a/src/merge/stocks_data_filler.py b/src/merge/stocks_data_filler.py new file mode 100644 index 0000000000000000000000000000000000000000..888388f519833e26af14d8dc9cd2a76b991f314f --- /dev/null +++ b/src/merge/stocks_data_filler.py @@ -0,0 +1,438 @@ +import pandas as pd +import numpy as np +from sklearn.impute import KNNImputer +from sklearn.preprocessing import StandardScaler +import warnings +warnings.filterwarnings('ignore') + +class ImprovedStockDataImputer: + """ + Enhanced imputation that prevents data homogenization by using + symbol-specific patterns and relationships. 
+ """ + + def __init__(self, preserve_symbol_diversity=True): + self.preserve_symbol_diversity = preserve_symbol_diversity + self.symbol_profiles = {} + self.scalers = {} + + def _create_symbol_profiles(self, df): + """Create profiles for each symbol to guide imputation.""" + profiles = {} + + for symbol in df['symbol'].unique(): + symbol_data = df[df['symbol'] == symbol] + + # Calculate symbol-specific statistics with proper null handling + price_col = None + for col in ['price', 'close', 'close_alpaca', 'open', 'high', 'low']: + if col in symbol_data.columns and not symbol_data[col].isnull().all(): + price_col = col + break + + volume_col = None + for col in ['volume', 'volume_alpaca']: + if col in symbol_data.columns and not symbol_data[col].isnull().all(): + volume_col = col + break + + profile = { + 'symbol': symbol, + 'price_level': symbol_data[price_col].median() if price_col else 100.0, # Default to 100 + 'price_volatility': symbol_data[price_col].std() if price_col else 2.0, # Default volatility + 'volume_level': symbol_data[volume_col].median() if volume_col else 1000.0, # Default volume + 'is_crypto': symbol_data['is_crypto'].mode().iloc[0] if 'is_crypto' in symbol_data.columns and not symbol_data['is_crypto'].isnull().all() else 0, + 'typical_rsi': symbol_data['rsi'].median() if 'rsi' in symbol_data.columns and not symbol_data['rsi'].isnull().all() else 50.0, + 'data_availability': len(symbol_data) / len(df) if len(df) > 0 else 0 + } + + # Ensure no None values in profile + for key, value in profile.items(): + if value is None or (isinstance(value, float) and np.isnan(value)): + if key == 'price_level': + profile[key] = 100.0 + elif key == 'price_volatility': + profile[key] = 2.0 + elif key == 'volume_level': + profile[key] = 1000.0 + elif key == 'typical_rsi': + profile[key] = 50.0 + elif key == 'is_crypto': + profile[key] = 0 + else: + profile[key] = 0.0 + + profiles[symbol] = profile + + return profiles + + def _impute_with_symbol_context(self, df, column, symbol_profiles): + """Impute values using symbol-specific context to prevent homogenization.""" + + df_result = df.copy() + + for symbol in df['symbol'].unique(): + symbol_mask = df['symbol'] == symbol + symbol_data = df.loc[symbol_mask, column] + + if symbol_data.isnull().sum() == 0: + continue # No missing values for this symbol + + profile = symbol_profiles.get(symbol, {}) + + # Strategy depends on column type and symbol characteristics + if column in ['price', 'open', 'high', 'low', 'close']: + # Price data - use interpolation with symbol-specific bounds + interpolated = symbol_data.interpolate(method='linear', limit_direction='both') + + # If still missing, use symbol's typical price level with noise + if interpolated.isnull().any(): + base_price = profile.get('price_level', 100.0) + volatility = profile.get('price_volatility', base_price * 0.02) + + # Add symbol-specific noise to prevent identical values + symbol_hash = hash(symbol) % 1000 / 1000 # 0-1 range + noise_factor = (symbol_hash - 0.5) * 0.1 # -5% to +5% + adjusted_price = base_price * (1 + noise_factor) + + interpolated = interpolated.fillna(adjusted_price) + + df_result.loc[symbol_mask, column] = interpolated + + elif column in ['volume', 'volume_alpaca']: + # Volume data - use forward fill then symbol-specific median + filled = symbol_data.fillna(method='ffill').fillna(method='bfill') + + if filled.isnull().any(): + # Use symbol's typical volume with variation + base_volume = profile.get('volume_level', 1000.0) + symbol_hash = hash(symbol + column) % 
1000 / 1000 + volume_multiplier = 0.5 + symbol_hash # 0.5x to 1.5x variation + adjusted_volume = base_volume * volume_multiplier + filled = filled.fillna(adjusted_volume) + + df_result.loc[symbol_mask, column] = filled + + elif column in ['rsi', 'stoch_k', 'stoch_d']: + # Oscillator indicators - use symbol-specific typical values + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + # Use symbol-specific baseline with variation + symbol_hash = hash(symbol + column) % 1000 / 1000 + if column == 'rsi': + # RSI: 30-70 range with symbol variation + baseline = 30 + (symbol_hash * 40) # 30-70 range + else: # stochastic + baseline = 20 + (symbol_hash * 60) # 20-80 range + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column in ['macd', 'macd_signal', 'macd_histogram']: + # MACD - can be positive/negative, use symbol-specific pattern + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + # Use price-level dependent MACD estimation with null safety + price_level = profile.get('price_level', 100.0) # Default to 100 if None + if price_level is None or np.isnan(price_level): + price_level = 100.0 + + symbol_hash = hash(symbol + column) % 2000 / 1000 - 1 # -1 to +1 + # Scale MACD relative to price level + baseline = (price_level * 0.001) * symbol_hash + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + else: + # Generic numeric imputation with symbol variation + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + # Use overall median but add symbol-specific variation + overall_median = df[column].median() + if pd.isna(overall_median): + overall_median = 0 + + # Add symbol-specific variation (±10%) + symbol_hash = hash(symbol + column) % 2000 / 1000 - 1 # -1 to +1 + variation = overall_median * 0.1 * symbol_hash + baseline = overall_median + variation + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + return df_result[column] + + def fit_transform(self, df): + """Apply improved imputation with anti-homogenization measures.""" + + df_imputed = df.copy() + df_imputed = df_imputed.sort_values(['symbol', 'interval_timestamp']) + + # Create symbol profiles + self.symbol_profiles = self._create_symbol_profiles(df_imputed) + + print(f"Created profiles for {len(self.symbol_profiles)} unique symbols") + + # 1. Handle categorical/flag columns (same as before) + categorical_cols = [ + 'symbol', 'stock_market', 'is_crypto', 'is_stock', 'is_other', + 'alpaca_data_available', 'is_trading_hours', 'is_weekend' + ] + + for col in categorical_cols: + if col in df_imputed.columns: + df_imputed[col] = df_imputed.groupby('symbol')[col].fillna(method='ffill').fillna(method='bfill') + + # 2. Price and volume data - symbol-specific imputation + price_volume_cols = [ + 'price', 'open', 'high', 'low', 'close', 'volume', + 'open_alpaca', 'high_alpaca', 'low_alpaca', 'close_alpaca', 'volume_alpaca', + 'bid_price', 'ask_price', 'bid_price_alpaca', 'ask_price_alpaca', 'price_alpaca' + ] + + for col in price_volume_cols: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + print(f"Imputing {col} with symbol-specific context...") + df_imputed[col] = self._impute_with_symbol_context( + df_imputed, col, self.symbol_profiles + ) + + # 3. 
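# --- Illustrative sketch (not part of the patch): the per-symbol jitter above relies on
# Python's built-in hash(), which is salted per interpreter run for str inputs unless
# PYTHONHASHSEED is pinned, so the imputed fallback values can differ between runs.
# A stable alternative (an assumption, not part of this patch) derives the 0-1 factor from hashlib:
import hashlib

def stable_unit_hash(key: str) -> float:
    """Deterministic value in [0, 1) derived from an md5 digest of the key."""
    return int(hashlib.md5(key.encode("utf-8")).hexdigest()[:8], 16) / 0x100000000

noise_factor = (stable_unit_hash("AAPL" + "close") - 0.5) * 0.1   # reproducible -5%..+5% tweak
print(round(noise_factor, 4))
# --- end sketch ---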
Technical indicators - symbol-specific imputation + tech_indicators = [ + 'rsi', 'macd', 'macd_signal', 'macd_histogram', 'atr', 'bb_position', + 'stoch_k', 'stoch_d', 'cci', 'roc_5', 'roc_10', 'mfi', 'rsi_macd_signal', + 'ema_convergence', 'true_range_pct' + ] + + for col in tech_indicators: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + print(f"Imputing {col} with symbol-specific context...") + df_imputed[col] = self._impute_with_symbol_context( + df_imputed, col, self.symbol_profiles + ) + + # 4. Volume/price change features - symbol-specific + change_features = [ + 'price_change_1', 'price_change_7', 'price_change_14', 'volume_ratio', + 'volatility_7', 'price_volume_trend', 'volatility_consistency' + ] + + for col in change_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + df_imputed[col] = self._impute_with_symbol_context( + df_imputed, col, self.symbol_profiles + ) + + # 5. On-chain features (crypto only) + onchain_features = [ + 'total_fees', 'total_gas_used', 'avg_gas_price', 'tx_count_7d_change', + 'tx_count_sma_7', 'tx_volume_7d_change', 'tx_volume_sma_7', + 'gas_used_7d_change', 'gas_used_sma_7', 'gas_price_7d_change', + 'gas_price_sma_7', 'fees_7d_change', 'avg_tx_size' + ] + + for col in onchain_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + # Only impute for crypto assets + crypto_mask = df_imputed['is_crypto'] == 1 + non_crypto_mask = df_imputed['is_crypto'] != 1 + + if crypto_mask.any(): + crypto_data = df_imputed.loc[crypto_mask] + crypto_imputed = self._impute_with_symbol_context( + crypto_data, col, self.symbol_profiles + ) + df_imputed.loc[crypto_mask, col] = crypto_imputed + + # Fill non-crypto with 0 + df_imputed.loc[non_crypto_mask, col] = df_imputed.loc[non_crypto_mask, col].fillna(0) + + # 6. 
Handle remaining columns with simple strategies + remaining_strategies = { + 'quality_metrics': [ + 'data_quality_score', 'core_features_completeness', 'technical_indicators_completeness', + 'onchain_features_completeness', 'price_data_completeness', + 'overall_feature_completeness', 'data_completeness_score' + ], + 'news_sentiment': [ + 'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', + 'news_sentiment_max', 'news_sentiment_range', 'news_match_score_mean', + 'news_match_score_max', 'news_mentions_count', 'news_articles_count', + 'news_highlights_count', 'news_activity_score', 'sentiment_score' + ], + 'zero_fill': [ + 'trade_count', 'trade_count_alpaca', 'bid_size', 'ask_size', + 'bid_size_alpaca', 'ask_size_alpaca', 'size', 'size_alpaca' + ] + } + + # Quality metrics - use median but add small variation + for col in remaining_strategies['quality_metrics']: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + median_val = df_imputed[col].median() + if pd.isna(median_val): + median_val = 0.5 # Default for quality metrics + median_val = np.clip(median_val, 0, 1) + + # Add tiny symbol-specific variation + for symbol in df_imputed['symbol'].unique(): + mask = df_imputed['symbol'] == symbol + symbol_hash = hash(symbol + col) % 100 / 10000 # Very small variation + fill_val = np.clip(median_val + symbol_hash, 0, 1) + df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(fill_val) + + # News sentiment - neutral with symbol variation + for col in remaining_strategies['news_sentiment']: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + if 'sentiment' in col.lower(): + # Slight variation around neutral + for symbol in df_imputed['symbol'].unique(): + mask = df_imputed['symbol'] == symbol + symbol_hash = (hash(symbol + col) % 200 / 1000) - 0.1 # -0.1 to +0.1 + df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(symbol_hash) + elif 'count' in col.lower(): + df_imputed[col] = df_imputed[col].fillna(0) + else: + median_val = df_imputed[col].median() + if pd.isna(median_val): + median_val = 0 + df_imputed[col] = df_imputed[col].fillna(median_val) + + # Zero fill + for col in remaining_strategies['zero_fill']: + if col in df_imputed.columns: + df_imputed[col] = df_imputed[col].fillna(0) + + # Handle any remaining columns + remaining_numeric = df_imputed.select_dtypes(include=[np.number]).columns + remaining_with_nulls = [col for col in remaining_numeric if df_imputed[col].isnull().any()] + + for col in remaining_with_nulls: + if col not in ['id', 'id_alpaca', 'backup_id']: + print(f"Imputing remaining column: {col}") + df_imputed[col] = self._impute_with_symbol_context( + df_imputed, col, self.symbol_profiles + ) + + print("[INFO] Imputation complete with anti-homogenization measures") + print(f"[INFO] Final null counts: {df_imputed.isnull().sum().sum()}") + return df_imputed + +# Usage function with validation +def impute_with_validation(file_path, output_path=None): + """Impute data and validate no homogenization occurred.""" + + try: + print(f"[INFO] Loading data from: {file_path}") + df = pd.read_parquet(file_path) + print(f"[INFO] Loaded data shape: {df.shape}") + print(f"[INFO] Initial null counts: {df.isnull().sum().sum()}") + except Exception as e: + print(f"[ERROR] Failed to load data: {e}") + return None + + # Sample symbols for validation + symbols_sample = df['symbol'].unique()[:5] + print(f"[INFO] Processing {len(df['symbol'].unique())} unique symbols") + + # Initialize and run imputer + imputer = ImprovedStockDataImputer() + 
df_imputed = imputer.fit_transform(df) + + # Combine alpaca data with main data where available + alpaca_combinations = [ + ('high', 'high_alpaca'), + ('low', 'low_alpaca'), + ('close', 'close_alpaca'), + ('open', 'open_alpaca'), + ('volume', 'volume_alpaca') + ] + + for main_col, alpaca_col in alpaca_combinations: + if main_col in df_imputed.columns and alpaca_col in df_imputed.columns: + df_imputed[main_col] = df_imputed[main_col].combine_first(df_imputed[alpaca_col]) + print(f"[INFO] Combined {main_col} with {alpaca_col}") + + # Drop unwanted columns before saving + drop_cols = [ + '_filename', '_original_format', 'alpaca_data_available', + 'ask_exchange', 'ask_exchange_alpaca', + 'bid_exchange', 'bid_exchange_alpaca', + 'conditions', 'conditions_alpaca', 'conditions_trade', 'conditions_trade_alpaca', + 'symbol_quote', 'symbol_quote_alpaca', 'symbol_trade', 'symbol_trade_alpaca', + 'tape', 'tape_alpaca', 'tape_trade', 'tape_trade_alpaca', + 'id', 'id_alpaca', + 'is_new_symbol', 'price', 'timestamp_dt', + 'alpaca_merge_timestamp', 'timestamp', 'timestamp_alpaca', + 'estimateCurrency', 'exchange', 'exchange_alpaca', 'exchange_company', + 'finnhubIndustry', 'headline', + 'sentiment_timestamp', 'logo', + 'ticker', 'stock_market', + 'weburl', 'latest_news_timestamp', 'day_of_week', 'feature_timestamp', + 'interval_timestamp_dt', 'is_crypto', 'is_other', 'is_stock', + 'country', 'currency', 'datetime', 'ipo', 'name', 'period', 'phone', + 'year', 'month', 'latest_news_timestamp_x', 'latest_news_timestamp_y' + ] + + original_cols = len(df_imputed.columns) + for col in drop_cols: + if col in df_imputed.columns: + df_imputed = df_imputed.drop(columns=col) + + print(f"[INFO] Dropped {original_cols - len(df_imputed.columns)} unwanted columns") + + # Reorder columns: 'symbol' first, 'interval_timestamp' second, rest follow + cols = list(df_imputed.columns) + if 'symbol' in cols and 'interval_timestamp' in cols: + rest = [c for c in cols if c not in ['symbol', 'interval_timestamp']] + df_imputed = df_imputed[['symbol', 'interval_timestamp'] + rest] + print("[INFO] Reordered columns with symbol and interval_timestamp first") + + # Save results + if output_path: + # Clean up data types + if 'backup_id' in df_imputed.columns: + df_imputed['backup_id'] = df_imputed['backup_id'].astype(str) + + try: + df_imputed.to_parquet(output_path, compression='snappy') + print(f"[INFO] Successfully saved imputed data to: {output_path}") + except Exception as e: + print(f"[ERROR] Failed to save data: {e}") + return None + + print(f"[INFO] Final dataset shape: {df_imputed.shape}") + return df_imputed + +# Example usage +def main(): + input_file = "data/merged/features/stocks_features.parquet" + output_file = input_file + + print("[INFO] Starting stock data imputation process...") + df_clean = impute_with_validation(input_file, output_file) + + if df_clean is not None: + print(f"[INFO] Data imputation completed successfully!") + print(f"[INFO] Final shape: {df_clean.shape}") + print(f"[INFO] Remaining nulls: {df_clean.isnull().sum().sum()}") + + # Quick validation + print("\n=== VALIDATION SUMMARY ===") + print(f"Unique symbols: {df_clean['symbol'].nunique()}") + if 'close' in df_clean.columns: + print(f"Price range: ${df_clean['close'].min():.2f} - ${df_clean['close'].max():.2f}") + if 'volume' in df_clean.columns: + print(f"Volume range: {df_clean['volume'].min():.0f} - {df_clean['volume'].max():.0f}") + else: + print("[ERROR] Failed to load or impute data.") + +if __name__ == "__main__": + main() diff --git 
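# --- Illustrative sketch (not part of the patch): how combine_first() backfills the primary
# OHLCV columns from their *_alpaca counterparts above, shown on two toy Series.
import numpy as np
import pandas as pd

close = pd.Series([150.0, np.nan, 300.0])
close_alpaca = pd.Series([149.9, 151.2, np.nan])
print(close.combine_first(close_alpaca).tolist())   # [150.0, 151.2, 300.0] -> gaps filled, existing values kept
# --- end sketch ---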
a/src/merge/test_enhanced_null_handling.py b/src/merge/test_enhanced_null_handling.py new file mode 100644 index 0000000000000000000000000000000000000000..61aa738bffb3604b995cb9174c564e5eaa3321e8 --- /dev/null +++ b/src/merge/test_enhanced_null_handling.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +""" +Test script for the enhanced symbol-first null handling strategy +""" + +import pandas as pd +import numpy as np +import sys +from pathlib import Path +import json + +# Add the merge directory to path +sys.path.append(str(Path(__file__).parent.parent)) + +from final_null_handler import FinalNullValueHandler + +def create_realistic_test_data(): + """Create realistic test data with temporal patterns and symbol-specific characteristics""" + + # Create timestamps for the last 30 days + timestamps = pd.date_range(start='2025-07-01', end='2025-07-30', freq='1H') + timestamp_ms = (timestamps.astype(np.int64) // 10**6).tolist() + + symbols = ['bitcoin', 'ethereum', 'AAPL', 'GOOGL'] + data = [] + + for symbol in symbols: + for i, ts in enumerate(timestamp_ms[:100]): # 100 records per symbol + + if symbol in ['bitcoin', 'ethereum']: + # Crypto data + base_price = 50000 if symbol == 'bitcoin' else 3000 + price_trend = i * 10 # Upward trend + price = base_price + price_trend + np.random.normal(0, 500) + + record = { + 'symbol': symbol, + 'interval_timestamp': ts, + 'price': price if np.random.random() > 0.2 else np.nan, # 20% nulls + 'volume': price * 1000 + np.random.normal(0, 100000) if np.random.random() > 0.15 else np.nan, + 'marketcap': price * 19000000 if np.random.random() > 0.3 else np.nan, + 'dominance': (0.4 if symbol == 'bitcoin' else 0.15) + np.random.normal(0, 0.02) if np.random.random() > 0.25 else np.nan, + 'rank': 1 if symbol == 'bitcoin' else 2, + 'performance.day': np.random.normal(0, 2) if np.random.random() > 0.2 else np.nan, + 'performance.week': np.random.normal(0, 5) if np.random.random() > 0.3 else np.nan, + 'exchangePrices.binance': price * 1.001 if np.random.random() > 0.4 else np.nan, + 'exchangePrices.coinbase': price * 0.999 if np.random.random() > 0.4 else np.nan, + 'rsi': 50 + np.random.normal(0, 10) if np.random.random() > 0.2 else np.nan, + 'macd': np.random.normal(0, 1) if np.random.random() > 0.25 else np.nan, + 'transaction_count': 1000 + i * 5 + np.random.normal(0, 100) if np.random.random() > 0.3 else np.nan, + 'stable': False + } + else: + # Stock data + base_price = 150 if symbol == 'AAPL' else 2800 + price_trend = i * 0.5 # Modest upward trend + price = base_price + price_trend + np.random.normal(0, 5) + + record = { + 'symbol': symbol, + 'interval_timestamp': ts, + 'close': price if np.random.random() > 0.2 else np.nan, + 'open': price * 0.995 if np.random.random() > 0.2 else np.nan, + 'high': price * 1.02 if np.random.random() > 0.15 else np.nan, + 'low': price * 0.98 if np.random.random() > 0.15 else np.nan, + 'volume': 1000000 + np.random.normal(0, 100000) if np.random.random() > 0.2 else np.nan, + 'prev_close': price * 0.99 if np.random.random() > 0.25 else np.nan, + 'marketCapitalization': price * 15000000000 if np.random.random() > 0.3 else np.nan, + 'shareOutstanding': 15000000000 if np.random.random() > 0.1 else np.nan, + 'rsi': 50 + np.random.normal(0, 15) if np.random.random() > 0.2 else np.nan, + 'macd': np.random.normal(0, 0.5) if np.random.random() > 0.25 else np.nan, + 'news_sentiment_mean_x': 0.5 + np.random.normal(0, 0.2) if np.random.random() > 0.4 else np.nan, + 'buy': np.random.randint(3, 8) if np.random.random() > 0.3 else np.nan, + 'hold': 
np.random.randint(8, 15) if np.random.random() > 0.3 else np.nan, + 'sell': np.random.randint(1, 4) if np.random.random() > 0.3 else np.nan, + } + + data.append(record) + + return pd.DataFrame(data) + +def test_symbol_first_strategy(): + """Test the symbol-first null handling strategy""" + print("="*70) + print("TESTING ENHANCED SYMBOL-FIRST NULL HANDLING STRATEGY") + print("="*70) + + # Create realistic test data + print("Creating realistic test data with temporal patterns...") + df = create_realistic_test_data() + + print(f"Created dataset with {len(df)} rows and {len(df.columns)} columns") + print(f"Symbols: {df['symbol'].unique()}") + print(f"Date range: {pd.to_datetime(df['interval_timestamp'], unit='ms').min()} to {pd.to_datetime(df['interval_timestamp'], unit='ms').max()}") + + # Analyze null patterns before processing + print(f"\nNULL ANALYSIS BEFORE PROCESSING:") + total_nulls_before = df.isnull().sum().sum() + print(f"Total nulls: {total_nulls_before}") + + symbol_nulls_before = {} + for symbol in df['symbol'].unique(): + symbol_data = df[df['symbol'] == symbol] + symbol_nulls = symbol_data.isnull().sum().sum() + symbol_nulls_before[symbol] = symbol_nulls + print(f" {symbol}: {symbol_nulls} nulls ({symbol_nulls/len(symbol_data)/len(df.columns)*100:.1f}% of symbol data)") + + # Test the enhanced handler + print(f"\nTESTING ENHANCED NULL HANDLER...") + handler = FinalNullValueHandler() + + # Separate crypto and stock data for targeted processing + crypto_mask = df['symbol'].isin(['bitcoin', 'ethereum']) + stock_mask = df['symbol'].isin(['AAPL', 'GOOGL']) + + results = {} + + if crypto_mask.any(): + print(f"\nProcessing crypto data ({crypto_mask.sum()} rows)...") + df_crypto = df[crypto_mask].copy() + df_crypto_processed = handler.process_crypto_features(df_crypto) + df.loc[crypto_mask] = df_crypto_processed + + crypto_nulls_after = df_crypto_processed.isnull().sum().sum() + results['crypto'] = { + 'nulls_before': df_crypto.isnull().sum().sum(), + 'nulls_after': crypto_nulls_after, + 'symbols': ['bitcoin', 'ethereum'] + } + + if stock_mask.any(): + print(f"\nProcessing stock data ({stock_mask.sum()} rows)...") + df_stock = df[stock_mask].copy() + df_stock_processed = handler.process_stock_features(df_stock) + df.loc[stock_mask] = df_stock_processed + + stock_nulls_after = df_stock_processed.isnull().sum().sum() + results['stock'] = { + 'nulls_before': df_stock.isnull().sum().sum(), + 'nulls_after': stock_nulls_after, + 'symbols': ['AAPL', 'GOOGL'] + } + + # Analyze results + print(f"\nRESULTS ANALYSIS:") + total_nulls_after = df.isnull().sum().sum() + print(f"Total nulls after: {total_nulls_after} (reduced by {total_nulls_before - total_nulls_after})") + + for asset_type, result in results.items(): + nulls_filled = result['nulls_before'] - result['nulls_after'] + fill_rate = (nulls_filled / result['nulls_before'] * 100) if result['nulls_before'] > 0 else 0 + print(f" {asset_type.upper()}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)") + + # Symbol-level analysis + print(f"\nSYMBOL-LEVEL ANALYSIS:") + for symbol in df['symbol'].unique(): + symbol_data = df[df['symbol'] == symbol] + nulls_after = symbol_data.isnull().sum().sum() + nulls_filled = symbol_nulls_before[symbol] - nulls_after + fill_rate = (nulls_filled / symbol_nulls_before[symbol] * 100) if symbol_nulls_before[symbol] > 0 else 0 + print(f" {symbol}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)") + + # Quality checks + print(f"\nQUALITY CHECKS:") + infinite_values = 
np.isinf(df.select_dtypes(include=[np.number])).sum().sum() + print(f" Infinite values: {infinite_values}") + print(f" Data types preserved: {len(df.dtypes) == len(create_realistic_test_data().dtypes)}") + + # Test temporal interpolation effectiveness + print(f"\nTEMPORAL INTERPOLATION TEST:") + for symbol in df['symbol'].unique(): + symbol_data = df[df['symbol'] == symbol].sort_values('interval_timestamp') + if 'price' in symbol_data.columns: + price_series = symbol_data['price'] + if len(price_series.dropna()) >= 2: + # Check if we have reasonable price progression + price_diff = price_series.dropna().diff().abs().mean() + print(f" {symbol}: Average price change = {price_diff:.2f} (reasonable interpolation)") + + # Overall success assessment + success = (total_nulls_after == 0 and + infinite_values == 0 and + all(result['nulls_after'] < result['nulls_before'] for result in results.values())) + + if success: + print(f"\n✅ ENHANCED SYMBOL-FIRST STRATEGY TEST PASSED!") + print(f" - All nulls handled successfully") + print(f" - No infinite values introduced") + print(f" - Symbol-specific patterns preserved") + print(f" - Temporal interpolation working") + return True + else: + print(f"\n❌ Test failed - review results above") + return False + +def main(): + """Main test function""" + try: + success = test_symbol_first_strategy() + return 0 if success else 1 + except Exception as e: + print(f"❌ Test failed with error: {str(e)}") + import traceback + traceback.print_exc() + return 1 + +if __name__ == "__main__": + exit_code = main() + sys.exit(exit_code) diff --git a/src/merge/test_null_filling_merge.py b/src/merge/test_null_filling_merge.py new file mode 100644 index 0000000000000000000000000000000000000000..36566a979ba154172fc14e00101d104535f7a28b --- /dev/null +++ b/src/merge/test_null_filling_merge.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +""" +Test script for null filling during merge operations +""" + +import pandas as pd +import numpy as np +import os +import sys +from pathlib import Path + +# Add the merge directory to path +sys.path.append(str(Path(__file__).parent)) + +from merge_temp import fill_nulls_from_temp + +def create_test_data(): + """Create test data with strategic null values""" + + # Create merged data with some null values + merged_data = { + 'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH'], + 'interval_timestamp': [1640995200000, 1640995260000, 1640995200000, 1640995260000, 1640995200000], + 'price': [150.0, np.nan, 50000.0, np.nan, 4000.0], # AAPL and BTC have nulls + 'volume': [1000000, 1200000, np.nan, 800000, np.nan], # BTC and ETH have nulls + 'rsi': [65.0, np.nan, 70.0, 45.0, np.nan], # AAPL and ETH have nulls + 'macd': [1.5, 1.8, np.nan, -0.5, 2.1] # BTC has null + } + df_merged = pd.DataFrame(merged_data) + + # Create temp data that can fill some of the nulls + temp_data = { + 'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH', 'GOOGL'], + 'interval_timestamp': [1640995200000, 1640995260000, 1640995200000, 1640995260000, 1640995200000, 1640995200000], + 'price': [149.5, 152.3, 49950.0, 51200.0, 3980.0, 2850.0], # Can fill AAPL and BTC nulls + 'volume': [950000, 1150000, 2000000, 780000, 500000, 400000], # Can fill BTC and ETH nulls + 'rsi': [64.0, 67.0, 69.5, 44.0, 55.0, 60.0], # Can fill AAPL and ETH nulls + 'macd': [1.4, 1.9, 15.2, -0.6, 2.0, 0.8], # Can fill BTC null + 'new_feature': [100, 200, 300, 400, 500, 600] # New feature not in merged + } + df_temp = pd.DataFrame(temp_data) + + return df_merged, df_temp + +def test_null_filling(): + """Test the null 
filling functionality""" + print("="*60) + print("TESTING NULL FILLING DURING MERGE") + print("="*60) + + # Create test data + df_merged, df_temp = create_test_data() + + print("BEFORE NULL FILLING:") + print(f"Merged data shape: {df_merged.shape}") + print(f"Temp data shape: {df_temp.shape}") + print(f"Nulls in merged data: {df_merged.isnull().sum().sum()}") + print("\nNull values by column in merged data:") + for col in df_merged.columns: + null_count = df_merged[col].isnull().sum() + if null_count > 0: + print(f" {col}: {null_count} nulls") + + print(f"\nMerged data preview:") + print(df_merged.to_string()) + print(f"\nTemp data preview:") + print(df_temp.to_string()) + + # Test the null filling function + df_merged_copy = df_merged.copy() + nulls_filled = fill_nulls_from_temp(df_merged_copy, df_temp) + + print(f"\nAFTER NULL FILLING:") + print(f"Nulls filled: {nulls_filled}") + print(f"Remaining nulls: {df_merged_copy.isnull().sum().sum()}") + print("\nRemaining null values by column:") + for col in df_merged_copy.columns: + null_count = df_merged_copy[col].isnull().sum() + if null_count > 0: + print(f" {col}: {null_count} nulls") + + print(f"\nFilled data preview:") + print(df_merged_copy.to_string()) + + # Verify specific cases + print(f"\nVERIFICATION:") + + # Check AAPL price at timestamp 1640995260000 (should be filled) + aapl_price = df_merged_copy[(df_merged_copy['symbol'] == 'AAPL') & + (df_merged_copy['interval_timestamp'] == 1640995260000)]['price'].iloc[0] + print(f"AAPL price at 1640995260000: {aapl_price} (should be 152.3)") + + # Check BTC volume at timestamp 1640995200000 (should be filled) + btc_volume = df_merged_copy[(df_merged_copy['symbol'] == 'BTC') & + (df_merged_copy['interval_timestamp'] == 1640995200000)]['volume'].iloc[0] + print(f"BTC volume at 1640995200000: {btc_volume} (should be 2000000)") + + # Check if new features are NOT added (function should only fill existing columns) + has_new_feature = 'new_feature' in df_merged_copy.columns + print(f"New feature added: {has_new_feature} (should be False)") + + # Calculate success rate + original_nulls = df_merged.isnull().sum().sum() + remaining_nulls = df_merged_copy.isnull().sum().sum() + filled_nulls = original_nulls - remaining_nulls + + if filled_nulls == nulls_filled: + print(f"✅ Null counting is consistent: {filled_nulls} nulls filled") + else: + print(f"❌ Null counting mismatch: reported {nulls_filled}, actual {filled_nulls}") + + if nulls_filled > 0: + fill_rate = (nulls_filled / original_nulls) * 100 + print(f"✅ Fill rate: {fill_rate:.1f}% ({nulls_filled}/{original_nulls})") + return True + else: + print("❌ No nulls were filled") + return False + +def test_edge_cases(): + """Test edge cases for null filling""" + print(f"\n" + "="*60) + print("TESTING EDGE CASES") + print("="*60) + + # Test with empty dataframes + df_empty = pd.DataFrame() + df_test = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [1]}) + + print("Test 1: Empty merged dataframe") + nulls_filled = fill_nulls_from_temp(df_empty, df_test) + print(f"Nulls filled: {nulls_filled} (should be 0)") + + print("Test 2: Empty temp dataframe") + df_with_nulls = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [np.nan]}) + nulls_filled = fill_nulls_from_temp(df_with_nulls, df_empty) + print(f"Nulls filled: {nulls_filled} (should be 0)") + + # Test with no matching keys + print("Test 3: No matching symbol+timestamp combinations") + df_merged_nomatch = pd.DataFrame({ + 'symbol': ['A'], + 'interval_timestamp': 
[111], + 'value': [np.nan] + }) + df_temp_nomatch = pd.DataFrame({ + 'symbol': ['B'], + 'interval_timestamp': [222], + 'value': [100] + }) + nulls_filled = fill_nulls_from_temp(df_merged_nomatch, df_temp_nomatch) + print(f"Nulls filled: {nulls_filled} (should be 0)") + + # Test with no common columns + print("Test 4: No common columns") + df_merged_nocols = pd.DataFrame({ + 'symbol': ['A'], + 'interval_timestamp': [123], + 'col1': [np.nan] + }) + df_temp_nocols = pd.DataFrame({ + 'symbol': ['A'], + 'interval_timestamp': [123], + 'col2': [100] + }) + nulls_filled = fill_nulls_from_temp(df_merged_nocols, df_temp_nocols) + print(f"Nulls filled: {nulls_filled} (should be 0)") + + print("✅ All edge case tests completed") + +def main(): + """Run all tests""" + success = test_null_filling() + test_edge_cases() + + print(f"\n" + "="*60) + print("TEST SUMMARY") + print("="*60) + + if success: + print("🎉 Null filling functionality is working correctly!") + return 0 + else: + print("❌ Null filling functionality has issues") + return 1 + +if __name__ == "__main__": + exit_code = main() + sys.exit(exit_code) diff --git a/src/utils/symbol_normalizer.py b/src/utils/symbol_normalizer.py new file mode 100644 index 0000000000000000000000000000000000000000..2cb2623acc473a121fd7a75190bf5134953e8a5a --- /dev/null +++ b/src/utils/symbol_normalizer.py @@ -0,0 +1,233 @@ +""" +Crypto Symbol Normalizer +======================== + +Provides consistent symbol normalization across all data fetchers and mergers. +This ensures that different representations of the same cryptocurrency (e.g., XRP vs ripple) +are treated consistently throughout the entire pipeline. + +Features: +- Maps various symbol formats to canonical identifiers +- Supports both short symbols (BTC, ETH) and long names (bitcoin, ethereum) +- Case-insensitive matching +- Logging for debugging normalization process + +Author: AI Assistant +Date: August 2025 +""" + +import logging +from typing import Dict, List, Set + +logger = logging.getLogger(__name__) + +class CryptoSymbolNormalizer: + """ + Centralized crypto symbol normalization for consistent asset identification + """ + + def __init__(self): + """Initialize the symbol normalizer with predefined mappings""" + self.symbol_mapping = self._build_symbol_mapping() + logger.info(f"Initialized CryptoSymbolNormalizer with {len(self.symbol_mapping)} mappings") + + def _build_symbol_mapping(self) -> Dict[str, str]: + """ + Build comprehensive symbol mapping dictionary + + Returns: + Dictionary mapping various symbol formats to canonical slugs + """ + # Canonical mapping for major crypto assets + # Maps various symbols/names to the official canonical identifier + symbol_mapping = { + # Bitcoin variants + 'bitcoin': 'bitcoin', + 'btc': 'bitcoin', + 'Bitcoin': 'bitcoin', + 'BTC': 'bitcoin', + + # Ethereum variants + 'ethereum': 'ethereum', + 'eth': 'ethereum', + 'Ethereum': 'ethereum', + 'ETH': 'ethereum', + + # Ripple/XRP variants (canonical: ripple for Santiment) + 'ripple': 'ripple', + 'xrp': 'ripple', + 'Ripple': 'ripple', + 'XRP': 'ripple', + + # Solana variants (canonical: solana for Santiment) + 'solana': 'solana', + 'sol': 'solana', + 'Solana': 'solana', + 'SOL': 'solana', + + # Cardano variants (canonical: cardano for Santiment) + 'cardano': 'cardano', + 'ada': 'cardano', + 'Cardano': 'cardano', + 'ADA': 'cardano', + + # Polkadot variants + 'polkadot': 'polkadot', + 'dot': 'polkadot', + 'Polkadot': 'polkadot', + 'DOT': 'polkadot', + + # Chainlink variants + 'chainlink': 'chainlink', + 'link': 
'chainlink', + 'Chainlink': 'chainlink', + 'LINK': 'chainlink', + + # Litecoin variants + 'litecoin': 'litecoin', + 'ltc': 'litecoin', + 'Litecoin': 'litecoin', + 'LTC': 'litecoin', + + # Bitcoin Cash variants + 'bitcoin-cash': 'bitcoin-cash', + 'bch': 'bitcoin-cash', + 'Bitcoin Cash': 'bitcoin-cash', + 'BCH': 'bitcoin-cash', + + # Stellar variants + 'stellar': 'stellar', + 'xlm': 'stellar', + 'Stellar': 'stellar', + 'XLM': 'stellar', + + # Ethereum Classic variants + 'ethereum-classic': 'ethereum-classic', + 'etc': 'ethereum-classic', + 'Ethereum Classic': 'ethereum-classic', + 'ETC': 'ethereum-classic', + + # EOS variants + 'eos': 'eos', + 'EOS': 'eos', + } + + return symbol_mapping + + def normalize(self, symbol: str) -> str: + """ + Normalize a symbol to its canonical identifier + + Args: + symbol: Symbol to normalize + + Returns: + Canonical identifier + """ + if symbol in self.symbol_mapping: + canonical = self.symbol_mapping[symbol] + if symbol != canonical: + logger.debug(f"Normalized '{symbol}' -> '{canonical}'") + return canonical + + # If not found in mapping, return as-is but log warning + logger.warning(f"Unknown symbol '{symbol}' not found in normalization mapping") + return symbol.lower() + + def normalize_list(self, symbols: List[str]) -> List[str]: + """ + Normalize a list of symbols and remove duplicates + + Args: + symbols: List of symbols to normalize + + Returns: + List of normalized, deduplicated symbols + """ + normalized = [] + seen = set() + + for symbol in symbols: + canonical = self.normalize(symbol) + if canonical not in seen: + normalized.append(canonical) + seen.add(canonical) + else: + logger.debug(f"Removed duplicate symbol: {symbol} (canonical: {canonical})") + + logger.info(f"Normalized {len(symbols)} symbols to {len(normalized)} unique canonical symbols") + return normalized + + def get_all_variants(self, canonical_symbol: str) -> List[str]: + """ + Get all known variants for a canonical symbol + + Args: + canonical_symbol: The canonical symbol to find variants for + + Returns: + List of all variants that map to this canonical symbol + """ + variants = [key for key, value in self.symbol_mapping.items() + if value == canonical_symbol] + return variants + + def get_canonical_symbols(self) -> Set[str]: + """ + Get set of all canonical symbols + + Returns: + Set of canonical symbols + """ + return set(self.symbol_mapping.values()) + + def add_mapping(self, symbol: str, canonical: str): + """ + Add a new symbol mapping + + Args: + symbol: Symbol variant to add + canonical: Canonical symbol it maps to + """ + self.symbol_mapping[symbol] = canonical + logger.info(f"Added new mapping: '{symbol}' -> '{canonical}'") + + +# Global instance for easy access +_normalizer = None + +def get_normalizer() -> CryptoSymbolNormalizer: + """ + Get the global normalizer instance (singleton pattern) + + Returns: + CryptoSymbolNormalizer instance + """ + global _normalizer + if _normalizer is None: + _normalizer = CryptoSymbolNormalizer() + return _normalizer + +def normalize_symbol(symbol: str) -> str: + """ + Convenience function to normalize a single symbol + + Args: + symbol: Symbol to normalize + + Returns: + Canonical symbol + """ + return get_normalizer().normalize(symbol) + +def normalize_symbol_list(symbols: List[str]) -> List[str]: + """ + Convenience function to normalize a list of symbols + + Args: + symbols: List of symbols to normalize + + Returns: + List of normalized symbols + """ + return get_normalizer().normalize_list(symbols) diff --git a/src/vis/stocks.py 
b/src/vis/stocks.py new file mode 100644 index 0000000000000000000000000000000000000000..d3514a7a5dde4c59a053970a0acf1a185817f439 --- /dev/null +++ b/src/vis/stocks.py @@ -0,0 +1,45 @@ +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + +# Load normalized stock data (relative POSIX-style path so the script also runs inside the Linux container) +csv_path = 'data/merged/norm/stocks_features_improved_normalized.csv' +df = pd.read_csv(csv_path) + +# 1. Show basic info and head +print('Data shape:', df.shape) +print(df.head()) + +# 2. Feature distribution histograms +features = [ + 'price_momentum', 'volume_price_ratio', 'daily_range', 'avg_sentiment', 'technical_strength' +] +existing_features = [f for f in features if f in df.columns] +if existing_features: + df[existing_features].hist(bins=30, figsize=(12, 8)) + plt.suptitle('Feature Distributions') + plt.tight_layout() + plt.show() +else: + print('No engineered features found for distribution plots.') + +# 3. Correlation heatmap +if len(existing_features) > 1: + plt.figure(figsize=(8, 6)) + sns.heatmap(df[existing_features].corr(), annot=True, cmap='coolwarm') + plt.title('Feature Correlation Heatmap') + plt.show() + +# 4. Outlier boxplots for engineered features +for feat in existing_features: + plt.figure(figsize=(6, 2)) + sns.boxplot(x=df[feat]) + plt.title(f'Boxplot: {feat}') + plt.show() + +# 5. Pairplot (if you have a target column, e.g., "target") +# Uncomment and adjust if you have a target/label +# sns.pairplot(df, vars=existing_features, hue='target') +# plt.show() + +print('Visualization complete. You can add more plots as needed!') diff --git a/test_gradio.py b/test_gradio.py new file mode 100644 index 0000000000000000000000000000000000000000..f58e980442b35612cc938870ebc5e70062c80de8 --- /dev/null +++ b/test_gradio.py @@ -0,0 +1,9 @@ +import gradio as gr + +def hello(name): + return f"Hello {name}!" + +demo = gr.Interface(fn=hello, inputs="text", outputs="text") + +if __name__ == "__main__": + demo.launch(server_name="0.0.0.0", server_port=7860)
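
Note on src/merge/test_enhanced_null_handling.py: `FinalNullValueHandler.process_crypto_features` and `process_stock_features` come from `final_null_handler`, which is not part of this diff, so the tests only pin down behaviour (per-symbol processing, temporal interpolation, no infinities introduced). A minimal sketch of what a symbol-first temporal fill could look like under those assumptions; the names and logic here are illustrative, not the shipped handler:

```python
import numpy as np
import pandas as pd

def symbol_first_fill_sketch(df: pd.DataFrame) -> pd.DataFrame:
    """Illustrative only: per-symbol temporal fill, not the shipped FinalNullValueHandler."""
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    def _fill(group: pd.DataFrame) -> pd.DataFrame:
        group = group.sort_values("interval_timestamp").copy()
        group[numeric_cols] = (
            group[numeric_cols]
            .interpolate(method="linear")  # fill gaps between observed values
            .ffill()                       # carry the last observation forward
            .bfill()                       # backfill anything left at the series start
        )
        return group

    # Symbol-first: each asset's history is filled independently, so bitcoin's
    # prices never leak into AAPL's rows.
    return df.groupby("symbol", group_keys=False).apply(_fill)
```

The success criterion in `test_symbol_first_strategy` (zero remaining nulls) additionally implies a cross-column or default-value pass in the real handler, since interpolation alone cannot fill a column that is entirely null for a given symbol.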
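
Similarly, src/merge/test_null_filling_merge.py imports `fill_nulls_from_temp` from `merge_temp`, whose implementation is outside this diff. The tests fix its contract: match rows on (`symbol`, `interval_timestamp`), fill only nulls in columns the merged frame already has, ignore extra temp columns, and return the number of cells filled. A minimal sketch of that contract (assumed behaviour, not the real `merge_temp` code):

```python
import pandas as pd

def fill_nulls_from_temp_sketch(df_merged: pd.DataFrame, df_temp: pd.DataFrame) -> int:
    """Fill nulls in df_merged in place from df_temp; return cells filled. Sketch only."""
    keys = ["symbol", "interval_timestamp"]
    if df_merged.empty or df_temp.empty:
        return 0
    if not set(keys) <= set(df_merged.columns) or not set(keys) <= set(df_temp.columns):
        return 0

    # Only columns both frames share (beyond the join keys) are eligible;
    # extra temp columns such as 'new_feature' are never added.
    shared = [c for c in df_merged.columns if c in df_temp.columns and c not in keys]
    if not shared:
        return 0

    lookup = df_temp.drop_duplicates(subset=keys).set_index(keys)
    filled = 0
    for idx, row in df_merged.iterrows():
        key = (row["symbol"], row["interval_timestamp"])
        if key not in lookup.index:
            continue
        for col in shared:
            if pd.isna(df_merged.at[idx, col]) and not pd.isna(lookup.at[key, col]):
                df_merged.at[idx, col] = lookup.at[key, col]
                filled += 1
    return filled
```

A vectorised variant (e.g. aligning both frames with `set_index(keys)` and using `DataFrame.fillna`) would be faster on real data; the loop form simply mirrors the cell-level expectations the tests check.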
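
For completeness, this is how the new `symbol_normalizer` module is intended to be consumed by fetchers and mergers (the import path assumes `/app/src` is on `PYTHONPATH`, as the Dockerfile configures; under a plain checkout it would be `src.utils.symbol_normalizer`):

```python
from utils.symbol_normalizer import get_normalizer, normalize_symbol, normalize_symbol_list

normalize_symbol("XRP")                       # -> "ripple" (canonical Santiment slug)
normalize_symbol("DOGE")                      # unknown symbol: warning logged, returns "doge"
normalize_symbol_list(["BTC", "btc", "ETH"])  # -> ["bitcoin", "ethereum"] (deduplicated)

# Reverse lookup and runtime extension of the mapping table
get_normalizer().get_all_variants("bitcoin")  # -> ["bitcoin", "btc", "Bitcoin", "BTC"]
get_normalizer().add_mapping("doge", "dogecoin")
```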