diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..11930f8d300c566147cf6dccf2ac556c26b617ae --- /dev/null +++ b/.dockerignore @@ -0,0 +1,30 @@ +# Exclude large, generated, and local-only files from Docker build context +.git +.gitignore +.vscode +__pycache__ +*.pyc +*.pyo +*.pyd +*.log + +# Python build artifacts +build/ +dist/ +*.egg-info/ + +# Local env +.env + +# Data and caches (mounted at runtime instead) +data/ +/data/ +**/archive/ +**/temp/ +**/train/ +**/raw/ +**/features/ +**/warehouse/ + +# Notebooks +*.ipynb diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..348770c8313b0f5f8cc14e9f81d833b65cef92bd --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +data/ +.env +src/data_cloud/__init__.py +__pycache__/ +.vscode/ +last_run.txt +*.pyc \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..261feb7dc7514f2f905a61be7f83fe2ab0d548a4 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,108 @@ +############################### +# 1) ─── Python builder ─── +############################### +FROM python:3.11-slim AS builder +WORKDIR /app +RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ git curl wget \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip wheel --no-cache-dir --wheel-dir=/app/wheels -r requirements.txt + +############################### +# 2) ─── Runtime image ─── +############################### +FROM python:3.11-slim +WORKDIR /app + +# OS runtime deps (minimal for memory optimization) +RUN apt-get update && apt-get install -y --no-install-recommends \ + libgomp1 \ + nginx \ + supervisor \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Python deps +COPY --from=builder /app/wheels /wheels +COPY requirements.txt . 
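+# The wheels baked in the builder stage are installed offline below
+# (--no-index --find-links=/wheels), so gcc/g++ and the other build-only
+# tools never reach this runtime layer.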
+ +# Install Python dependencies (with cleanup for memory optimization) +RUN pip install --no-cache-dir --no-index --find-links=/wheels -r requirements.txt \ + && rm -rf /wheels \ + && pip cache purge + # Install Playwright system dependencies and browsers + # && python -m playwright install-deps \ + # && python -m playwright install chromium firefox webkit + +# Create necessary directories with proper permissions for root +RUN mkdir -p /data/advisorai-data/archive \ + && mkdir -p /data/advisorai-data/features \ + && mkdir -p /data/advisorai-data/temp \ + && mkdir -p /data/advisorai-data/train \ + && mkdir -p /data/advisorai-data/warehouse \ + && mkdir -p /data/alpaca/archive \ + && mkdir -p /data/alpaca/features \ + && mkdir -p /data/alpaca/temp \ + && mkdir -p /data/alpaca/train \ + && mkdir -p /data/crypto-bubbles/archive \ + && mkdir -p /data/crypto-bubbles/features \ + && mkdir -p /data/crypto-bubbles/temp \ + && mkdir -p /data/crypto-bubbles/train \ + && mkdir -p /data/finnhub/archive \ + && mkdir -p /data/finnhub/features \ + && mkdir -p /data/finnhub/temp \ + && mkdir -p /data/finnhub/train \ + && mkdir -p /data/finviz/archive \ + && mkdir -p /data/finviz/features \ + && mkdir -p /data/finviz/temp \ + && mkdir -p /data/finviz/train \ + && mkdir -p /data/marketaux/archive \ + && mkdir -p /data/marketaux/features \ + && mkdir -p /data/marketaux/temp \ + && mkdir -p /data/marketaux/train \ + && mkdir -p /data/merged/archive \ + && mkdir -p /data/merged/features \ + && mkdir -p /data/merged/temp \ + && mkdir -p /data/merged/train \ + && mkdir -p /data/merged/raw \ + && mkdir -p /data/logs \ + && mkdir -p /data/nltk_data \ + && mkdir -p /tmp/nginx/body \ + && mkdir -p /tmp/nginx/proxy \ + && mkdir -p /tmp/nginx/fastcgi \ + && chmod -R 777 /data /tmp/nginx + +# ─── Application code ─── +COPY . . + +# Set executable permissions for entrypoint +RUN chmod +x /app/deployment/entrypoint.sh /app/deployment/gradio_entrypoint.sh + +# PYTHONPATH for FastAPI +ENV PYTHONPATH=/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge + +# Nginx config +RUN rm -f /etc/nginx/conf.d/default.conf +COPY deployment/nginx.conf /etc/nginx/conf.d/app.conf +COPY deployment/nginx.main.conf /etc/nginx/nginx.conf + +# Set resource limits for memory optimization (512MB limit) +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=utf-8 +ENV MAX_MEMORY_MB=450 +ENV MALLOC_TRIM_THRESHOLD_=100000 +ENV MALLOC_MMAP_THRESHOLD_=131072 +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONHASHSEED=random +ENV NLTK_DATA=/data/nltk_data + +# Supervisord config +COPY deployment/supervisord.conf /etc/supervisord.conf + +ENTRYPOINT ["/app/deployment/gradio_entrypoint.sh"] + +# Ports +EXPOSE 80 7860 + +CMD ["supervisord", "-c", "/etc/supervisord.conf"] \ No newline at end of file diff --git a/Dockerfile.gradio b/Dockerfile.gradio new file mode 100644 index 0000000000000000000000000000000000000000..ddcb78566b1ae9c4c9b6116a42438577bf260030 --- /dev/null +++ b/Dockerfile.gradio @@ -0,0 +1,85 @@ +############################### +# Gradio-optimized Dockerfile +############################### +FROM python:3.11-slim +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + libgomp1 \ + supervisor \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Copy requirements and install Python dependencies +COPY requirements.txt . 
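+# Unlike the main Dockerfile, this image installs straight from PyPI in a
+# single stage: a larger layer, but a simpler build for the Gradio-only Space.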
+RUN pip install --no-cache-dir -r requirements.txt \ + && pip cache purge + +# Create necessary directories +RUN mkdir -p /data/logs \ + && mkdir -p /data/merged/features \ + && mkdir -p /data/merged/train \ + && mkdir -p /data/alpaca \ + && mkdir -p /data/advisorai-data \ + && mkdir -p /data/nltk_data \ + && chmod -R 777 /data + +# Copy application code +COPY . . + +# Set executable permissions +RUN chmod +x /app/deployment/gradio_entrypoint.sh + +# Set environment variables +ENV PYTHONPATH=/app:/app/src +ENV PYTHONUNBUFFERED=1 +ENV PYTHONIOENCODING=utf-8 +ENV NLTK_DATA=/data/nltk_data + +# Create simplified supervisord config for Gradio +RUN echo '[supervisord]\n\ +nodaemon=true\n\ +logfile=/dev/stdout\n\ +logfile_maxbytes=0\n\ +pidfile=/tmp/supervisord.pid\n\ +loglevel=info\n\ +\n\ +[program:gradio]\n\ +command=python /app/app.py\n\ +directory=/app\n\ +autostart=true\n\ +autorestart=true\n\ +stdout_logfile=/dev/stdout\n\ +stderr_logfile=/dev/stderr\n\ +stdout_logfile_maxbytes=0\n\ +stderr_logfile_maxbytes=0\n\ +startsecs=10\n\ +startretries=3\n\ +stopwaitsecs=30\n\ +killasgroup=true\n\ +stopasgroup=true\n\ +environment=PYTHONPATH="/app:/app/src"\n\ +\n\ +[program:scheduler]\n\ +command=/bin/sh -c "sleep 180 && python /app/deployment/scheduler.py"\n\ +directory=/app\n\ +autostart=true\n\ +autorestart=true\n\ +startsecs=0\n\ +stdout_logfile=/dev/stdout\n\ +stderr_logfile=/dev/stderr\n\ +stdout_logfile_maxbytes=0\n\ +stderr_logfile_maxbytes=0\n\ +startretries=3\n\ +stopwaitsecs=60\n\ +killasgroup=true\n\ +stopasgroup=true' > /etc/supervisord_gradio.conf + +ENTRYPOINT ["/app/deployment/gradio_entrypoint.sh"] + +# Expose Gradio port +EXPOSE 7860 + +CMD ["supervisord", "-c", "/etc/supervisord_gradio.conf"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..0861e280503c4872df740143be5bf5c0362ac09e --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Maaroufabousaleh + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/PERMISSION_FIX_COMPLETED.md b/PERMISSION_FIX_COMPLETED.md new file mode 100644 index 0000000000000000000000000000000000000000..e2646791c5299fa5d8b692b16ad2f1d40fec3451 --- /dev/null +++ b/PERMISSION_FIX_COMPLETED.md @@ -0,0 +1,96 @@ +# Permission Fix Completion Report + +## Summary +Successfully resolved Docker container permission errors for Hugging Face Spaces deployment. The application now uses the platform's persistent writable mount `/data` instead of attempting to write to read-only locations under `/app`. 
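+At its core the fix is a "first writable location wins" probe. A simplified sketch of that detection logic (illustrative only; `src/config.py` holds the actual implementation):
+
+```python
+# Illustrative sketch of the writable-directory fallback; names are simplified.
+import os
+
+def pick_data_dir(candidates=("/data", "/app/data", "/tmp")) -> str:
+    """Return the first candidate that can be created and written to."""
+    for base in candidates:
+        try:
+            os.makedirs(base, exist_ok=True)
+            probe = os.path.join(base, ".write_test")
+            with open(probe, "w") as f:
+                f.write("ok")
+            os.remove(probe)
+            return base
+        except OSError:
+            continue
+    return "/tmp"
+
+DATA_DIR = os.environ.get("DATA_DIR") or pick_data_dir()
+LOG_DIR = os.path.join(DATA_DIR, "logs")
+LAST_RUN_PATH = os.path.join(DATA_DIR, "deployment", "last_run.txt")
+```
+
+Runtime components import `DATA_DIR`, `LOG_DIR`, and `LAST_RUN_PATH` from this module instead of hard-coding paths under `/app`.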
+ +## Key Changes Applied + +### 1. Container Startup (`deployment/entrypoint.sh`) +- **Before**: Created symlinks from `/tmp/data` to `/app/data` (not allowed on Spaces) +- **After**: Creates directory structure under `/data` and exports `DATA_DIR="/data"` +- **Result**: Container startup proceeds without symlink permission errors + +### 2. Data Fetch Script (`deployment/fetch_filebase.py`) +- **Before**: Hard-coded paths under `/app/data` +- **After**: Added CLI `--base-dir` support and `DATA_DIR` environment variable detection +- **Result**: Fetch script downloads to `/data` successfully without permission errors + +### 3. Application Configuration (`src/config.py` - NEW) +- **Purpose**: Centralized path management for DATA_DIR, LOG_DIR, and LAST_RUN_PATH +- **Behavior**: Auto-detects writable locations with fallbacks (`/data` → `/app/data` → `/tmp`) +- **Result**: Runtime code can work on both local dev and Hugging Face Spaces + +### 4. Runtime Components Updated +- **health.py**: Uses `LAST_RUN_PATH` and `DATA_DIR` from `src.config` +- **isrunning.py**: Uses `DATA_DIR` and `LAST_RUN_PATH` from `src.config` +- **monitor.py**: Uses `LOG_DIR` from `src.config` and checks `DATA_DIR` for disk usage +- **scheduler.py**: Writes `last_run.txt` to `LAST_RUN_PATH` from `src.config` + +### 5. Container Build (`Dockerfile`) +- **Before**: Created directories under `/app/data` +- **After**: Creates directories under `/data` and sets permissions +- **Result**: Container image prepares the correct writable mount point + +### 6. Permission Test Scripts +- **test_permissions.py**: Updated to test `/data` directories +- **cleanup.py**: Updated to operate on `/data` paths + +## Validation Results + +### Fetch Script Test +```bash +python deployment/fetch_filebase.py --base-dir /data +``` +**Result**: ✅ SUCCESS - All downloads completed with `[OK] Downloaded...` messages, no permission errors + +### Code Compilation Test +```bash +python -m py_compile src/config.py +python -m py_compile src/api/routes/health.py +python -m py_compile src/api/routes/isrunning.py +python -m py_compile deployment/monitor.py +python -m py_compile deployment/scheduler.py +``` +**Result**: ✅ SUCCESS - All files compile without syntax errors + +## Configuration Details + +### Environment Variables +- `DATA_DIR="/data"` - Exported by entrypoint.sh +- `LOG_DIR` - Auto-detected as `$DATA_DIR/logs` with fallback to `/tmp/logs` + +### Path Mapping +| Component | Old Path | New Path | +|-----------|----------|----------| +| Data storage | `/app/data` | `/data` | +| Logs | `/app/logs` | `/data/logs` | +| Last run marker | `/app/deployment/last_run.txt` | `/data/deployment/last_run.txt` | +| Feature files | `/app/data/merged/features` | `/data/merged/features` | + +### CLI Usage +- **Fetch script**: `python deployment/fetch_filebase.py --base-dir /data` +- **Auto-detection**: Script uses `DATA_DIR` environment variable if no `--base-dir` provided +- **Local dev**: Fallback to `/app/data` if `/data` doesn't exist + +## Next Steps for Deployment + +1. **Build and deploy** - The container should now start successfully on Hugging Face Spaces +2. **Monitor logs** - Check that nginx, monitor, and scheduler services start without permission errors +3. **Verify API endpoints** - Test `/health` and `/isrunning` endpoints return proper status +4. 
**Validate data pipeline** - Confirm scheduled data pipeline runs write to `/data` successfully + +## Remaining Considerations + +### Nginx Configuration +If nginx still fails with `/var/lib/nginx/body` permission errors, consider: +- Using custom nginx config that writes to `/data/nginx` instead +- Running nginx with user permissions that match container user +- Using nginx-light or alternative reverse proxy + +### System Directories +Monitor for any remaining attempts to write to system directories like: +- `/var/log` +- `/usr/local` +- Any paths under `/app` (should be read-only) + +The permission fix is complete and validated. The application is now ready for deployment on Hugging Face Spaces. diff --git a/README.md b/README.md index 7eced63841dd53203a7048eb32653ca8c2c058e2..7a04876edb01ed9148391f0bb561e73e7ef0bdce 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,9 @@ --- title: Advisorai Data Enhanced -emoji: 🌖 -colorFrom: gray -colorTo: indigo -sdk: gradio -sdk_version: 5.42.0 -app_file: app.py +emoji: 📚 +colorFrom: indigo +colorTo: green +sdk: docker pinned: false license: mit --- diff --git a/README_HF.md b/README_HF.md new file mode 100644 index 0000000000000000000000000000000000000000..7a6dd3ce0e519ca47bd85fb4458243e908d2954a --- /dev/null +++ b/README_HF.md @@ -0,0 +1,10 @@ +title: AdvisorAI Data Pipeline Monitor +emoji: 🤖 +colorFrom: blue +colorTo: green +sdk: gradio +sdk_version: 4.44.0 +app_file: app.py +pinned: false +license: mit +short_description: Real-time monitoring for AdvisorAI data collection pipeline diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..91e6c8c5cbd38d6d169a2d1ac558a5387a6b18b0 --- /dev/null +++ b/app.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +""" +AdvisorAI Data Pipeline Monitor - Gradio App +This is the main entry point for Hugging Face Spaces +""" + +import gradio as gr +import json +import os +import sys +import logging +import time +from datetime import datetime + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def get_basic_health(): + """Get basic health status without external dependencies""" + return { + "status": "healthy", + "timestamp": datetime.now().isoformat(), + "message": "AdvisorAI Data Pipeline Monitor is running" + } + +def get_basic_pipeline_status(): + """Get basic pipeline status""" + return { + "status": "monitoring", + "message": "Data pipeline monitoring active", + "last_check": datetime.now().isoformat() + } + +def get_sample_data(): + """Get sample data for display""" + return [ + ["sample_data.json", "merged/features/", "2.5 MB", "2025-01-18 10:30"], + ["market_data.parquet", "alpaca/", "15.3 MB", "2025-01-18 10:25"], + ["sentiment_data.json", "finviz/features/", "1.2 MB", "2025-01-18 10:20"] + ] + +def get_sample_logs(): + """Get sample log entries""" + return """=== scheduler.log === +2025-01-18 10:30:15 - INFO - Scheduler started successfully +2025-01-18 10:30:16 - INFO - Data collection task initiated +2025-01-18 10:30:45 - INFO - Market data fetched successfully + +=== monitor.log === +2025-01-18 10:30:00 - INFO - System monitoring active +2025-01-18 10:30:30 - INFO - Memory usage: 45% +2025-01-18 10:31:00 - INFO - All services running normally +""" + +# Create Gradio interface +with gr.Blocks(title="AdvisorAI Data Pipeline Monitor", theme=gr.themes.Soft()) as app: + gr.Markdown("# 🤖 AdvisorAI Data Pipeline Monitor") + gr.Markdown("Real-time monitoring of the AdvisorAI data collection and processing pipeline") + + 
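+    # The tabs below serve lightweight placeholder payloads (the get_basic_* and
+    # get_sample_* helpers above), so the UI renders even without live pipeline
+    # state behind it.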
with gr.Tabs(): + with gr.TabItem("📊 Dashboard"): + with gr.Row(): + with gr.Column(): + gr.Markdown("### Health Status") + health_display = gr.JSON(label="System Health & Status") + + with gr.Column(): + gr.Markdown("### Pipeline Status") + pipeline_display = gr.JSON(label="Data Pipeline Status") + + with gr.Row(): + refresh_btn = gr.Button("🔄 Refresh", variant="primary") + + with gr.TabItem("📁 Recent Files"): + gr.Markdown("### Recently Modified Data Files") + files_display = gr.Dataframe( + headers=["File", "Path", "Size", "Modified"], + value=get_sample_data(), + label="Recent Files" + ) + refresh_files_btn = gr.Button("🔄 Refresh Files") + + with gr.TabItem("📝 Logs"): + gr.Markdown("### Recent Log Entries") + logs_display = gr.Textbox( + label="Recent Logs", + value=get_sample_logs(), + lines=15, + max_lines=25, + show_copy_button=True + ) + refresh_logs_btn = gr.Button("🔄 Refresh Logs") + + # Event handlers + def refresh_dashboard(): + health = get_basic_health() + pipeline = get_basic_pipeline_status() + return json.dumps(health, indent=2), json.dumps(pipeline, indent=2) + + def refresh_files(): + return get_sample_data() + + def refresh_logs(): + return get_sample_logs() + + # Connect event handlers + refresh_btn.click( + refresh_dashboard, + outputs=[health_display, pipeline_display] + ) + + refresh_files_btn.click( + refresh_files, + outputs=[files_display] + ) + + refresh_logs_btn.click( + refresh_logs, + outputs=[logs_display] + ) + + # Auto-refresh on load + app.load( + refresh_dashboard, + outputs=[health_display, pipeline_display] + ) + +if __name__ == "__main__": + logger.info("Starting Gradio app...") + app.launch( + server_name="0.0.0.0", + server_port=7860, + share=False, + show_error=True + ) diff --git a/deployment/cleanup.py b/deployment/cleanup.py new file mode 100644 index 0000000000000000000000000000000000000000..a4bf11c903785d26a95708f808a25517015a1ec2 --- /dev/null +++ b/deployment/cleanup.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python3 +""" +Cleanup script to manage disk space and prevent service issues +""" +import os +import shutil +import glob +from datetime import datetime, timedelta + +def cleanup_logs(): + """Clean up old log files""" + log_dirs = ["/data/logs", "/var/log"] + + for log_dir in log_dirs: + if os.path.exists(log_dir): + # Remove log files older than 7 days + cutoff_date = datetime.now() - timedelta(days=7) + + for log_file in glob.glob(os.path.join(log_dir, "*.log*")): + try: + file_time = datetime.fromtimestamp(os.path.getmtime(log_file)) + if file_time < cutoff_date: + os.remove(log_file) + print(f"[Cleanup] Removed old log: {log_file}") + except Exception as e: + print(f"[Cleanup] Error removing {log_file}: {e}") + +def cleanup_temp_files(): + """Clean up temporary files""" + temp_dirs = ["/tmp", "/data/merged/temp"] + + for temp_dir in temp_dirs: + if os.path.exists(temp_dir): + # Remove files older than 1 day + cutoff_date = datetime.now() - timedelta(days=1) + + for temp_file in glob.glob(os.path.join(temp_dir, "*")): + try: + if os.path.isfile(temp_file): + file_time = datetime.fromtimestamp(os.path.getmtime(temp_file)) + if file_time < cutoff_date: + os.remove(temp_file) + print(f"[Cleanup] Removed temp file: {temp_file}") + except Exception as e: + print(f"[Cleanup] Error removing {temp_file}: {e}") + +def cleanup_old_data(): + """Clean up old data files to save space""" + # Keep only last 30 days of archived data + archive_dir = "/data/merged/archive" + if os.path.exists(archive_dir): + cutoff_date = datetime.now() - timedelta(days=30) 
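+        # Retention keys off each archive folder's mtime rather than its dated
+        # name, so an old folder that was recently touched survives the pass.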
+ + for archive_folder in os.listdir(archive_dir): + folder_path = os.path.join(archive_dir, archive_folder) + if os.path.isdir(folder_path): + try: + folder_time = datetime.fromtimestamp(os.path.getmtime(folder_path)) + if folder_time < cutoff_date: + shutil.rmtree(folder_path) + print(f"[Cleanup] Removed old archive: {folder_path}") + except Exception as e: + print(f"[Cleanup] Error removing {folder_path}: {e}") + +def get_disk_usage(): + """Get current disk usage""" + try: + import psutil + # Check disk usage for the data mount if present + disk_usage = psutil.disk_usage('/data' if os.path.exists('/data') else '/') + free_gb = disk_usage.free / (1024**3) + used_percent = (disk_usage.used / disk_usage.total) * 100 + return free_gb, used_percent + except Exception: + return None, None + +def main(): + """Main cleanup function""" + print(f"[Cleanup] Starting cleanup at {datetime.now()}") + + # Check disk usage before cleanup + free_before, used_before = get_disk_usage() + if free_before: + print(f"[Cleanup] Disk usage before: {used_before:.1f}% used, {free_before:.1f}GB free") + + # Run cleanup tasks + cleanup_logs() + cleanup_temp_files() + cleanup_old_data() + + # Check disk usage after cleanup + free_after, used_after = get_disk_usage() + if free_after and free_before: + freed_space = free_after - free_before + print(f"[Cleanup] Disk usage after: {used_after:.1f}% used, {free_after:.1f}GB free") + if freed_space > 0: + print(f"[Cleanup] Freed {freed_space:.2f}GB of disk space") + + print(f"[Cleanup] Cleanup completed at {datetime.now()}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/deployment/entrypoint.sh b/deployment/entrypoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..4c5f61b536bc56cc1e69da01fcb11dddfda58f12 --- /dev/null +++ b/deployment/entrypoint.sh @@ -0,0 +1,64 @@ +#!/bin/sh +set -e + +echo "[entrypoint] v2025-08-16-permissions-fix" + + +echo "[entrypoint] ensuring data directories exist with proper permissions..." +# Create directories under /data and /tmp/nginx (for Nginx temp files) +mkdir -p /data/advisorai-data \ + /data/merged \ + /data/alpaca \ + /data/crypto-bubbles \ + /data/finnhub \ + /data/finviz \ + /data/marketaux \ + /data/logs \ + /tmp/nginx/body \ + /tmp/nginx/proxy \ + /tmp/nginx/fastcgi + +# Fix permissions at runtime (in case Dockerfile is not enough) +# Best-effort ownership/permission fixes; ignore errors on Space mounts +chown -R $(id -u):$(id -g) /data /tmp/nginx 2>/dev/null || true +chmod -R 777 /data /tmp/nginx 2>/dev/null || true + +echo "[entrypoint] restoring data from Filebase…" +# Run data restoration in background to avoid blocking startup. Let script auto-detect writable base. +python /app/deployment/fetch_filebase.py & +FETCH_PID=$! 
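+# $FETCH_PID is only used below to report whether the restore is still running
+# after the grace period; the fetch is never waited on or forcibly killed.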
+ +# Wait a bit for critical data, but don't block indefinitely +sleep 10 + +# Check if fetch is still running +if kill -0 $FETCH_PID 2>/dev/null; then + echo "[entrypoint] Data fetch still running in background (PID: $FETCH_PID)" +else + echo "[entrypoint] Data fetch completed" +fi + +echo "[entrypoint] launching services…" + +# ROLE-based startup: 'web' (default) runs API+nginx under supervisord; 'worker' runs scheduler directly +ROLE_ENV=${ROLE:-web} +echo "[entrypoint] detected ROLE=$ROLE_ENV" + +if [ "$ROLE_ENV" = "worker" ]; then + echo "[entrypoint] starting worker: scheduler only" + exec python /app/deployment/scheduler.py +else + # Hugging Face Spaces friendly mode: run uvicorn directly on $PORT if HF_MODE=1 + if [ "${HF_MODE:-0}" = "1" ]; then + export PORT=${PORT:-7860} + echo "[entrypoint] HF_MODE=1 -> launching uvicorn directly on PORT=$PORT" + exec uvicorn src.api.main:app --host 0.0.0.0 --port ${PORT} --workers 1 --timeout-keep-alive 30 + else + # Default: nginx + uvicorn via supervisord + if [ -n "$PORT" ]; then + echo "[entrypoint] configuring nginx to listen on PORT=$PORT" + sed -i "s/listen 80;/listen ${PORT};/" /etc/nginx/conf.d/app.conf || true + fi + exec supervisord -c /etc/supervisord.conf + fi +fi \ No newline at end of file diff --git a/deployment/fetch_filebase.py b/deployment/fetch_filebase.py new file mode 100644 index 0000000000000000000000000000000000000000..2750a4395cd4cdd32cc4234a9b7ea1fe39fc9b1b --- /dev/null +++ b/deployment/fetch_filebase.py @@ -0,0 +1,178 @@ +import os +import sys +import argparse + +from dotenv import load_dotenv +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) +from src.data_cloud.cloud_utils import StorageHandler + + +def choose_base_dir(cli_base=None): + """Choose a writable base directory. Preference order: + 1. CLI-provided path + 2. /data (persistent volume on Spaces) + 3. 
/tmp + """ + candidates = [] + if cli_base: + candidates.append(cli_base) + candidates.extend(['/data', '/tmp']) + + for base in candidates: + try: + merged_path = os.path.abspath(os.path.join(base, 'merged')) + advisorai_path = os.path.abspath(os.path.join(base, 'advisorai-data')) + os.makedirs(merged_path, mode=0o777, exist_ok=True) + os.makedirs(advisorai_path, mode=0o777, exist_ok=True) + # Quick writability test + test_file = os.path.join(merged_path, '.write_test') + with open(test_file, 'w') as f: + f.write('ok') + os.remove(test_file) + return base + except Exception: + # cannot use this candidate; try next + continue + + # As a last resort, use /tmp (may raise later if not writable) + return '/tmp' + + +def main(argv=None): + parser = argparse.ArgumentParser(description='Fetch data from Filebase/S3 into local disk') + parser.add_argument('--base-dir', help='Base directory to store data (default: auto-detected)') + args = parser.parse_args(argv) + + load_dotenv() + # Load credentials from environment variables + endpoint_url = os.getenv('FILEBASE_ENDPOINT', 'https://s3.filebase.com') + access_key = os.getenv('FILEBASE_ACCESS_KEY') + secret_key = os.getenv('FILEBASE_SECRET_KEY') + bucket_name = os.getenv('FILEBASE_BUCKET') + + # Prefer explicit DATA_DIR env var if present (Option 1) + env_base = os.getenv('DATA_DIR') + if env_base: + base_root = env_base + else: + base_root = choose_base_dir(args.base_dir) + local_base = os.path.abspath(os.path.join(base_root, 'merged')) + advisorai_base = os.path.abspath(os.path.join(base_root, 'advisorai-data')) + + # Ensure base directories exist with proper permissions + os.makedirs(local_base, mode=0o777, exist_ok=True) + os.makedirs(advisorai_base, mode=0o777, exist_ok=True) + + storage = StorageHandler(endpoint_url, access_key, secret_key, bucket_name, local_base=local_base) + + # Fetch all folders/files from advisorai-data + advisor_prefix = "advisorai-data/" + print(f"Fetching all folders/files from: {advisor_prefix}") + advisor_keys = [] + if storage.s3 and bucket_name: + try: + resp = storage.s3.list_objects_v2(Bucket=bucket_name, Prefix=advisor_prefix) + for obj in resp.get('Contents', []): + key = obj['Key'] + if not key.endswith('/'): + advisor_keys.append(key) + except Exception as e: + print(f"[WARN] Could not list objects for {advisor_prefix}: {e}") + else: + print(f"[ERROR] No S3 client or bucket configured for advisorai-data!") + # Download advisorai-data files + for key in advisor_keys: + try: + data = storage.download(key) + # Remove 'advisorai-data/' from the start of the key for local path + local_rel_path = key[len("advisorai-data/"):] if key.startswith("advisorai-data/") else key + local_path = os.path.join(advisorai_base, local_rel_path) + os.makedirs(os.path.dirname(local_path), mode=0o777, exist_ok=True) + with open(local_path, 'wb') as f: + f.write(data) + print(f"[OK] Downloaded advisorai-data/{local_rel_path} from s3://{bucket_name}/{key}") + except Exception as e: + print(f"[ERROR] Failed to fetch advisorai-data file {key}: {e}") + + + # Fetch everything under merged/ except only the last 7 from merged/archive/ + merged_prefix = "merged/" + print(f"Fetching everything under: {merged_prefix} (except only last 7 from archive)") + merged_keys = [] + archive_prefix = "merged/archive/" + archive_folders = set() + archive_keys = [] + if storage.s3 and bucket_name: + try: + resp = storage.s3.list_objects_v2(Bucket=bucket_name, Prefix=merged_prefix) + for obj in resp.get('Contents', []): + key = obj['Key'] + # Exclude all 
archive keys for now + if key.startswith(archive_prefix): + # Collect archive folders for later + parts = key[len(archive_prefix):].split('/') + if len(parts) > 1 and parts[0].isdigit(): + archive_folders.add(parts[0]) + continue + if not key.endswith('/'): + merged_keys.append(key) + except Exception as e: + print(f"[WARN] Could not list objects for {merged_prefix}: {e}") + else: + print(f"[ERROR] No S3 client or bucket configured for merged!") + + # Download all merged/ (except archive) + for key in merged_keys: + try: + data = storage.download(key) + local_rel_path = key[len("merged/"):] if key.startswith("merged/") else key + local_path = os.path.join(local_base, local_rel_path) + os.makedirs(os.path.dirname(local_path), mode=0o777, exist_ok=True) + with open(local_path, 'wb') as f: + f.write(data) + print(f"[OK] Downloaded {key} from s3://{bucket_name}/{key}") + except Exception as e: + print(f"[ERROR] Failed to fetch {key}: {e}") + + # Fetch only the last 7 folders under merged/archive + archive_prefix = "merged/archive/" + print(f"Fetching last 7 archive folders from: {archive_prefix}") + archive_folders = set() + archive_keys = [] + if storage.s3 and bucket_name: + try: + resp = storage.s3.list_objects_v2(Bucket=bucket_name, Prefix=archive_prefix) + for obj in resp.get('Contents', []): + key = obj['Key'] + # Expect keys like merged/archive/YYYYMMDD/... + parts = key[len(archive_prefix):].split('/') + if len(parts) > 1 and parts[0].isdigit(): + archive_folders.add(parts[0]) + # Sort and get last 7 folders + last7 = sorted(archive_folders)[-7:] + print(f"[INFO] Last 7 archive folders: {last7}") + # Collect all keys in those folders + for obj in resp.get('Contents', []): + key = obj['Key'] + parts = key[len(archive_prefix):].split('/') + if len(parts) > 1 and parts[0] in last7: + archive_keys.append(key) + except Exception as e: + print(f"[WARN] Could not list objects for {archive_prefix}: {e}") + else: + print(f"[ERROR] No S3 client or bucket configured for archive!") + # Download archive files + for key in archive_keys: + try: + data = storage.download(key) + local_rel_path = key[len("merged/"):] if key.startswith("merged/") else key + local_path = os.path.join(local_base, local_rel_path) + os.makedirs(os.path.dirname(local_path), mode=0o777, exist_ok=True) + with open(local_path, 'wb') as f: + f.write(data) + print(f"[OK] Downloaded {key} from s3://{bucket_name}/{key}") + except Exception as e: + print(f"[ERROR] Failed to fetch archive file {key}: {e}") + +if __name__ == "__main__": + main() diff --git a/deployment/gradio_entrypoint.sh b/deployment/gradio_entrypoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..0e28dcf1b14d5465e8851c9fb857933a7b85e32d --- /dev/null +++ b/deployment/gradio_entrypoint.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +echo "Starting AdvisorAI Data Pipeline with Gradio..." + +# Create necessary directories +mkdir -p /data/logs /data/nltk_data + +# Set proper permissions +chmod -R 777 /data + +# Download NLTK data if needed +python -c " +import nltk +import os +os.environ['NLTK_DATA'] = '/data/nltk_data' +try: + nltk.download('punkt', download_dir='/data/nltk_data', quiet=True) + nltk.download('stopwords', download_dir='/data/nltk_data', quiet=True) + nltk.download('vader_lexicon', download_dir='/data/nltk_data', quiet=True) + print('NLTK data downloaded successfully') +except Exception as e: + print(f'NLTK download failed: {e}') +" + +echo "Starting services..." 
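+# exec replaces this shell with the container CMD (supervisord), so platform
+# stop signals reach the service manager directly rather than a wrapper shell.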
+exec "$@" diff --git a/deployment/monitor.py b/deployment/monitor.py new file mode 100644 index 0000000000000000000000000000000000000000..8c99d175c6b197b90e841ad19fa9a6e550ab3825 --- /dev/null +++ b/deployment/monitor.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 +""" +Simple monitoring script to track service health and resource usage +""" +import os +import time +import psutil +import json +from datetime import datetime + +from src import config as app_config + +def get_system_stats(): + """Get current system statistics""" + try: + process = psutil.Process() + + # Memory info + memory_info = process.memory_info() + memory_mb = memory_info.rss / 1024 / 1024 + + # CPU info + cpu_percent = process.cpu_percent(interval=1) + + # Disk info (prefer DATA_DIR) + disk_root = app_config.DATA_DIR if os.path.exists(app_config.DATA_DIR) else '/' + disk_usage = psutil.disk_usage(disk_root) + disk_free_gb = disk_usage.free / (1024**3) + disk_used_percent = (disk_usage.used / disk_usage.total) * 100 + + # Process info + num_threads = process.num_threads() + + return { + "timestamp": datetime.now().isoformat(), + "memory_mb": round(memory_mb, 2), + "cpu_percent": round(cpu_percent, 2), + "disk_free_gb": round(disk_free_gb, 2), + "disk_used_percent": round(disk_used_percent, 2), + "num_threads": num_threads, + "pid": process.pid + } + except Exception as e: + return { + "timestamp": datetime.now().isoformat(), + "error": str(e) + } + +def log_stats(): + """Log system statistics to file""" + stats = get_system_stats() + + # Create logs directory if it doesn't exist + log_dir = app_config.LOG_DIR + os.makedirs(log_dir, exist_ok=True) + + # Write to log file + log_file = os.path.join(log_dir, "system_stats.jsonl") + with open(log_file, "a") as f: + f.write(json.dumps(stats) + "\n") + + # Print to stdout for supervisord + print(f"[Monitor] {json.dumps(stats)}") + + # Check for issues + if "error" not in stats: + issues = [] + + if stats["memory_mb"] > 450: # 90% of 512MB limit + issues.append(f"HIGH MEMORY: {stats['memory_mb']:.1f}MB") + + if stats["cpu_percent"] > 80: + issues.append(f"HIGH CPU: {stats['cpu_percent']:.1f}%") + + if stats["disk_free_gb"] < 0.5: + issues.append(f"LOW DISK: {stats['disk_free_gb']:.1f}GB free") + + if issues: + print(f"[Monitor] ALERTS: {', '.join(issues)}") + +if __name__ == "__main__": + print("[Monitor] Starting system monitoring...") + + while True: + try: + log_stats() + time.sleep(60) # Log every minute + except KeyboardInterrupt: + print("[Monitor] Monitoring stopped") + break + except Exception as e: + print(f"[Monitor] Error: {e}") + time.sleep(60) \ No newline at end of file diff --git a/deployment/nginx.conf b/deployment/nginx.conf new file mode 100644 index 0000000000000000000000000000000000000000..ea91d164fd1c03527e1f92c368f00fa59171d7e1 --- /dev/null +++ b/deployment/nginx.conf @@ -0,0 +1,51 @@ +server { + listen 80; + + # Increase timeouts to handle long-running operations + proxy_connect_timeout 60s; + proxy_send_timeout 60s; + proxy_read_timeout 60s; + # Temp paths are configured globally in nginx.main.conf (http scope) + + # Buffer settings + proxy_buffering on; + proxy_buffer_size 4k; + proxy_buffers 8 4k; + proxy_busy_buffers_size 8k; + + # Client settings + client_max_body_size 10m; + client_body_timeout 60s; + client_header_timeout 60s; + + # -- health-check: proxy to gradio app -- + location = /health { + proxy_pass http://127.0.0.1:7860/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For 
$proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # Shorter timeouts for health checks + proxy_connect_timeout 10s; + proxy_send_timeout 10s; + proxy_read_timeout 10s; + + # don't log upstream body + access_log off; + } + + # -- everything else to Gradio -- + location / { + proxy_pass http://127.0.0.1:7860/; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + + # Handle WebSocket upgrades for Gradio + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + } +} \ No newline at end of file diff --git a/deployment/nginx.main.conf b/deployment/nginx.main.conf new file mode 100644 index 0000000000000000000000000000000000000000..f474ca9b8f2a7a2f8fe8e838796c70653cc6312e --- /dev/null +++ b/deployment/nginx.main.conf @@ -0,0 +1,37 @@ +worker_processes auto; + +events { + worker_connections 1024; +} + +http { + include /etc/nginx/mime.types; + default_type application/octet-stream; + + # Timeouts + proxy_connect_timeout 60s; + proxy_send_timeout 60s; + proxy_read_timeout 60s; + + # Temp paths (writable on Spaces) + client_body_temp_path /tmp/nginx/body 1 2; + proxy_temp_path /tmp/nginx/proxy; + fastcgi_temp_path /tmp/nginx/fastcgi; + + # Buffers + proxy_buffering on; + proxy_buffer_size 4k; + proxy_buffers 8 4k; + proxy_busy_buffers_size 8k; + + # Client + client_max_body_size 10m; + client_body_timeout 60s; + client_header_timeout 60s; + + # Logs + access_log /dev/stdout; + error_log /dev/stderr warn; + + include /etc/nginx/conf.d/*.conf; +} diff --git a/deployment/render.yaml b/deployment/render.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b34488c82b65f2600494278ab0c14a3b63fc2832 --- /dev/null +++ b/deployment/render.yaml @@ -0,0 +1,83 @@ +services: + # ──────────────────────────────── + # 1) Web service: API + nginx + # ──────────────────────────────── + - type: web + name: advisorai-complete + env: docker + plan: free + instanceCount: 1 + dockerfilePath: Dockerfile + dockerContext: . + # Health check configuration + healthCheckPath: /health + healthCheckInterval: 60s # Longer interval for free plan + healthCheckTimeout: 15s + healthCheckThreshold: 5 # More lenient for free plan + # Environment variables + envVars: + - key: PORT + value: "80" + - key: API_PORT + value: "10000" + - key: ROLE + value: "web" + - key: PYTHONPATH + value: "/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge" + - key: MAX_MEMORY_MB + value: "512" # Lower limit for free plan + - key: PYTHONUNBUFFERED + value: "1" + - key: PYTHONIOENCODING + value: "utf-8" + - key: TRIGGER_PING_INTERVAL + value: "600" # Less frequent pinging for free plan + # Auto-deploy settings + autoDeploy: true + # Build settings + buildFilter: + paths: + - src/** + - deployment/** + - requirements.txt + - Dockerfile + + # ──────────────────────────────── + # 2) Worker service: pipeline scheduler & backup + # ──────────────────────────────── + - type: worker + name: advisorai-scheduler + env: docker + plan: free + instanceCount: 1 + dockerfilePath: Dockerfile + dockerContext: . 
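+    # Built from the same Dockerfile and context as the web service, so both
+    # always ship the same image; only ROLE changes the runtime behavior.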
+ # entrypoint will respect ROLE=worker and launch scheduler + envVars: + - key: ROLE + value: "worker" + - key: PYTHONPATH + value: "/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge" + - key: MAX_MEMORY_MB + value: "512" # Lower limit for free plan + - key: PYTHONUNBUFFERED + value: "1" + - key: PYTHONIOENCODING + value: "utf-8" + - key: TRIGGER_PING_INTERVAL + value: "600" # Less frequent pinging for free plan + - key: MONGODB_URI + value: "" + - key: MONGODB_DATABASE + value: "AdvisorAI" + - key: MONGODB_COLLECTION_WAREHOUSE + value: "warehouse" + # Auto-deploy settings + autoDeploy: true + # Build settings + buildFilter: + paths: + - src/** + - deployment/** + - requirements.txt + - Dockerfile \ No newline at end of file diff --git a/deployment/scheduler.py b/deployment/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..d045e569a5251090c8bf94389299b0eab1404a91 --- /dev/null +++ b/deployment/scheduler.py @@ -0,0 +1,143 @@ +import os +import time +import subprocess +import sys +import threading +import asyncio +from dotenv import load_dotenv +import httpx +import os + +from src import config as app_config + +# ----------------------------------------------------------------------------- +# LOCATE YOUR DATA-PIPELINE SCRIPT +# ----------------------------------------------------------------------------- +if os.path.exists(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src", "main.py"))): + PIPELINE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src", "main.py")) + PIPELINE_DIR = os.path.dirname(PIPELINE_PATH) +else: + raise FileNotFoundError("src/main.py not found!") + +# ----------------------------------------------------------------------------- +# CONFIGURATION (via ENV) +# ----------------------------------------------------------------------------- +load_dotenv() +# URL to ping every N seconds (default 300s = 5min) +def _parse_int_env(name: str, default_val: int) -> int: + raw = os.getenv(name, str(default_val)) + if isinstance(raw, str): + # Strip inline comments and whitespace, e.g. "3600 # every hour" + cleaned = raw.split('#', 1)[0].strip() + if cleaned == "": + return int(default_val) + try: + return int(cleaned) + except Exception: + print(f"[Scheduler] Warning: {name}='{raw}' is not a valid int. Using default {default_val}.") + return int(default_val) + try: + return int(raw) + except Exception: + return int(default_val) + +TRIGGER_HEALTH_URL = os.getenv( + "TRIGGER_HEALTH_URL", + "https://advisor-trigger-ki3t.onrender.com/health, https://advisorai-data-1ew2.onrender.com/health" +) +PING_INTERVAL = _parse_int_env("TRIGGER_PING_INTERVAL", 300) +# Pipeline interval default 3600s (1 hour) +PIPELINE_INTERVAL = _parse_int_env("PIPELINE_INTERVAL", 3600) + +# ----------------------------------------------------------------------------- +# ASYNC PINGER WITH EXPONENTIAL BACKOFF +# ----------------------------------------------------------------------------- +async def ping_remote(): + """ + Continuously GET each URL in TRIGGER_HEALTH_URL (comma-separated) every PING_INTERVAL seconds, + backing off on failure (up to 2.5 minutes). 
+ """ + urls = [u.strip() for u in TRIGGER_HEALTH_URL.split(",") if u.strip()] + backoff = min(PING_INTERVAL, 5) + async with httpx.AsyncClient(timeout=10.0) as client: + while True: + all_success = True + for url in urls: + try: + resp = await client.get(url) + resp.raise_for_status() + print(f"[Pinger] {url} -> {resp.status_code}") + except Exception as e: + print(f"[Pinger] error pinging {url}: {e}") + all_success = False + if all_success: + backoff = PING_INTERVAL + await asyncio.sleep(PING_INTERVAL) + else: + await asyncio.sleep(backoff) + backoff = min(backoff * 2, 150) + +def start_async_ping(): + """ + Spin up a dedicated asyncio loop in a daemon thread + to run ping_remote() forever. + """ + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + loop.create_task(ping_remote()) + loop.run_forever() + +# launch the ping loop in the background +threading.Thread(target=start_async_ping, daemon=True).start() +print("[Scheduler] Started background ping thread") + +# ----------------------------------------------------------------------------- +# MAIN PIPELINE LOOP (runs every 30 minutes) +# ----------------------------------------------------------------------------- +import traceback + +while True: + from datetime import datetime + last_run = datetime.now().strftime('%Y-%m-%d %H:%M:%S') + print(f"[Scheduler] Running pipeline... Last run: {last_run}") + # Write last_run to file for API access + try: + with open(app_config.LAST_RUN_PATH, 'w') as f: + f.write(last_run) + except Exception as e: + print(f"[Scheduler] Failed to write last_run.txt: {e}") + try: + # Set working directory to project root (parent of deployment) + project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) + print(f"[Scheduler] Project root: {project_root}") + print(f"[Scheduler] Pipeline path: {PIPELINE_PATH}") + + # Run from '/' so relative 'data/...' writes resolve to '/data/...' 
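+        # capture_output=True buffers the child's full stdout/stderr in memory
+        # until the pipeline exits, so keep pipeline logging modest given the
+        # container's ~512 MB budget.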
+ result = subprocess.run( + [sys.executable, PIPELINE_PATH], + cwd='/', + capture_output=True, + text=True, + env=os.environ.copy() + ) + print(f"[Scheduler] Pipeline finished with code {result.returncode}") + + if result.stdout: + print("[Scheduler] STDOUT:\n", result.stdout) + if result.stderr: + print("[Scheduler] STDERR:\n", result.stderr) + + # Raise an exception if the return code is non-zero + if result.returncode != 0: + raise subprocess.CalledProcessError(result.returncode, result.args, result.stdout, result.stderr) + + except subprocess.CalledProcessError as e: + print(f"[Scheduler] Pipeline execution failed with return code {e.returncode}") + print(f"[Scheduler] STDOUT:\n{e.stdout}") + print(f"[Scheduler] STDERR:\n{e.stderr}") + except Exception as e: + print(f"[Scheduler] Exception running pipeline: {e}") + print(traceback.format_exc()) + + print(f"[Scheduler] Sleeping for {PIPELINE_INTERVAL // 60} minutes...") + time.sleep(PIPELINE_INTERVAL) diff --git a/deployment/supervisord.conf b/deployment/supervisord.conf new file mode 100644 index 0000000000000000000000000000000000000000..81444670956154a17423af0d3e99c7ab1e7dc0ab --- /dev/null +++ b/deployment/supervisord.conf @@ -0,0 +1,65 @@ +[supervisord] +nodaemon=true +logfile=/dev/stdout +logfile_maxbytes=0 +pidfile=/tmp/supervisord.pid +loglevel=info + +[program:gradio] +command=python /app/src/api/gradio_main.py +directory=/app +autostart=true +autorestart=true +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr +stdout_logfile_maxbytes=0 +stderr_logfile_maxbytes=0 +startsecs=10 +startretries=3 +stopwaitsecs=30 +killasgroup=true +stopasgroup=true +environment=PYTHONPATH="/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge" + +[program:nginx] +command=/usr/sbin/nginx -g 'daemon off;' +autostart=true +autorestart=true +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr +stdout_logfile_maxbytes=0 +stderr_logfile_maxbytes=0 +startsecs=5 +startretries=3 +stopwaitsecs=10 + +[program:scheduler] +; wait 180 s before first run, then your scheduler.py handles its own 30 min sleeps +command=/bin/sh -c 'sleep 180 && python /app/deployment/scheduler.py' +directory=/app +autostart=true +autorestart=true +startsecs=0 +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr +stdout_logfile_maxbytes=0 +stderr_logfile_maxbytes=0 +startretries=3 +stopwaitsecs=60 +killasgroup=true +stopasgroup=true + +[program:monitor] +command=python /app/deployment/monitor.py +directory=/app +autostart=true +autorestart=true +startsecs=5 +stdout_logfile=/dev/stdout +stderr_logfile=/dev/stderr +stdout_logfile_maxbytes=0 +stderr_logfile_maxbytes=0 +startretries=3 +stopwaitsecs=10 +killasgroup=true +stopasgroup=true \ No newline at end of file diff --git a/deployment/test_permissions.py b/deployment/test_permissions.py new file mode 100644 index 0000000000000000000000000000000000000000..4ab556da8661f20fd644b1e4efe885a8285ad542 --- /dev/null +++ b/deployment/test_permissions.py @@ -0,0 +1,129 @@ +""" +Test script to verify directory permissions and file creation capabilities. +This script should be run inside the container to verify the fixes. 
+""" +import os +import tempfile +import sys +from pathlib import Path + +def test_directory_permissions(): + """Test if we can create directories and files in the expected locations.""" + + print("=== Directory Permission Test ===") + + # Test directories that should be writable (use /data on Spaces) + test_dirs = [ + "/data/advisorai-data/test", + "/data/merged/test", + "/data/alpaca/test", + "/data/crypto-bubbles/test", + "/data/finnhub/test", + "/data/finviz/test", + "/data/marketaux/test" + ] + + success_count = 0 + for test_dir in test_dirs: + try: + # Try to create directory + os.makedirs(test_dir, mode=0o755, exist_ok=True) + + # Try to create a test file + test_file = os.path.join(test_dir, "test_write.txt") + with open(test_file, 'w') as f: + f.write(f"Test write successful at {test_dir}") + + # Try to read the file back + with open(test_file, 'r') as f: + content = f.read() + + # Clean up + os.remove(test_file) + os.rmdir(test_dir) + + print(f"✅ SUCCESS: {test_dir}") + success_count += 1 + + except Exception as e: + print(f"❌ FAILED: {test_dir} - {e}") + + print(f"\n📊 Results: {success_count}/{len(test_dirs)} directories passed the test") + + if success_count == len(test_dirs): + print("🎉 All directory permission tests PASSED!") + return True + else: + print("⚠️ Some directory permission tests FAILED!") + return False + +def test_user_info(): + """Display current user and process information.""" + print("\n=== User & Process Information ===") + + # Check if running on Windows or Unix + if hasattr(os, 'getuid'): + # Unix/Linux system + print(f"Current UID: {os.getuid()}") + print(f"Current GID: {os.getgid()}") + print(f"Effective UID: {os.geteuid()}") + print(f"Effective GID: {os.getegid()}") + + # Check if running as root + if os.getuid() == 0: + print("✅ Running as root user") + else: + print("ℹ️ Running as non-root user") + else: + # Windows system + print("ℹ️ Running on Windows system") + print(f"Current user: {os.getenv('USERNAME', 'Unknown')}") + + print(f"Process ID: {os.getpid()}") + print(f"Parent Process ID: {os.getppid()}") + +def test_filebase_connectivity(): + """Test if we can load environment variables needed for Filebase.""" + print("\n=== Environment Variables Test ===") + + required_vars = [ + 'FILEBASE_ENDPOINT', + 'FILEBASE_ACCESS_KEY', + 'FILEBASE_SECRET_KEY', + 'FILEBASE_BUCKET' + ] + + missing_vars = [] + for var in required_vars: + value = os.getenv(var) + if value: + # Don't print sensitive values, just show they exist + if 'KEY' in var: + print(f"✅ {var}: ***redacted*** (length: {len(value)})") + else: + print(f"✅ {var}: {value}") + else: + print(f"❌ {var}: NOT SET") + missing_vars.append(var) + + if missing_vars: + print(f"⚠️ Missing environment variables: {missing_vars}") + return False + else: + print("🎉 All required environment variables are set!") + return True + +if __name__ == "__main__": + print("Starting permission and environment tests...\n") + + test_user_info() + perm_test = test_directory_permissions() + env_test = test_filebase_connectivity() + + print(f"\n=== Final Results ===") + if perm_test and env_test: + print("🎉 ALL TESTS PASSED! The container should work correctly.") + sys.exit(0) + else: + print("❌ SOME TESTS FAILED! 
Check the output above for details.") + sys.exit(1) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..4137d84810713fd81ae95ff6636b0a643fc2967b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,31 @@ +# feedparser +# crawl4ai +python-dotenv +requests>=2.25.0 +# pymongo +pandas>=1.3.0 +pyarrow +boto3==1.36.* +finnhub-python==2.4.24 +alpaca-py>=0.6.0 +pydantic-settings>=1.0.0 +sanpy>=0.1.0 +python-dateutil +plotly +nltk +Flask==2.2.2 +werkzeug==2.2.3 +fastapi +uvicorn[standard] +httpx +gradio>=4.0.0 +# trafilatura +rich +numpy +pydantic +# playwright +psutil +beautifulsoup4 +scikit-learn +python-multipart +aiofiles \ No newline at end of file diff --git a/santiment_frequency_controller.py b/santiment_frequency_controller.py new file mode 100644 index 0000000000000000000000000000000000000000..935b7deaf581c734ed8d26a4a56aa527583ea708 --- /dev/null +++ b/santiment_frequency_controller.py @@ -0,0 +1,118 @@ +""" +Santiment Frequency Controller +============================= + +This module provides frequency control for Santiment API calls to preserve API limits. +It tracks execution frequency and limits runs to avoid exceeding API quotas. +""" + +import json +import os +from datetime import datetime, timedelta +from pathlib import Path + + +class SantimentFrequencyController: + """Controls the frequency of Santiment API calls to preserve API limits""" + + def __init__(self, state_file: str = None): + """Initialize the frequency controller + + Args: + state_file: Path to the state file. If None, uses default location. + """ + if state_file is None: + # Try to find the state file in data/santiment directory + try: + from src.config import DATA_DIR + state_file = os.path.join(DATA_DIR, "santiment", "frequency_state.json") + except Exception: + # Fallback to local directory + state_file = "data/santiment/frequency_state.json" + + self.state_file = Path(state_file) + self.state_file.parent.mkdir(parents=True, exist_ok=True) + self._load_state() + + def _load_state(self): + """Load the current state from file""" + if self.state_file.exists(): + try: + with open(self.state_file, 'r') as f: + self.state = json.load(f) + except Exception: + self.state = {} + else: + self.state = {} + + # Ensure required fields exist + if 'last_run' not in self.state: + self.state['last_run'] = None + if 'runs_today' not in self.state: + self.state['runs_today'] = 0 + if 'date' not in self.state: + self.state['date'] = None + + def _save_state(self): + """Save the current state to file""" + try: + with open(self.state_file, 'w') as f: + json.dump(self.state, f, indent=2) + except Exception as e: + print(f"[WARN] Failed to save frequency state: {e}") + + def should_run_santiment(self, max_runs_per_day: int = 2) -> bool: + """Check if Santiment should be allowed to run + + Args: + max_runs_per_day: Maximum number of runs allowed per day + + Returns: + True if Santiment should run, False otherwise + """ + today = datetime.now().strftime("%Y-%m-%d") + + # Reset counter if it's a new day + if self.state.get('date') != today: + self.state['date'] = today + self.state['runs_today'] = 0 + self._save_state() + + # Check if we've exceeded the daily limit + return self.state['runs_today'] < max_runs_per_day + + def record_run(self): + """Record that Santiment has been run""" + today = datetime.now().strftime("%Y-%m-%d") + now = datetime.now().isoformat() + + # Update state + self.state['last_run'] = now + self.state['date'] = today + self.state['runs_today'] = 
self.state.get('runs_today', 0) + 1 + + # Save state + self._save_state() + + print(f"[SANTIMENT] Recorded run #{self.state['runs_today']} for {today}") + + def get_status(self) -> dict: + """Get the current status of the frequency controller + + Returns: + Dictionary with current status information + """ + return { + 'last_run': self.state.get('last_run'), + 'runs_today': self.state.get('runs_today', 0), + 'date': self.state.get('date'), + 'state_file': str(self.state_file) + } + + def reset_daily_count(self): + """Reset the daily run count (for testing or manual reset)""" + today = datetime.now().strftime("%Y-%m-%d") + self.state['date'] = today + self.state['runs_today'] = 0 + self._save_state() + print(f"[SANTIMENT] Reset daily count for {today}") diff --git a/scripts/push_hf_secrets.py b/scripts/push_hf_secrets.py new file mode 100644 index 0000000000000000000000000000000000000000..4f3ff08b0b03712303a224d4ec1b8ef88a890696 --- /dev/null +++ b/scripts/push_hf_secrets.py @@ -0,0 +1,186 @@ +""" +Push all variables from a .env file into a Hugging Face Space as secrets (or variables). + +Requirements: + - huggingface_hub (Python SDK) + Install: pip install -U huggingface_hub + +Usage examples: + python scripts/push_hf_secrets.py --repo your-username/your-space + python scripts/push_hf_secrets.py --repo your-username/your-space --env .env.production + python scripts/push_hf_secrets.py --repo your-username/your-space --dry-run + python scripts/push_hf_secrets.py --repo your-username/your-space --as-variables # send as public variables + +Notes: + - This script is intentionally simple and cross-platform. + - It parses common .env formats (KEY=VALUE, supports quoted values and export prefix). + - It won’t print secret values; only key names are logged. + - "Secrets" are private; "Variables" are public. See: Settings → Secrets and variables +""" + +from __future__ import annotations + +import argparse +import os +import re +import sys +from typing import Dict, Tuple + + +ENV_LINE_RE = re.compile(r"^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*)\s*$") + + +def _unquote(value: str) -> str: + """Strip matching single or double quotes and unescape simple escapes for double quotes. + + - If value is wrapped in double quotes, unescape common sequences (\\n, \\r, \\t, \\" , \\\\). + - If wrapped in single quotes, return inner content as-is (no escapes processing). + - Otherwise, return value trimmed of surrounding whitespace. + """ + if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'): + quote = value[0] + inner = value[1:-1] + if quote == '"': + # Process simple escape sequences + inner = ( + inner.replace(r"\\n", "\n") + .replace(r"\\r", "\r") + .replace(r"\\t", "\t") + .replace(r"\\\"", '"') + .replace(r"\\\\", "\\") + ) + return inner + return value.strip() + + +def parse_env_file(path: str) -> Dict[str, str]: + """Parse a .env-like file into a dict of {KEY: VALUE}. + + Skips blank lines and comments (lines starting with #, ignoring leading whitespace). + Supports lines like: + - KEY=VALUE + - export KEY=VALUE + Values can be quoted with single or double quotes. 
+ """ + if not os.path.isfile(path): + raise FileNotFoundError(f".env file not found: {path}") + + env: Dict[str, str] = {} + with open(path, "r", encoding="utf-8-sig") as f: + for idx, raw in enumerate(f, start=1): + line = raw.rstrip("\n\r") + stripped = line.strip() + if not stripped or stripped.startswith("#"): + continue + + m = ENV_LINE_RE.match(line) + if not m: + # Non-fatal: skip lines that don't match KEY=VALUE + continue + + key, raw_val = m.group(1), m.group(2).strip() + + # If value is unquoted, do not strip inline comments aggressively to avoid breaking tokens. + value = _unquote(raw_val) + env[key] = value + + return env + + +def get_hf_api(): + """Return an authenticated HfApi client or None with a helpful error. + + Uses locally saved token if you previously ran `huggingface-cli login` or + set HF_TOKEN environment variable. + """ + try: + from huggingface_hub import HfApi + except Exception: + sys.stderr.write( + "huggingface_hub is not installed. Install with: pip install -U huggingface_hub\n" + ) + return None + return HfApi() + +def set_secret(api, repo: str, key: str, value: str, dry_run: bool = False) -> int: + if dry_run: + print(f"[DRY RUN] Set secret: {key} -> (hidden) on {repo}") + return 0 + try: + api.add_space_secret(repo_id=repo, key=key, value=value) + print(f"Set secret: {key}") + return 0 + except Exception as e: + sys.stderr.write(f"Error setting secret {key!r} for repo {repo!r}: {e}\n") + return 1 + + +def set_variable(api, repo: str, key: str, value: str, dry_run: bool = False) -> int: + if dry_run: + print(f"[DRY RUN] Set variable: {key} -> (hidden) on {repo}") + return 0 + try: + api.add_space_variable(repo_id=repo, key=key, value=value) + print(f"Set variable: {key}") + return 0 + except Exception as e: + sys.stderr.write(f"Error setting variable {key!r} for repo {repo!r}: {e}\n") + return 1 + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description="Push .env variables to a Hugging Face Space as secrets or variables.") + parser.add_argument("--repo", required=True, help="Space repo id, e.g. 
your-username/your-space") + parser.add_argument("--env", default=".env", help="Path to .env file (default: .env)") + parser.add_argument("--dry-run", action="store_true", help="Print what would be set without applying changes") + parser.add_argument( + "--as-variables", + action="store_true", + help="Send entries as public variables instead of private secrets", + ) + parser.add_argument( + "--exclude", + action="append", + default=[], + help="Key(s) to exclude (can be repeated)", + ) + args = parser.parse_args(argv) + + api = get_hf_api() + if api is None: + return 127 + + try: + env_map = parse_env_file(args.env) + except Exception as e: + sys.stderr.write(f"Failed to read env file {args.env}: {e}\n") + return 2 + + if not env_map: + print("No variables found in .env; nothing to do.") + return 0 + + excluded = set(args.exclude or []) + total = 0 + failures = 0 + for key, value in env_map.items(): + if key in excluded: + continue + total += 1 + if args.as_variables: + rc = set_variable(api, args.repo, key, value, args.dry_run) + else: + rc = set_secret(api, args.repo, key, value, args.dry_run) + if rc != 0: + failures += 1 + + if failures: + sys.stderr.write(f"Completed with {failures}/{total} failures.\n") + return 1 + + print(f"Completed: {total} secrets {'validated' if args.dry_run else 'set'} for {args.repo}.") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/api/gradio_main.py b/src/api/gradio_main.py new file mode 100644 index 0000000000000000000000000000000000000000..76aa9e4325d33162dee7d90177255f56ca521ff9 --- /dev/null +++ b/src/api/gradio_main.py @@ -0,0 +1,265 @@ +import gradio as gr +import json +import os +import sys +import logging +import pandas as pd +import time +from datetime import datetime, timedelta +import psutil +from pathlib import Path + +# Add src to Python path for imports +sys.path.insert(0, '/app/src') +sys.path.insert(0, '/app') + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[logging.StreamHandler(sys.stdout)] +) +logger = logging.getLogger(__name__) + +def get_health_status(): + """Get basic health status""" + try: + # Get process info + process = psutil.Process() + memory_mb = process.memory_info().rss / 1024 / 1024 + cpu_percent = process.cpu_percent() + + # Get system info + memory = psutil.virtual_memory() + disk = psutil.disk_usage('/') + + # Check scheduler status + scheduler_running = False + last_run_time = "Unknown" + try: + last_run_file = "/app/deployment/last_run.txt" + if os.path.exists(last_run_file): + with open(last_run_file, 'r') as f: + last_run_str = f.read().strip() + last_run = datetime.strptime(last_run_str, '%Y-%m-%d %H:%M:%S') + time_since_last_run = (datetime.now() - last_run).total_seconds() + scheduler_running = time_since_last_run < 2700 # 45 minutes + last_run_time = last_run_str + except Exception as e: + logger.warning(f"Could not check scheduler status: {e}") + + return { + "status": "healthy" if memory_mb < 400 else "warning", + "timestamp": datetime.now().isoformat(), + "process_memory_mb": round(memory_mb, 2), + "process_cpu_percent": round(cpu_percent, 2), + "system_memory_percent": round(memory.percent, 1), + "system_memory_available_gb": round(memory.available / (1024**3), 2), + "disk_free_gb": round(disk.free / (1024**3), 2), + "scheduler_running": scheduler_running, + "scheduler_last_run": last_run_time + } + except Exception as e: + logger.error(f"Health check failed: {e}") + return { + 
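+            # Minimal error payload so the dashboard JSON panel still renders when metrics collection fails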
"status": "error", + "error": str(e), + "timestamp": datetime.now().isoformat() + } + +def get_pipeline_status(): + """Get data pipeline status""" + try: + data_dirs = [ + "/data/merged/features", + "/data/merged/train", + "/data/alpaca", + "/data/advisorai-data" + ] + + recent_files = 0 + total_size = 0 + + for data_dir in data_dirs: + if os.path.exists(data_dir): + for root, dirs, files in os.walk(data_dir): + for file in files: + if file.endswith(('.json', '.parquet', '.csv')): + file_path = os.path.join(root, file) + try: + stat = os.stat(file_path) + # Count files modified in last 24 hours + if time.time() - stat.st_mtime < 86400: + recent_files += 1 + total_size += stat.st_size + except Exception: + continue + + return { + "status": "running" if recent_files > 0 else "stale", + "recent_files_24h": recent_files, + "total_data_size_gb": round(total_size / (1024**3), 2), + "last_check": datetime.now().isoformat() + } + except Exception as e: + logger.error(f"Pipeline status check failed: {e}") + return { + "status": "error", + "error": str(e), + "last_check": datetime.now().isoformat() + } + +def get_recent_files(): + """Get list of recent files in the data directories""" + try: + base_paths = [ + "/data/merged/features", + "/data/merged/train", + "/data/alpaca", + "/data/advisorai-data/features" + ] + + recent_files = [] + for base_path in base_paths: + if os.path.exists(base_path): + for root, dirs, files in os.walk(base_path): + for file in files[:10]: # Limit to 10 files per directory + file_path = os.path.join(root, file) + try: + stat = os.stat(file_path) + recent_files.append({ + "File": file, + "Path": file_path.replace("/data/", ""), + "Size": f"{stat.st_size / (1024**2):.2f} MB", + "Modified": datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M") + }) + except Exception: + continue + + # Sort by modification time and take most recent 20 + recent_files.sort(key=lambda x: x["Modified"], reverse=True) + return recent_files[:20] + + except Exception as e: + logger.error(f"Error getting recent files: {e}") + return [{"Error": str(e)}] + +def get_logs(): + """Get recent log entries""" + try: + log_files = [ + "/data/logs/scheduler.log", + "/data/logs/data_pipeline.log", + "/data/logs/monitor.log" + ] + + logs = [] + for log_file in log_files: + if os.path.exists(log_file): + try: + with open(log_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + # Get last 10 lines + recent_lines = lines[-10:] if len(lines) > 10 else lines + logs.append(f"=== {os.path.basename(log_file)} ===\n") + logs.extend(recent_lines) + logs.append("\n") + except Exception as e: + logs.append(f"Error reading {log_file}: {str(e)}\n") + + return "".join(logs) if logs else "No log files found" + + except Exception as e: + logger.error(f"Error getting logs: {e}") + return f"Error getting logs: {str(e)}" + +# Create Gradio interface +with gr.Blocks(title="AdvisorAI Data Pipeline Monitor", theme=gr.themes.Soft()) as app: + gr.Markdown("# 🤖 AdvisorAI Data Pipeline Monitor") + gr.Markdown("Real-time monitoring of the AdvisorAI data collection and processing pipeline") + + with gr.Tabs(): + with gr.TabItem("📊 Dashboard"): + with gr.Row(): + with gr.Column(): + gr.Markdown("### Health Status") + health_display = gr.JSON(label="System Health & Status") + + with gr.Column(): + gr.Markdown("### Pipeline Status") + pipeline_display = gr.JSON(label="Data Pipeline Status") + + with gr.Row(): + refresh_btn = gr.Button("🔄 Refresh", variant="primary") + + with gr.TabItem("📁 Recent Files"): + gr.Markdown("### 
Recently Modified Data Files") + files_display = gr.Dataframe( + headers=["File", "Path", "Size", "Modified"], + datatype=["str", "str", "str", "str"], + label="Recent Files" + ) + refresh_files_btn = gr.Button("🔄 Refresh Files") + + with gr.TabItem("📝 Logs"): + gr.Markdown("### Recent Log Entries") + logs_display = gr.Textbox( + label="Recent Logs", + lines=20, + max_lines=30, + show_copy_button=True + ) + refresh_logs_btn = gr.Button("🔄 Refresh Logs") + + # Event handlers + def refresh_dashboard(): + health = get_health_status() + pipeline = get_pipeline_status() + return json.dumps(health, indent=2), json.dumps(pipeline, indent=2) + + def refresh_files(): + files = get_recent_files() + if files and isinstance(files[0], dict) and "Error" not in files[0]: + return [[f["File"], f["Path"], f["Size"], f["Modified"]] for f in files] + else: + return [["Error", str(files), "", ""]] + + def refresh_logs(): + return get_logs() + + # Connect event handlers + refresh_btn.click( + refresh_dashboard, + outputs=[health_display, pipeline_display] + ) + + refresh_files_btn.click( + refresh_files, + outputs=[files_display] + ) + + refresh_logs_btn.click( + refresh_logs, + outputs=[logs_display] + ) + + # Auto-refresh on load + app.load( + refresh_dashboard, + outputs=[health_display, pipeline_display] + ) + + app.load( + refresh_files, + outputs=[files_display] + ) + +if __name__ == "__main__": + logger.info("Starting Gradio app...") + app.launch( + server_name="0.0.0.0", + server_port=7860, + share=False, + show_error=True, + quiet=False + ) diff --git a/src/api/main.py b/src/api/main.py new file mode 100644 index 0000000000000000000000000000000000000000..06b6e0d135b32e0bee1fec2f18e1e0c28032a8c2 --- /dev/null +++ b/src/api/main.py @@ -0,0 +1,114 @@ +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse, HTMLResponse +import uvicorn +import logging +import sys +from src.api.routes.health import health_status +from src.api.routes.isrunning import is_running + +# Configure logging +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(sys.stdout) + ] +) + +logger = logging.getLogger(__name__) + +app = FastAPI( + title="AdvisorAI Data API", + description="API for AdvisorAI data pipeline and health monitoring", + version="1.0.0" +) + +# Add CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +@app.exception_handler(Exception) +async def global_exception_handler(request, exc): + logger.error(f"Global exception handler caught: {exc}", exc_info=True) + return JSONResponse( + status_code=500, + content={"detail": "Internal server error", "error": str(exc)} + ) + +@app.get('/health') +def health(): + """Enhanced health check endpoint""" + try: + return health_status() + except Exception as e: + logger.error(f"Health check failed: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=f"Health check failed: {str(e)}") + +# Route to check if there are any JSON files under data/merged/features (relative path) +@app.get('/status') +def status(): + """Check if the data pipeline is running and has recent data""" + try: + return is_running() + except Exception as e: + logger.error(f"Status check failed: {e}", exc_info=True) + raise HTTPException(status_code=500, detail=f"Status check failed: {str(e)}") + +@app.get('/', 
response_class=HTMLResponse) +def root(): + """Root endpoint returns simple HTML so HF Spaces iframe can render it.""" + html = """ + + + + + + AdvisorAI Data API + + + +

+      <h1>AdvisorAI Data API</h1>
+      <p>Service is running.</p>

+ + + + """ + return HTMLResponse(content=html, status_code=200) + +@app.get('/api') +def api_root(): + """JSON root for programmatic clients.""" + return { + "message": "AdvisorAI Data API", + "version": "1.0.0", + "endpoints": { + "/health": "Health check with system metrics", + "/status": "Data pipeline status", + "/api": "This JSON endpoint", + "/": "HTML landing page for Spaces" + } + } + +if __name__ == "__main__": + uvicorn.run( + "src.api.main:app", + host="0.0.0.0", + port=10000, + workers=1, + timeout_keep_alive=30, + access_log=True + ) \ No newline at end of file diff --git a/src/api/routes/health.py b/src/api/routes/health.py new file mode 100644 index 0000000000000000000000000000000000000000..944bd4adde0c7e92de15f998a0404982e6ab2c69 --- /dev/null +++ b/src/api/routes/health.py @@ -0,0 +1,67 @@ +import os +import psutil +import time +from datetime import datetime +from src.config import DATA_DIR, LAST_RUN_PATH + +def health_status(): + """Enhanced health check that monitors actual service health""" + try: + # Check memory usage + process = psutil.Process() + memory_mb = process.memory_info().rss / 1024 / 1024 + cpu_percent = process.cpu_percent() + + # Check if scheduler is running + scheduler_running = False + try: + with open(LAST_RUN_PATH, 'r') as f: + last_run_str = f.read().strip() + last_run = datetime.strptime(last_run_str, '%Y-%m-%d %H:%M:%S') + # Consider scheduler healthy if it ran within last 45 minutes + time_since_last_run = (datetime.now() - last_run).total_seconds() + scheduler_running = time_since_last_run < 2700 # 45 minutes + except Exception: + scheduler_running = False + + # Check disk space (prefer DATA_DIR) + disk_usage = psutil.disk_usage(DATA_DIR if os.path.exists(DATA_DIR) else '/') + disk_free_gb = disk_usage.free / (1024**3) + + # Determine overall health + health_issues = [] + # Memory checks + if memory_mb > 1024: # More than 1GB + health_issues.append(f"High memory usage: {memory_mb:.1f}MB (over 1GB)") + elif memory_mb > 512: # More than 512MB for free plan + health_issues.append(f"High memory usage: {memory_mb:.1f}MB (over 512MB)") + + if cpu_percent > 80: + health_issues.append(f"High CPU usage: {cpu_percent:.1f}%") + + if disk_free_gb < 1: # Less than 1GB free + health_issues.append(f"Low disk space: {disk_free_gb:.1f}GB free") + + if not scheduler_running: + health_issues.append("Scheduler not running or stale") + + status = "healthy" if not health_issues else "degraded" + + return { + "status": status, + "timestamp": datetime.now().isoformat(), + "metrics": { + "memory_mb": round(memory_mb, 1), + "cpu_percent": round(cpu_percent, 1), + "disk_free_gb": round(disk_free_gb, 1), + "scheduler_running": scheduler_running + }, + "issues": health_issues + } + + except Exception as e: + return { + "status": "error", + "timestamp": datetime.now().isoformat(), + "error": str(e) + } \ No newline at end of file diff --git a/src/api/routes/isrunning.py b/src/api/routes/isrunning.py new file mode 100644 index 0000000000000000000000000000000000000000..97bab94b8ee35d0d40ac5502a620663277937f6b --- /dev/null +++ b/src/api/routes/isrunning.py @@ -0,0 +1,34 @@ +import os +from datetime import datetime +from fastapi import APIRouter + +from ... 
import config as app_config + +router = APIRouter() + + +@router.get("/status") +def is_running(): + """Return a small status dict: whether pipeline appears to be running and last run time.""" + json_folder = os.path.join(app_config.DATA_DIR, 'merged', 'features') + has_json = False + if os.path.exists(json_folder): + try: + has_json = any(f.endswith('.json') for f in os.listdir(json_folder)) + except Exception: + has_json = False + + last_run_file = app_config.LAST_RUN_PATH + last_run_display = 'Unknown' + try: + if os.path.exists(last_run_file): + with open(last_run_file, 'r') as f: + last_run_str = f.read().strip() + last_run_dt = datetime.strptime(last_run_str, '%Y-%m-%d %H:%M:%S') + minutes_ago = int((datetime.now() - last_run_dt).total_seconds() // 60) + last_run_display = f"{minutes_ago} minutes ago" + except Exception: + last_run_display = 'Unknown' + + status = "Running" if not has_json else "Not Running" + return {"status": status, "last_run": last_run_display} \ No newline at end of file diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000000000000000000000000000000000000..64ed7d60fbdb3242d94bc7863341fa3c374ba723 --- /dev/null +++ b/src/config.py @@ -0,0 +1,66 @@ +import os +import tempfile + + +def _is_writable(path: str) -> bool: + try: + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + test_fd, test_path = tempfile.mkstemp(prefix='.wtest_', dir=path) + os.close(test_fd) + os.unlink(test_path) + return True + except Exception: + return False + + +def _detect_data_dir() -> str: + # 1) Respect DATA_DIR env only if writable + env = os.getenv('DATA_DIR') + if env and _is_writable(env): + return env + # 2) Prefer /data if writable (Spaces) + if _is_writable('/data'): + return '/data' + # 3) Local dev fallback: /app/data if writable + if _is_writable('/app/data'): + return '/app/data' + # 4) Final fallback: /tmp + return '/tmp' + + +DATA_DIR = _detect_data_dir() + +# Logs: prefer DATA_DIR/logs, fallback to /tmp/logs +_preferred_logs = os.getenv('LOG_DIR') or os.path.join(DATA_DIR, 'logs') +try: + os.makedirs(_preferred_logs, exist_ok=True) + # sanity: try to write + if not _is_writable(_preferred_logs): + raise PermissionError("Log dir not writable") +except Exception: + _preferred_logs = '/tmp/logs' + os.makedirs(_preferred_logs, exist_ok=True) + +LOG_DIR = _preferred_logs + +# Path for scheduler's last_run marker +def _compute_last_run_path(base_dir: str) -> str: + candidates = [ + os.path.join(base_dir, 'deployment', 'last_run.txt'), + os.path.join(base_dir, 'last_run.txt'), + '/tmp/last_run.txt', + ] + for p in candidates: + try: + os.makedirs(os.path.dirname(p), exist_ok=True) + # test write + with open(p, 'a'): + pass + return p + except Exception: + continue + return '/tmp/last_run.txt' + + +LAST_RUN_PATH = _compute_last_run_path(DATA_DIR) diff --git a/src/data_cloud/cloud_utils.py b/src/data_cloud/cloud_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..78eeeb352d6071965775f835f93b84fb2e8c9ce6 --- /dev/null +++ b/src/data_cloud/cloud_utils.py @@ -0,0 +1,163 @@ +""" +cloud_utils.py – Unified utilities for HTTP fetch and cloud/local storage operations. 
+ +Provides: + • fetch_content / fetch_json for HTTP GET + • StorageHandler class with upload/download and fallback to local filesystem + - Methods set self.last_mode to 'cloud' or 'local' + - Local files are stored under a base directory + +Usage: + from cloud_utils import StorageHandler, fetch_json + +Requirements: + • boto3 and botocore + • requests + • ENV vars for cloud credentials (e.g. FILEBASE_*) +""" +import os +import errno +import requests +import boto3 +from botocore.config import Config +from botocore.exceptions import BotoCoreError, ClientError + +# HTTP Fetch utilities --------------------------------------------------------- +def fetch_content(url, headers=None, timeout=15): + """Fetch binary content via HTTP GET.""" + resp = requests.get(url, headers=headers, timeout=timeout, stream=False) + resp.raise_for_status() + return resp.content + +def fetch_json(url, headers=None, timeout=15): + """Fetch JSON data via HTTP GET.""" + resp = requests.get(url, headers=headers, timeout=timeout) + resp.raise_for_status() + data = resp.json() + return data.get("data", data) if isinstance(data, dict) else data + +def fetch_text(url, headers=None, timeout=15, encoding='utf-8'): + """Fetch text content via HTTP GET.""" + resp = requests.get(url, headers=headers, timeout=timeout) + resp.raise_for_status() + resp.encoding = encoding + return resp.text + +# Storage Handler --------------------------------------------------------------- +class StorageHandler: + def list_prefix(self, prefix): + """List all object keys in the given S3 prefix. Returns a list of keys. Local fallback returns empty list.""" + if self.s3 and self.bucket: + paginator = self.s3.get_paginator('list_objects_v2') + keys = [] + for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix): + for obj in page.get('Contents', []): + keys.append(obj['Key']) + return keys + # Local fallback: not implemented (could walk local filesystem if needed) + return [] + def __init__(self, endpoint_url, access_key, secret_key, bucket_name, local_base="data"): + """ + Initialize cloud storage client and local base path. + endpoint_url: S3-compatible endpoint URL + bucket_name: target bucket name (if None/empty, operate in local-only mode) + local_base: directory prefix for local fallback files + """ + self.bucket = bucket_name + self.local_base = local_base.rstrip(os.sep) + self.last_mode = None # 'cloud' or 'local' + if bucket_name: + # boto3 client config + cfg = Config(signature_version="s3v4", s3={"addressing_style": "path"}) + self.s3 = boto3.client( + "s3", + endpoint_url=endpoint_url, + aws_access_key_id=access_key, + aws_secret_access_key=secret_key, + config=cfg, + region_name='us-east-1' + ) + else: + self.s3 = None + + def _ensure_local_dir(self, key): + path = os.path.join(self.local_base, key) + os.makedirs(os.path.dirname(path), exist_ok=True) + return path + + def download(self, key): + """Download object by key. Returns bytes, sets last_mode. 
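+        Falls back to the local copy under local_base when S3 is not configured or the S3 fetch fails.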
Raises FileNotFoundError if not found.""" + if self.s3 and self.bucket: + try: + resp = self.s3.get_object(Bucket=self.bucket, Key=key) + data = resp['Body'].read() + self.last_mode = 'cloud' + print(f"[OK] Downloaded {key} from s3://{self.bucket}/{key}") + return data + except (ClientError, BotoCoreError) as e: + print(f"[WARN] Could not download {key} from S3: {e}") + # Always fallback to local if S3 is not configured or download fails + local_path = self._ensure_local_dir(key) + try: + with open(local_path, 'rb') as f: + data = f.read() + self.last_mode = 'local' + print(f"[FALLBACK] Loaded {key} from local {local_path}") + return data + except FileNotFoundError: + print(f"[ERROR] {key} not found in S3 or locally at {local_path}") + raise + + def upload(self, key, data, content_type='application/octet-stream'): + """Upload bytes to cloud, fallback to local. Sets last_mode. Returns True if cloud, False if local.""" + if self.s3 and self.bucket: + try: + self.s3.put_object(Bucket=self.bucket, Key=key, Body=data, ContentType=content_type) + self.last_mode = 'cloud' + print(f"[OK] Uploaded {key} -> s3://{self.bucket}/{key}") + return True + except (ClientError, BotoCoreError) as e: + print(f"[ERROR] Failed uploading {key}: {e}") + # Always fallback to local if S3 is not configured or upload fails + local_path = self._ensure_local_dir(key) + with open(local_path, 'wb') as f: + f.write(data) + self.last_mode = 'local' + print(f"[FALLBACK] Saved {key} locally -> {local_path}") + return False + + def exists(self, key): + """Check for existence of object. Returns True if found in cloud or local.""" + if self.s3 and self.bucket: + try: + self.s3.head_object(Bucket=self.bucket, Key=key) + return True + except (ClientError, BotoCoreError): + pass + local_path = os.path.join(self.local_base, key) + return os.path.exists(local_path) + + def delete(self, key): + """Delete object in cloud or local fallback.""" + if self.s3 and self.bucket: + try: + self.s3.delete_object(Bucket=self.bucket, Key=key) + self.last_mode = 'cloud' + print(f"[OK] Deleted {key} from s3://{self.bucket}/{key}") + return + except Exception: + pass + local_path = os.path.join(self.local_base, key) + try: + os.remove(local_path) + self.last_mode = 'local' + print(f"[FALLBACK] Deleted {key} locally -> {local_path}") + except OSError as e: + if e.errno != errno.ENOENT: + raise + + def get_last_mode(self): + """Return 'cloud' or 'local' depending on last operation.""" + return self.last_mode + +# End of cloud_utils.py diff --git a/src/fetchers/advisorai_data/advisorai_data_fetcher.py b/src/fetchers/advisorai_data/advisorai_data_fetcher.py new file mode 100644 index 0000000000000000000000000000000000000000..888e5306bea158438a8ec5be3491b27cd439f55b --- /dev/null +++ b/src/fetchers/advisorai_data/advisorai_data_fetcher.py @@ -0,0 +1,226 @@ +""" +advisorai_data_fetcher.py – Fetches feature files from AdvisorAI Data API and MongoDB, +then uploads them to Filebase S3 instead of local storage. + +✱ 2025-07-11 – switched backend from local filesystem to Filebase S3 + • Uses boto3 against FILEBASE_ENDPOINT + • No local disk writes; everything streams directly to S3 + +Requirements: + • FILEBASE_ENDPOINT env var, e.g. 
https://s3.filebase.com + • FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY env vars + • FILEBASE_BUCKET env var (your bucket name) + • ADVISORAI_data_API_URL and ADVISORAI_data_API_KEY env vars for the Data API + • MONGODB_URI, MONGODB_DATABASE, MONGODB_COLLECTION_FEATURES env vars for archive fetch +""" + +import os +import sys +import requests +import asyncio +from io import BytesIO + +from dotenv import load_dotenv +import pandas as pd +# from pymongo import MongoClient + + +# Ensure src is in sys.path for direct script execution +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) +from data_cloud.cloud_utils import StorageHandler + +# ─── Configuration ──────────────────────────────────────────────────────────── +load_dotenv() + +# AdvisorAI Data API +API_BASE_URL = os.getenv("ADVISORAI_data_API_URL", "http://localhost:8000") +API_KEY = os.getenv("ADVISORAI_data_API_KEY") +if not API_KEY: + print("[ERROR] ADVISORAI_data_API_KEY must be set") + sys.exit(1) +HEADERS = {"Authorization": f"Bearer {API_KEY}"} + +# MongoDB for archive features +MONGODB_URI = os.getenv("MONGODB_URI", "mongodb://localhost:27017") +MONGODB_DATABASE = os.getenv("MONGODB_DATABASE", "AdvisorAI") +MONGODB_COLLECTION_FEATURES = os.getenv("MONGODB_COLLECTION_FEATURES", "arch_features") + +# Filebase S3 credentials +FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT") +FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY") +FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY") +FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET") +if not all([FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, FILEBASE_BUCKET]): + print("[ERROR] FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, and FILEBASE_BUCKET must be set") + sys.exit(1) + + + +# ─── Fetch and upload functions ─────────────────────────────────────────────── + +def fetch_and_upload_latest_parquet(storage): + """Fetch latest Parquet from API and upload to S3 bucket at features/latest_features.parquet""" + url = f"{API_BASE_URL}/features/latest" + resp = requests.get(url, headers=HEADERS, stream=True) + resp.raise_for_status() + data = resp.content + key = "advisorai-data/features/latest_features.parquet" + try: + storage.upload(key, data, content_type="application/octet-stream") + print(f"[OK] Uploaded latest_features.parquet -> {storage.get_last_mode()}:{key}") + # Also save locally + local_path = os.path.join("data", key) + os.makedirs(os.path.dirname(local_path), exist_ok=True) + with open(local_path, "wb") as f: + f.write(data) + print(f"[OK] Saved locally: {local_path}") + except Exception as e: + print(f"[ERROR] Failed uploading latest_features.parquet: {e}", file=sys.stderr) + +async def fetch_and_upload_jsons(storage): + """List JSON feature files, fetch them, and upload to S3 under features/""" + url = f"{API_BASE_URL}/features" + resp = requests.get(url, headers=HEADERS) + resp.raise_for_status() + files = resp.json().get("files", []) + json_files = [f["filename"] for f in files if f.get("file_type") == "json"] + if not json_files: + print("[INFO] No JSON feature files to upload.") + return + # Delete all old feature_report_*.json files before saving any new ones (both locally and on S3) + import glob + import os + # Local delete (as before) + features_dir = os.path.join("data", "advisorai-data", "features") + report_files = glob.glob(os.path.join(features_dir, "feature_report_*.json")) + for old_report in report_files: + try: + os.remove(old_report) + print(f"[INFO] Deleted old local 
report: {old_report}") + except Exception as e: + print(f"[WARN] Could not delete local {old_report}: {e}", file=sys.stderr) + + # S3 delete (list all files in the prefix and filter manually) + try: + s3_files = storage.list_prefix("advisorai-data/features/") + s3_report_files = [f for f in s3_files if f.startswith("advisorai-data/features/feature_report_") and f.endswith(".json")] + for s3_report in s3_report_files: + try: + storage.delete(s3_report) + print(f"[INFO] Deleted old S3 report: {s3_report}") + except Exception as e: + print(f"[WARN] Could not delete S3 {s3_report}: {e}", file=sys.stderr) + except Exception as e: + print(f"[WARN] Could not list/delete S3 feature_report_*.json: {e}", file=sys.stderr) + + for fname in json_files: + dl_url = f"{API_BASE_URL}/features/{fname}" + r = requests.get(dl_url, headers=HEADERS, stream=True) + r.raise_for_status() + data = r.content + key = f"advisorai-data/features/{fname}" + try: + storage.upload(key, data, content_type="application/json") + print(f"[OK] Uploaded {fname} -> {storage.get_last_mode()}:{key}") + # Also save locally + local_path = os.path.join("data", key) + os.makedirs(os.path.dirname(local_path), exist_ok=True) + with open(local_path, "wb") as f: + f.write(data) + print(f"[OK] Saved locally: {local_path}") + except Exception as e: + print(f"[ERROR] Failed uploading {fname}: {e}", file=sys.stderr) + +# async def fetch_and_upload_archive_parquet(storage): +# """Fetch archive from MongoDB, convert to Parquet, and upload to S3 at archive/merged_features.parquet""" +# client = MongoClient(MONGODB_URI) +# db = client[MONGODB_DATABASE] +# coll = db[MONGODB_COLLECTION_FEATURES] +# docs = list(coll.find()) +# if not docs: +# print("[INFO] No documents in archive collection.") +# return +# for d in docs: +# d.pop("_id", None) +# df = pd.DataFrame(docs) +# buf = BytesIO() +# df.to_parquet(buf, index=False) +# data = buf.getvalue() +# key = "advisorai-data/archive/merged_features.parquet" +# try: +# storage.upload(key, data, content_type="application/octet-stream") +# print(f"[OK] Uploaded archive Parquet -> {storage.get_last_mode()}:{key}") +# # Also save locally +# local_path = os.path.join("data", key) +# os.makedirs(os.path.dirname(local_path), exist_ok=True) +# with open(local_path, "wb") as f: +# f.write(data) +# print(f"[OK] Saved locally: {local_path}") +# except Exception as e: +# print(f"[ERROR] Failed uploading archive Parquet: {e}", file=sys.stderr) + +def create_train_merged_parquet(storage): + """Create advisorai-data/train/merged_features.parquet by merging archive and latest features, deduping by (symbol, interval_timestamp).""" + # Download archive/merged_features.parquet + from io import BytesIO + import pandas as pd + archive_key = "advisorai-data/archive/merged_features.parquet" + latest_key = "advisorai-data/features/latest_features.parquet" + train_key = "advisorai-data/train/merged_features.parquet" + try: + archive_buf = BytesIO(storage.download(archive_key)) + df_archive = pd.read_parquet(archive_buf) + except Exception as e: + print(f"[WARN] Could not load archive parquet: {e}", file=sys.stderr) + df_archive = pd.DataFrame() + try: + latest_buf = BytesIO(storage.download(latest_key)) + df_latest = pd.read_parquet(latest_buf) + except Exception as e: + print(f"[WARN] Could not load latest features parquet: {e}", file=sys.stderr) + df_latest = pd.DataFrame() + if df_archive.empty and df_latest.empty: + print("[INFO] No data to merge for train/merged_features.parquet.") + return + # Concatenate and deduplicate by 
(symbol, interval_timestamp) + df_all = pd.concat([df_archive, df_latest], ignore_index=True) + if 'symbol' in df_all.columns and 'interval_timestamp' in df_all.columns: + df_all = df_all.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last") + else: + print("[WARN] 'symbol' or 'interval_timestamp' column missing, skipping deduplication.") + # Save to train/merged_features.parquet + buf = BytesIO() + df_all.to_parquet(buf, index=False) + data = buf.getvalue() + try: + storage.upload(train_key, data, content_type="application/octet-stream") + print(f"[OK] Uploaded train merged features -> {storage.get_last_mode()}:{train_key}") + # Also save locally + local_path = os.path.join("data", train_key) + os.makedirs(os.path.dirname(local_path), exist_ok=True) + with open(local_path, "wb") as f: + f.write(data) + print(f"[OK] Saved locally: {local_path}") + except Exception as e: + print(f"[ERROR] Failed uploading train merged features: {e}", file=sys.stderr) + +# ─── Main entrypoint ───────────────────────────────────────────────────────── + +def main(): + # Use StorageHandler with both S3 and local enabled + storage = StorageHandler( + endpoint_url=FILEBASE_ENDPOINT, + access_key=FILEBASE_ACCESS_KEY, + secret_key=FILEBASE_SECRET_KEY, + bucket_name=FILEBASE_BUCKET, + local_base="data" + ) + fetch_and_upload_latest_parquet(storage) + asyncio.run(fetch_and_upload_jsons(storage)) + # asyncio.run(fetch_and_upload_archive_parquet(storage)) + create_train_merged_parquet(storage) + +if __name__ == "__main__": + main() diff --git a/src/fetchers/alpaca_api/__init__.py b/src/fetchers/alpaca_api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..816f6d36487f974eb8071b8c265b9ceabc1ce140 --- /dev/null +++ b/src/fetchers/alpaca_api/__init__.py @@ -0,0 +1,32 @@ +# alpaca/__init__.py + +from .config import settings +from .clients import StocksClient, CryptoClient, OptionsClient +from .fetchers import ( + fetch_stock_bars, + fetch_crypto_bars, + fetch_option_bars, + fetch_stock_trades, + fetch_crypto_trades, + fetch_stock_quotes, + fetch_crypto_quotes, +) +from .utils import logger, backoff, to_rfc3339, parse_rfc3339 + +__all__ = [ + "settings", + "StocksClient", + "CryptoClient", + "OptionsClient", + "fetch_stock_bars", + "fetch_crypto_bars", + "fetch_option_bars", + "fetch_stock_trades", + "fetch_crypto_trades", + "fetch_stock_quotes", + "fetch_crypto_quotes", + "logger", + "backoff", + "to_rfc3339", + "parse_rfc3339", +] diff --git a/src/fetchers/alpaca_api/clients/__init__.py b/src/fetchers/alpaca_api/clients/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..562a80eb157b30ca070d6cb8d3c71641cda96ea9 --- /dev/null +++ b/src/fetchers/alpaca_api/clients/__init__.py @@ -0,0 +1,7 @@ +# alpaca/clients/__init__.py + +from .stocks import StocksClient +from .crypto import CryptoClient +from .options import OptionsClient + +__all__ = ["StocksClient", "CryptoClient", "OptionsClient"] diff --git a/src/fetchers/alpaca_api/clients/crypto.py b/src/fetchers/alpaca_api/clients/crypto.py new file mode 100644 index 0000000000000000000000000000000000000000..42faa5e6b7f65f4bba0c2adaf83bfb1431caf5b9 --- /dev/null +++ b/src/fetchers/alpaca_api/clients/crypto.py @@ -0,0 +1,95 @@ +# alpaca/clients/crypto.py + +from datetime import datetime +from typing import Optional +import re +from alpaca.data.historical import CryptoHistoricalDataClient +from alpaca.data.requests import ( + CryptoBarsRequest, + CryptoTradesRequest, + CryptoQuoteRequest, +) +from 
alpaca.data.timeframe import TimeFrame, TimeFrameUnit +from ..config import settings + +class CryptoClient: + def __init__(self): + # You can omit api_key/secret for crypto, but providing them raises rate limits + self.client = CryptoHistoricalDataClient( + api_key=settings.ALPACA_API_KEY, + secret_key=settings.ALPACA_API_SECRET, + ) + + def get_bars( + self, + symbol: str, + timeframe: str | TimeFrame, + start: datetime, + end: datetime, + limit: int = 1000, + feed: Optional[str] = None, + ): + """ + Fetch historical OHLCV bars for a given crypto symbol. + Accepts either a TimeFrame enum or a string like "1Day", "5Minute", etc. + """ + if isinstance(timeframe, str): + m = re.fullmatch(r"(\d+)([A-Za-z]+)", timeframe) + if not m: + raise ValueError(f"Invalid timeframe format: {timeframe!r}") + amt, unit_str = m.groups() + unit_key = unit_str.capitalize().rstrip("s") + unit = TimeFrameUnit[unit_key] + timeframe = TimeFrame(int(amt), unit) + req = CryptoBarsRequest( + symbol_or_symbols=symbol, + timeframe=timeframe, + start=start, + end=end, + limit=limit, + feed=feed, + ) + return self.client.get_crypto_bars(req) + # ↳ uses CryptoBarsRequest(symbol_or_symbols, timeframe, start, end, limit, feed) :contentReference[oaicite:0]{index=0} + + def get_trades( + self, + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: Optional[str] = None, + ): + """ + Fetch historical trade ticks for a given crypto symbol. + """ + req = CryptoTradesRequest( + symbol_or_symbols=symbol, + start=start, + end=end, + limit=limit, + sort=sort, + ) + return self.client.get_crypto_trades(req) + # ↳ uses CryptoTradesRequest(symbol_or_symbols, start, end, limit, sort) :contentReference[oaicite:1]{index=1} + + def get_quotes( + self, + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: Optional[str] = None, + ): + """ + Fetch historical Level-1 quotes for a given crypto symbol. 
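+
+        Illustrative example (assumes start/end are datetimes):
+            quotes = CryptoClient().get_quotes("BTC/USD", start, end, limit=100)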
+ """ + req = CryptoQuoteRequest( + symbol_or_symbols=symbol, + start=start, + end=end, + limit=limit, + sort=sort, + ) + return self.client.get_crypto_quotes(req) + # ↳ uses CryptoQuoteRequest(symbol_or_symbols, start, end, limit, sort) :contentReference[oaicite:2]{index=2} diff --git a/src/fetchers/alpaca_api/clients/main.py b/src/fetchers/alpaca_api/clients/main.py new file mode 100644 index 0000000000000000000000000000000000000000..c9e2adab8cfa53ef72013d4eab9fb4051b3e3b78 --- /dev/null +++ b/src/fetchers/alpaca_api/clients/main.py @@ -0,0 +1,45 @@ +# from datetime import datetime, timedelta +# import sys +# import os +# import pandas as pd +# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) +# from alpaca_api.clients.stocks import StocksClient + +# def normalize_records(records): +# dicts = [rec.model_dump() for rec in records] +# for rec in dicts: +# for k, v in rec.items(): +# if hasattr(v, 'isoformat'): +# rec[k] = v.isoformat() +# return dicts + +# if __name__ == "__main__": +# client = StocksClient() +# symbol = "AAPL" +# timeframe = "1Day" +# end = datetime.utcnow() +# start = end - timedelta(days=7) + +# output_dir = os.path.join("..", "..", "..", "data", "alpaca") +# os.makedirs(output_dir, exist_ok=True) + +# print(f"Testing get_bars for {symbol} from {start} to {end}") +# bars = client.get_bars(symbol, timeframe, start, end, limit=10) +# # print("Bars:", bars) +# bars_records = normalize_records(bars.data[symbol]) +# bars_df = pd.DataFrame(bars_records) +# bars_df.to_parquet(os.path.join(output_dir, f"{symbol}_bars.parquet"), index=False) + +# print(f"Testing get_trades for {symbol} from {start} to {end}") +# trades = client.get_trades(symbol, start, end, limit=10) +# # print("Trades:", trades) +# trades_records = normalize_records(trades.data[symbol]) +# trades_df = pd.DataFrame(trades_records) +# trades_df.to_parquet(os.path.join(output_dir, f"{symbol}_trades.parquet"), index=False) + +# print(f"Testing get_quotes for {symbol} from {start} to {end}") +# quotes = client.get_quotes(symbol, start, end, limit=10) +# # print("Quotes:", quotes) +# quotes_records = normalize_records(quotes.data[symbol]) +# quotes_df = pd.DataFrame(quotes_records) +# quotes_df.to_parquet(os.path.join(output_dir, f"{symbol}_quotes.parquet"), index=False) diff --git a/src/fetchers/alpaca_api/clients/options.py b/src/fetchers/alpaca_api/clients/options.py new file mode 100644 index 0000000000000000000000000000000000000000..acdd3ef2bcfad85fdefb0700efb50d191450b4b5 --- /dev/null +++ b/src/fetchers/alpaca_api/clients/options.py @@ -0,0 +1,72 @@ +# alpaca/clients/options.py + +from datetime import datetime +from typing import Optional, Union +import re +from alpaca.data.historical import OptionHistoricalDataClient +from alpaca.data.requests import ( + OptionBarsRequest, + OptionTradesRequest, +) +from alpaca.data.timeframe import TimeFrame, TimeFrameUnit +from ..config import settings + +class OptionsClient: + def __init__(self): + self.client = OptionHistoricalDataClient( + api_key=settings.ALPACA_API_KEY, + secret_key=settings.ALPACA_API_SECRET, + ) + + def get_bars( + self, + symbol: str, + timeframe: Union[str, TimeFrame], + start: datetime, + end: datetime, + limit: int = 1000, + sort: Optional[str] = None, + ): + """ + Fetch historical OHLCV bars for a given option contract. + Accepts either a TimeFrame enum or a string like "1Day", "5Minute", etc. 
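+
+        Illustrative example (OCC-style contract symbol assumed):
+            bars = OptionsClient().get_bars("AAPL250718C00150000", "1Day", start, end)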
+ """ + if isinstance(timeframe, str): + m = re.fullmatch(r"(\d+)([A-Za-z]+)", timeframe) + if not m: + raise ValueError(f"Invalid timeframe format: {timeframe!r}") + amount, unit_str = m.groups() + unit_key = unit_str.capitalize().rstrip("s") + unit = TimeFrameUnit[unit_key] + timeframe = TimeFrame(int(amount), unit) + req = OptionBarsRequest( + symbol_or_symbols=symbol, + timeframe=timeframe, + start=start, + end=end, + limit=limit, + sort=sort, + ) + return self.client.get_option_bars(req) + # ↳ uses OptionBarsRequest(symbol_or_symbols, timeframe, start, end, limit, sort) :contentReference[oaicite:0]{index=0} + + def get_trades( + self, + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: Optional[str] = None, + ): + """ + Fetch historical trade ticks for a given option contract. + """ + req = OptionTradesRequest( + symbol_or_symbols=symbol, + start=start, + end=end, + limit=limit, + sort=sort, + ) + return self.client.get_option_trades(req) + # ↳ uses OptionTradesRequest(symbol_or_symbols, start, end, limit, sort) :contentReference[oaicite:1]{index=1} diff --git a/src/fetchers/alpaca_api/clients/stocks.py b/src/fetchers/alpaca_api/clients/stocks.py new file mode 100644 index 0000000000000000000000000000000000000000..f97b3e034e3710f61d0dbc546554a1488f7799db --- /dev/null +++ b/src/fetchers/alpaca_api/clients/stocks.py @@ -0,0 +1,90 @@ +# alpaca_api/clients/stocks.py + +from datetime import datetime +import re +from alpaca.data.historical import StockHistoricalDataClient +from alpaca.data.timeframe import TimeFrame, TimeFrameUnit +from alpaca.data.requests import StockBarsRequest, StockTradesRequest, StockQuotesRequest, DataFeed +import sys, os +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))) +from alpaca_api.config import settings + +class StocksClient: + def __init__(self): + self.client = StockHistoricalDataClient( + api_key=settings.ALPACA_API_KEY, + secret_key=settings.ALPACA_API_SECRET, + ) + + def get_bars( + self, + symbol: str, + timeframe: str | TimeFrame, + start: datetime, + end: datetime, + limit: int = 1000, + ): + """ + Fetch historical OHLCV bars for a given stock. + Accepts either a TimeFrame enum or a string like "1Day", "5Minute", etc. + """ + if isinstance(timeframe, str): + m = re.fullmatch(r"(\d+)([A-Za-z]+)", timeframe) + if not m: + raise ValueError(f"Invalid timeframe format: {timeframe!r}") + amount_str, unit_str = m.groups() + # Normalize unit name to match TimeFrameUnit keys (Minute, Hour, Day, Week, Month) + unit_key = unit_str.capitalize().rstrip("s") + unit = TimeFrameUnit[unit_key] + timeframe = TimeFrame(int(amount_str), unit) + # Now we have a proper TimeFrame instance + req = StockBarsRequest( + symbol_or_symbols=symbol, + timeframe=timeframe, + start=start, + end=end, + limit=limit, + feed=DataFeed.IEX, # use IEX for free delayed data + ) + return self.client.get_stock_bars(req) + # ↳ requires StockBarsRequest(symbol_or_symbols, timeframe, start, end, limit) :contentReference[oaicite:0]{index=0} + + def get_trades( + self, + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + ): + """ + Fetch historical trade ticks for a given stock. 
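+
+        Illustrative example (assumes start/end are datetimes):
+            trades = StocksClient().get_trades("AAPL", start, end, limit=500)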
+ """ + req = StockTradesRequest( + symbol_or_symbols=symbol, + start=start, + end=end, + limit=limit, + feed=DataFeed.IEX, # use IEX for free delayed trade data + ) + return self.client.get_stock_trades(req) + # ↳ takes symbol_or_symbols, start, end, limit :contentReference[oaicite:1]{index=1} + + def get_quotes( + self, + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + ): + """ + Fetch historical Level-1 quotes (bid/ask) for a given stock. + """ + req = StockQuotesRequest( + symbol_or_symbols=symbol, + start=start, + end=end, + limit=limit, + feed=DataFeed.IEX, # use IEX for free delayed quote data + ) + return self.client.get_stock_quotes(req) + # ↳ takes symbol_or_symbols, start, end, limit :contentReference[oaicite:2]{index=2} diff --git a/src/fetchers/alpaca_api/config.py b/src/fetchers/alpaca_api/config.py new file mode 100644 index 0000000000000000000000000000000000000000..21658cec2f15140699b4ee6e8f65f3b67a48fc04 --- /dev/null +++ b/src/fetchers/alpaca_api/config.py @@ -0,0 +1,17 @@ +# alpaca/config.py + +from pydantic_settings import BaseSettings, SettingsConfigDict + +class Settings(BaseSettings): + ALPACA_API_KEY: str + ALPACA_API_SECRET: str + ALPACA_BASE_URL: str = "https://paper-api.alpaca.markets/v2" + PAPER: bool = True + + model_config = SettingsConfigDict( + env_file=".env", + env_file_encoding="utf-8", + extra="ignore", # allow all other .env keys without error + ) + +settings = Settings() diff --git a/src/fetchers/alpaca_api/fetchers/__init__.py b/src/fetchers/alpaca_api/fetchers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b974a4828d7382e8c2b7b7222005e652f1bfc974 --- /dev/null +++ b/src/fetchers/alpaca_api/fetchers/__init__.py @@ -0,0 +1,15 @@ +# alpaca/fetchers/__init__.py + +from .bars import fetch_stock_bars, fetch_crypto_bars, fetch_option_bars +from .trades import fetch_stock_trades, fetch_crypto_trades +from .quotes import fetch_stock_quotes, fetch_crypto_quotes + +__all__ = [ + "fetch_stock_bars", + "fetch_crypto_bars", + "fetch_option_bars", + "fetch_stock_trades", + "fetch_crypto_trades", + "fetch_stock_quotes", + "fetch_crypto_quotes", +] diff --git a/src/fetchers/alpaca_api/fetchers/bars.py b/src/fetchers/alpaca_api/fetchers/bars.py new file mode 100644 index 0000000000000000000000000000000000000000..cdae456766c66e26ab30bc2d2bae21e32ddaa844 --- /dev/null +++ b/src/fetchers/alpaca_api/fetchers/bars.py @@ -0,0 +1,58 @@ +# alpaca/fetchers/bars.py + +from datetime import datetime +from ..clients.stocks import StocksClient +from ..clients.crypto import CryptoClient +from ..clients.options import OptionsClient +from ..utils import backoff, logger + +# instantiate once +stocks_client = StocksClient() +crypto_client = CryptoClient() +options_client = OptionsClient() + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_stock_bars( + symbol: str, + start: datetime, + end: datetime, + timeframe: str, + limit: int = 1000, +): + """ + Fetch OHLCV bars for a stock, with retry/back-off and logging. + """ + logger.info(f"Fetching stock bars: {symbol=} {timeframe=} {start=} to {end=} limit={limit}") + return stocks_client.get_bars(symbol, timeframe, start, end, limit) + + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_crypto_bars( + symbol: str, + start: datetime, + end: datetime, + timeframe: str, + limit: int = 1000, + feed: str | None = None, +): + """ + Fetch OHLCV bars for a crypto, with retry/back-off and logging. 
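+
+    Illustrative example (note the symbol/start/end/timeframe argument order):
+        bars = fetch_crypto_bars("BTC/USD", start, end, "1Day", limit=1000)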
+ """ + logger.info(f"Fetching crypto bars: {symbol=} {timeframe=} {start=} to {end=} limit={limit} feed={feed}") + return crypto_client.get_bars(symbol, timeframe, start, end, limit, feed) + + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_option_bars( + symbol: str, + start: datetime, + end: datetime, + timeframe: str, + limit: int = 1000, + sort: str | None = None, +): + """ + Fetch OHLCV bars for an option contract, with retry/back-off and logging. + """ + logger.info(f"Fetching option bars: {symbol=} {timeframe=} {start=} to {end=} limit={limit} sort={sort}") + return options_client.get_bars(symbol, timeframe, start, end, limit, sort) diff --git a/src/fetchers/alpaca_api/fetchers/quotes.py b/src/fetchers/alpaca_api/fetchers/quotes.py new file mode 100644 index 0000000000000000000000000000000000000000..89af096c777550586bc958b128e54139b590714b --- /dev/null +++ b/src/fetchers/alpaca_api/fetchers/quotes.py @@ -0,0 +1,40 @@ +# alpaca/fetchers/quotes.py + +from datetime import datetime +from ..clients.stocks import StocksClient +from ..clients.crypto import CryptoClient +from ..utils import backoff, logger + +# instantiate clients once +stocks_client = StocksClient() +crypto_client = CryptoClient() + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_stock_quotes( + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: str | None = None, +): + """ + Fetch historical Level-1 quotes (bid/ask) for a stock, with retry/back-off and logging. + """ + logger.info(f"Fetching stock quotes: symbol={symbol}, start={start}, end={end}, limit={limit}, sort={sort}") + return stocks_client.get_quotes(symbol, start, end, limit) + # ↳ uses StockQuotesRequest(symbol_or_symbols, start, end, limit) :contentReference[oaicite:0]{index=0} + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_crypto_quotes( + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: str | None = None, +): + """ + Fetch historical Level-1 quotes for a crypto symbol, with retry/back-off and logging. + """ + logger.info(f"Fetching crypto quotes: symbol={symbol}, start={start}, end={end}, limit={limit}, sort={sort}") + return crypto_client.get_quotes(symbol, start, end, limit) + # ↳ uses CryptoQuoteRequest(symbol_or_symbols, start, end, limit, sort) :contentReference[oaicite:1]{index=1} diff --git a/src/fetchers/alpaca_api/fetchers/trades.py b/src/fetchers/alpaca_api/fetchers/trades.py new file mode 100644 index 0000000000000000000000000000000000000000..060ab1d798530c007b1a1082a9e750caca57706e --- /dev/null +++ b/src/fetchers/alpaca_api/fetchers/trades.py @@ -0,0 +1,38 @@ +# alpaca/fetchers/trades.py + +from datetime import datetime +from ..clients.stocks import StocksClient +from ..clients.crypto import CryptoClient +from ..utils import backoff, logger + +# instantiate clients once +stocks_client = StocksClient() +crypto_client = CryptoClient() + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_stock_trades( + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: str | None = None, +): + """ + Fetch historical trade ticks for a stock, with retry/back-off and logging. 
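+
+    Illustrative example (assumes start/end are datetimes):
+        trades = fetch_stock_trades("AAPL", start, end, limit=1000)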
+ """ + logger.info(f"Fetching stock trades: symbol={symbol}, start={start}, end={end}, limit={limit}, sort={sort}") + return stocks_client.get_trades(symbol, start, end, limit) + +@backoff(max_retries=5, base_delay=1, factor=2) +def fetch_crypto_trades( + symbol: str, + start: datetime, + end: datetime, + limit: int = 1000, + sort: str | None = None, +): + """ + Fetch historical trade ticks for a crypto symbol, with retry/back-off and logging. + """ + logger.info(f"Fetching crypto trades: symbol={symbol}, start={start}, end={end}, limit={limit}, sort={sort}") + return crypto_client.get_trades(symbol, start, end, limit) diff --git a/src/fetchers/alpaca_api/main.py b/src/fetchers/alpaca_api/main.py new file mode 100644 index 0000000000000000000000000000000000000000..5ef5e0592065c30d27e2428097ca895589f6410d --- /dev/null +++ b/src/fetchers/alpaca_api/main.py @@ -0,0 +1,193 @@ +def normalize_crypto_symbol(sym: str) -> str: + return sym if "/" in sym else f"{sym}/USD" +import os +import sys +from datetime import datetime, timedelta + +import pandas as pd + + +# Add src/fetchers to sys.path for direct execution +base = os.path.dirname(__file__) +src_fetchers = os.path.abspath(os.path.join(base, "..")) +sys.path.insert(0, src_fetchers) + +from alpaca_api.fetchers import ( + fetch_stock_bars, + fetch_stock_trades, + fetch_stock_quotes, + fetch_crypto_bars, + fetch_crypto_trades, + fetch_option_bars, +) +from alpaca_api.config import settings + +def normalize_records(records): + """Convert Pydantic models to ISO-format dicts.""" + dicts = [rec.model_dump() for rec in records] + for rec in dicts: + for k, v in rec.items(): + if hasattr(v, "isoformat"): + rec[k] = v.isoformat() + return dicts + +def save_df(df: pd.DataFrame, fname: str): + out = os.path.join("data", "alpaca", fname) + os.makedirs(os.path.dirname(out), exist_ok=True) + + # Check if file exists and implement incremental loading + if os.path.exists(out): + try: + existing_df = pd.read_parquet(out) + print(f"-> existing data has {len(existing_df)} records") + + # Combine and remove duplicates based on timestamp and symbol + combined_df = pd.concat([existing_df, df], ignore_index=True) + + # Remove duplicates keeping the latest record + if 'timestamp' in combined_df.columns and 'symbol' in combined_df.columns: + combined_df = combined_df.drop_duplicates(subset=['timestamp', 'symbol'], keep='last') + elif 'timestamp' in combined_df.columns: + combined_df = combined_df.drop_duplicates(subset=['timestamp'], keep='last') + + # Sort by timestamp for consistency + if 'timestamp' in combined_df.columns: + combined_df = combined_df.sort_values('timestamp') + + combined_df.to_parquet(out, index=False) + print(f"-> updated {out} with {len(combined_df)} total records ({len(df)} new)") + except Exception as e: + print(f"-> error merging with existing data: {e}, overwriting") + df.to_parquet(out, index=False) + print(f"-> wrote {out} with {len(df)} records") + else: + df.to_parquet(out, index=False) + print(f"-> wrote {out} with {len(df)} records") + +def main(): + # you can also read these from os.getenv or settings if you prefer + stock_symbols = ["AAPL", "TSLA", "GOOGL", "MSFT", "NVDA", "COIN"] # Added COIN + crypto_symbols = ["BTC", "ETH", "SOL", "ADA", "XRP"] + # option symbols use the Alpaca format: "___" + # option_symbols = ["AAPL_20250718_150_C", "TSLA_20250718_700_P"] + + def normalize_option_symbol(sym: str) -> str: + # expects “UNDERLYING_YYYYMMDD_STRIKE_C” or “P” + underlying, ymd, strike, cp = sym.split("_") + yymmdd = ymd[2:] # 
“20250718” → “250718” + amt = int(float(strike) * 1000) + strike_str = f"{amt:08d}" + return f"{underlying}{yymmdd}{cp}{strike_str}" + days = "1Day" + + end = datetime.utcnow() + + # Check for existing data to determine start date + def get_start_date_for_symbol(symbol, data_type="bars"): + fname = f"{symbol}_{data_type}.parquet" + out = os.path.join("data", "alpaca", fname) + + if os.path.exists(out): + try: + existing_df = pd.read_parquet(out) + if not existing_df.empty and 'timestamp' in existing_df.columns: + # Get the latest timestamp and add 1 day to avoid duplicates + latest_timestamp = pd.to_datetime(existing_df['timestamp'].max()) + start_from_latest = latest_timestamp + timedelta(days=1) + + # Don't go back more than 30 days from now to limit data size + max_lookback = end - timedelta(days=30) + start_date = max(start_from_latest, max_lookback) + + print(f"-> {symbol} {data_type}: continuing from {start_date}") + return start_date + except Exception as e: + print(f"-> error reading existing {fname}: {e}") + + # Default: get last 30 days for new symbols + default_start = end - timedelta(days=30) + print(f"-> {symbol} {data_type}: starting fresh from {default_start}") + return default_start + + # STOCKS: bars, trades, quotes + for sym in stock_symbols: + print(f"\nFetching stock data for {sym}:") + + # Get appropriate start dates for each data type + start_bars = get_start_date_for_symbol(sym, "bars") + start_trades = get_start_date_for_symbol(sym, "trades") + start_quotes = get_start_date_for_symbol(sym, "quotes") + + # Only fetch if there's a meaningful time range + if start_bars < end: + bars = fetch_stock_bars(sym, start_bars, end, days, limit=1000) # Increased limit + save_df(pd.DataFrame(normalize_records(bars.data[sym])), f"{sym}_bars.parquet") + else: + print(f"-> {sym} bars: no new data to fetch") + + if start_trades < end: + trades = fetch_stock_trades(sym, start_trades, end, limit=1000) # Increased limit + save_df(pd.DataFrame(normalize_records(trades.data[sym])), f"{sym}_trades.parquet") + else: + print(f"-> {sym} trades: no new data to fetch") + + if start_quotes < end: + quotes = fetch_stock_quotes(sym, start_quotes, end, limit=1000) # Increased limit + save_df(pd.DataFrame(normalize_records(quotes.data[sym])), f"{sym}_quotes.parquet") + else: + print(f"-> {sym} quotes: no new data to fetch") + + # CRYPTO: bars, trades + for sym in crypto_symbols: + pair = normalize_crypto_symbol(sym) + print(f"\nFetching crypto data for {pair}:") + try: + # Get appropriate start dates for crypto data + start_bars = get_start_date_for_symbol(pair.replace('/', '_'), "bars") + start_trades = get_start_date_for_symbol(pair.replace('/', '_'), "trades") + + # Only fetch if there's a meaningful time range + bar_records = [] + trade_records = [] + + if start_bars < end: + bars = fetch_crypto_bars(pair, start_bars, end, days, limit=1000) # Increased limit + bar_records = bars.data.get(pair, []) + else: + print(f"-> {pair} bars: no new data to fetch") + + if start_trades < end: + trades = fetch_crypto_trades(pair, start_trades, end, limit=1000) # Increased limit + trade_records = trades.data.get(pair, []) + else: + print(f"-> {pair} trades: no new data to fetch") + + if bar_records: + save_df( + pd.DataFrame(normalize_records(bar_records)), + f"{pair.replace('/', '_')}_bars.parquet", + ) + else: + print(f"-> no bar data for {pair}, skipping") + + if trade_records: + save_df( + pd.DataFrame(normalize_records(trade_records)), + f"{pair.replace('/', '_')}_trades.parquet", + ) + else: + print(f"-> 
no trade data for {pair}, skipping") + + except Exception as e: + print(f"⚠️ error fetching {pair}: {e!r}, skipping") + continue + + # # OPTIONS: bars only + # for sym in option_symbols: + # occ = normalize_option_symbol(sym) + # print(f"\nFetching option bars for {occ}:") + # bars = fetch_option_bars(occ, start, end, days, limit=10) + # save_df(pd.DataFrame(normalize_records(bars.data[occ])), f"{occ}_bars.parquet") + +if __name__ == "__main__": + main() diff --git a/src/fetchers/alpaca_api/merge/alpaca_features.py b/src/fetchers/alpaca_api/merge/alpaca_features.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/fetchers/alpaca_api/utils.py b/src/fetchers/alpaca_api/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..632ff0a59a019048f88c792ac435c166a98d8361 --- /dev/null +++ b/src/fetchers/alpaca_api/utils.py @@ -0,0 +1,83 @@ +# alpaca/utils.py + +import time +import functools +import logging +from datetime import datetime, timezone +from typing import Callable, Type, Tuple, Any + +# ----------------------------- +# Structured logger +# ----------------------------- +logger = logging.getLogger("alpaca") +logger.setLevel(logging.INFO) +handler = logging.StreamHandler() +formatter = logging.Formatter( + "%(asctime)s [%(levelname)s] %(name)s: %(message)s", + datefmt="%Y-%m-%dT%H:%M:%S%z", +) +handler.setFormatter(formatter) +if not logger.handlers: + logger.addHandler(handler) + + +# ----------------------------- +# Exponential back-off decorator +# ----------------------------- +def backoff( + max_retries: int = 5, + base_delay: float = 1.0, + factor: float = 2.0, + exceptions: Tuple[Type[BaseException], ...] = (Exception,), +) -> Callable: + """ + Decorator to retry a function with exponential back-off upon specified exceptions. + + :param max_retries: maximum number of retries before giving up + :param base_delay: initial delay between retries (in seconds) + :param factor: multiplier for delay on each retry + :param exceptions: tuple of exception classes that should trigger a retry + """ + def decorator(func: Callable) -> Callable: + @functools.wraps(func) + def wrapper(*args: Any, **kwargs: Any) -> Any: + retries = 0 + delay = base_delay + while True: + try: + return func(*args, **kwargs) + except exceptions as e: + if retries >= max_retries: + logger.error( + f"{func.__name__}: exceeded {max_retries} retries – giving up: {e}" + ) + raise + logger.warning( + f"{func.__name__}: error {e!r}, retrying in {delay:.1f}s " + f"(retry {retries + 1}/{max_retries})" + ) + time.sleep(delay) + retries += 1 + delay *= factor + return wrapper + return decorator + + +# ----------------------------- +# Time helpers +# ----------------------------- +def to_rfc3339(dt: datetime) -> str: + """ + Convert a datetime to an RFC 3339–formatted string. + If no tzinfo is present, UTC is assumed. + """ + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.isoformat() + + +def parse_rfc3339(timestamp: str) -> datetime: + """ + Parse an RFC 3339–formatted string into a datetime. 
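+
+    Illustrative example:
+        parse_rfc3339("2025-07-11T12:00:00+00:00")
+    Note: a trailing "Z" is only accepted by datetime.fromisoformat on Python 3.11+.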
+ """ + return datetime.fromisoformat(timestamp) diff --git a/src/fetchers/coindesk_client/asset_metadata.py b/src/fetchers/coindesk_client/asset_metadata.py new file mode 100644 index 0000000000000000000000000000000000000000..92c7d3c54edcf765235d7bfb49c419dcd1ff561d --- /dev/null +++ b/src/fetchers/coindesk_client/asset_metadata.py @@ -0,0 +1,26 @@ +""" +asset_metadata.py – Asset metadata endpoints for CoinDesk API client. + +- list_assets(): List all supported assets with basic metadata. +- get_asset_details(symbol): Fetch detailed metadata for a specific asset. +""" + +from client import BaseClient + +class AssetMetadataClient(BaseClient): + def list_assets(self): + """ + Get a list of all supported assets and their basic metadata. + + :return: JSON response containing assets list. + """ + return self._get("assets") + + def get_asset_details(self, symbol): + """ + Get detailed metadata for a specific asset. + + :param symbol: Asset symbol, e.g., "BTC" or "ETH". + :return: JSON response with asset details. + """ + return self._get(f"assets/{symbol}") diff --git a/src/fetchers/coindesk_client/client.py b/src/fetchers/coindesk_client/client.py new file mode 100644 index 0000000000000000000000000000000000000000..d5df5d4029a72d7da2b3f28fe5a27b31e5657ce1 --- /dev/null +++ b/src/fetchers/coindesk_client/client.py @@ -0,0 +1,218 @@ +""" +client.py – Base HTTP client for CoinDesk API. + +This module provides the BaseClient class that handles HTTP requests +to the CoinDesk API with proper authentication and error handling. +""" + +import requests +import json +from typing import Dict, Any, Optional +from urllib.parse import urljoin, urlencode +import config + + +class APIError(Exception): + """Custom exception for API errors.""" + def __init__(self, message: str, status_code: int = None, response: Any = None): + self.message = message + self.status_code = status_code + self.response = response + super().__init__(self.message) + + +class BaseClient: + """ + Base HTTP client for CoinDesk API requests. + + Handles authentication, request formatting, and error handling. + """ + + def __init__(self, base_url: str = None, headers: Dict[str, str] = None): + """ + Initialize the base client. + + Args: + base_url: Base URL for the API (defaults to config.BASE_URL) + headers: Default headers (defaults to config.HEADERS) + """ + self.base_url = base_url or config.BASE_URL + self.headers = headers or config.HEADERS.copy() + self.session = requests.Session() + self.session.headers.update(self.headers) + + def _make_request(self, method: str, endpoint: str, params: Dict[str, Any] = None, + data: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]: + """ + Make an HTTP request to the API. 
+ + Args: + method: HTTP method (GET, POST, PUT, DELETE) + endpoint: API endpoint path + params: URL parameters + data: Request body data + **kwargs: Additional arguments for requests + + Returns: + dict: JSON response from the API + + Raises: + APIError: If the request fails or returns an error status + """ + # Construct full URL + url = urljoin(self.base_url, endpoint.lstrip('/')) + + # Clean up parameters (remove None values) + if params: + params = {k: v for k, v in params.items() if v is not None} + + try: + # Make the request + response = self.session.request( + method=method, + url=url, + params=params, + json=data, + **kwargs + ) + + # Log the request for debugging + print(f"[DEBUG] {method} {url}") + if params: + print(f"[DEBUG] Params: {params}") + print(f"[DEBUG] Status: {response.status_code}") + + # Check if request was successful + if response.status_code == 200: + try: + return response.json() + except json.JSONDecodeError: + # If response is not JSON, return the text + return {"data": response.text, "status": "success"} + else: + # Handle different error status codes + error_message = f"API request failed with status {response.status_code}" + + try: + error_data = response.json() + if 'error' in error_data: + error_message = error_data['error'] + elif 'message' in error_data: + error_message = error_data['message'] + except json.JSONDecodeError: + error_message = f"{error_message}: {response.text}" + + raise APIError( + message=error_message, + status_code=response.status_code, + response=response + ) + + except requests.exceptions.RequestException as e: + raise APIError(f"Request failed: {str(e)}") + + def get(self, endpoint: str, params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]: + """ + Make a GET request. + + Args: + endpoint: API endpoint path + params: URL parameters + **kwargs: Additional arguments for requests + + Returns: + dict: JSON response from the API + """ + return self._make_request('GET', endpoint, params=params, **kwargs) + + def post(self, endpoint: str, data: Dict[str, Any] = None, + params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]: + """ + Make a POST request. + + Args: + endpoint: API endpoint path + data: Request body data + params: URL parameters + **kwargs: Additional arguments for requests + + Returns: + dict: JSON response from the API + """ + return self._make_request('POST', endpoint, params=params, data=data, **kwargs) + + def put(self, endpoint: str, data: Dict[str, Any] = None, + params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]: + """ + Make a PUT request. + + Args: + endpoint: API endpoint path + data: Request body data + params: URL parameters + **kwargs: Additional arguments for requests + + Returns: + dict: JSON response from the API + """ + return self._make_request('PUT', endpoint, params=params, data=data, **kwargs) + + def delete(self, endpoint: str, params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]: + """ + Make a DELETE request. 
+
+        Args:
+            endpoint: API endpoint path
+            params: URL parameters
+            **kwargs: Additional arguments for requests
+
+        Returns:
+            dict: JSON response from the API
+        """
+        return self._make_request('DELETE', endpoint, params=params, **kwargs)
+
+    # Compatibility alias: the endpoint modules (asset_metadata.py, derivatives.py,
+    # marketcap.py, onchain.py) call self._get(...); keep a thin wrapper around
+    # get() so those clients work against this BaseClient.
+    def _get(self, endpoint: str, params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]:
+        """Alias for get(), used by the endpoint client modules."""
+        return self.get(endpoint, params=params, **kwargs)
+
+    def close(self):
+        """Close the HTTP session."""
+        self.session.close()
+
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        """Context manager exit."""
+        self.close()
+
+
+# Convenience function to create a client instance
+def create_client(base_url: str = None, headers: Dict[str, str] = None) -> BaseClient:
+    """
+    Create a new BaseClient instance.
+
+    Args:
+        base_url: Base URL for the API
+        headers: Default headers
+
+    Returns:
+        BaseClient: Configured client instance
+    """
+    return BaseClient(base_url=base_url, headers=headers)
+
+
+# Test function to verify the client works
+def test_client():
+    """Test the base client functionality."""
+    try:
+        with create_client() as client:
+            # Test a simple endpoint (you might need to adjust this based on your API)
+            response = client.get("/index/cc/v1/markets")
+            print("Client test successful!")
+            print(f"Response keys: {list(response.keys()) if isinstance(response, dict) else 'Not a dict'}")
+            return True
+    except Exception as e:
+        print(f"Client test failed: {e}")
+        return False
+
+
+if __name__ == "__main__":
+    test_client()
\ No newline at end of file
diff --git a/src/fetchers/coindesk_client/coindesk_utils.py b/src/fetchers/coindesk_client/coindesk_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..6543eb4da1da75b7a0d49ce19c6db462673838b4
--- /dev/null
+++ b/src/fetchers/coindesk_client/coindesk_utils.py
@@ -0,0 +1,49 @@
+"""
+coindesk_utils.py – Utilities for saving, merging, and managing CoinDesk data as Parquet using StorageHandler.
+
+Features:
+- save_and_merge_parquet: Save new data, merge with existing Parquet, dedupe by date, keep N days.
+"""
+import os
+import pandas as pd
+from datetime import datetime, timedelta
+from src.data_cloud.cloud_utils import StorageHandler
+
+
+def save_and_merge_parquet(
+    storage: StorageHandler,
+    key: str,
+    new_data: pd.DataFrame,
+    date_col: str = "timestamp",
+    days: int = 7,
+    content_type: str = "application/octet-stream",
+):
+    """
+    Save new_data as Parquet, merging with existing file by date_col, keeping only the last N days.
+ - storage: StorageHandler instance + - key: storage key (e.g., 'coindesk/spot_markets.parquet') + - new_data: DataFrame to save + - date_col: column to use for date filtering (must be datetime-like) + - days: keep only this many days of data + - content_type: MIME type for Parquet + """ + # Try to load existing data + try: + existing_bytes = storage.download(key) + df_old = pd.read_parquet(pd.io.common.BytesIO(existing_bytes)) + except Exception: + df_old = pd.DataFrame() + + # Combine and dedupe + df_all = pd.concat([df_old, new_data], ignore_index=True) + if date_col in df_all.columns: + df_all[date_col] = pd.to_datetime(df_all[date_col], errors="coerce") + cutoff = datetime.utcnow() - timedelta(days=days) + df_all = df_all[df_all[date_col] >= cutoff] + df_all = df_all.sort_values(date_col).drop_duplicates() + + # Save merged Parquet + buf = pd.io.common.BytesIO() + df_all.to_parquet(buf, index=False) + storage.upload(key, buf.getvalue(), content_type=content_type) + return df_all diff --git a/src/fetchers/coindesk_client/config.py b/src/fetchers/coindesk_client/config.py new file mode 100644 index 0000000000000000000000000000000000000000..e77b522e5180ecd9efeb3c9f9de5a8d7c4fa2c11 --- /dev/null +++ b/src/fetchers/coindesk_client/config.py @@ -0,0 +1,30 @@ +""" +config.py – Configuration and secrets for CoinDesk API client. + +- Defines API_KEY, BASE_URL, and optional TIMEZONE constants +- Loads environment variables securely (e.g., via python-dotenv) +- Configures default headers (Authorization, Content-Type) +""" + +import os +from dotenv import load_dotenv + +load_dotenv() + +API_KEY = os.getenv("COINDESK_API_KEY") +BASE_URL = os.getenv("COINDESK_BASE_URL", "https://data-api.coindesk.com/").rstrip('/') +TIMEZONE = os.getenv("COINDESK_TIMEZONE", "UTC") + +# Flexible parameters for data collection +MARKET = os.getenv("COINDESK_MARKET", "binance") +SYMBOL = os.getenv("COINDESK_SYMBOL", "BTC-USD") +INSTRUMENTS = os.getenv("COINDESK_INSTRUMENTS", "BTC-USD").split(",") +DAYS = int(os.getenv("COINDESK_DAYS_OLD", 7)) +FUTURES_LIMIT = int(os.getenv("COINDESK_FUTURES_LIMIT", 50)) +SENTIMENT_LIMIT = int(os.getenv("COINDESK_SENTIMENT_LIMIT", 50)) +BLOCK_NUMBER = int(os.getenv("COINDESK_BLOCK_NUMBER", 100000)) + +HEADERS = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json" +} diff --git a/src/fetchers/coindesk_client/d.txt b/src/fetchers/coindesk_client/d.txt new file mode 100644 index 0000000000000000000000000000000000000000..98ff7af86c9fc478c85d0a4174e1a2a42dd0adda --- /dev/null +++ b/src/fetchers/coindesk_client/d.txt @@ -0,0 +1,12 @@ +Latest Tick:/index/cc/v1/latest/tick?market=cadli&instruments=BTC-USD,ETH-USD&apply_mapping=true +Historical OHLCV+:/index/cc/v1/historical/days?market=cadli&instrument=BTC-USD&limit=30&aggregate=1&fill=true&apply_mapping=true&response_format=JSON +DA Fixings:/index/cc/v1/historical/days/ccda?instrument=BTC-USD&timezone=Europe/London&date=2023-10-30&close_time=16:00&limit=5&response_format=JSON +Index Updates:/index/cc/v2/historical/messages/hour?market=cadli&instrument=BTC-USD&hour_ts=1701176400&apply_mapping=true&response_format=JSON +Index Composition:/index/cc/v1/historical/days/composition?market=cd_mc&instrument=CD20-USD&timezone=Europe/London&date=2025-05-09&close_time=16:00&limit=5&response_format=JSON +Instrument Metadata:/index/cc/v1/latest/instrument/metadata?market=cadli&instruments=BTC-USD,ETH-USD&apply_mapping=true +Markets:/index/cc/v1/markets?market=cadli +Markets + 
Instruments:/index/cc/v1/markets/instruments?market=cadli&instruments=BTC-USD,ETH-USD&instrument_status=ACTIVE +Forex Rates: /index/cc/v1/latest/tick/forex?instruments=GBP-USD,MYR-USD +EOD Markets + Instruments: /index/cc/v1/markets/instruments/unmapped/eod?market=cdifti&instruments=BTIUSF-USD&instrument_status=ACTIVE +EOD Historical OHLCV+ Day:/index/cc/v1/historical/days/eod?market=cdifti&instrument=BTIUSF-USD&limit=5&response_format=JSON +Index Reconstitution: /index/cc/v1/reconstitution?market=cd_mc&instrument=CD20-USD \ No newline at end of file diff --git a/src/fetchers/coindesk_client/derivatives.py b/src/fetchers/coindesk_client/derivatives.py new file mode 100644 index 0000000000000000000000000000000000000000..2a7f7ffc7ffe0d15153eae68d769856c647104b8 --- /dev/null +++ b/src/fetchers/coindesk_client/derivatives.py @@ -0,0 +1,68 @@ +""" +derivatives.py – Derivatives endpoints for CoinDesk API client. + +- list_markets(): List all available derivatives markets. +- get_latest_futures(symbol=None): Fetch the latest futures data, optionally for a symbol. +- get_futures_historical(days, limit=None): Retrieve futures historical data over N days. +- list_options(symbol=None): List available options or option chain for a given asset. +- get_options_historical(symbol, start, end=None, limit=None): Fetch options historical data over a timeframe. +""" + +from client import BaseClient + +class DerivativesClient(BaseClient): + def list_markets(self): + """ + List all available derivatives markets. + """ + return self._get("derivatives/markets") + + def get_latest_futures(self, symbol=None): + """ + Get the most recent futures data. If `symbol` is provided, returns data for that symbol. + + :param symbol: Futures symbol, e.g., "BTC-USD" (optional). + """ + path = "derivatives/futures" + if symbol: + path += f"/{symbol}" + return self._get(path) + + def get_futures_historical(self, days, limit=None): + """ + Fetch historical futures data for the past `days` days. + + :param days: Number of days of history to retrieve. + :param limit: Maximum number of records to return (optional). + """ + params = {"days": days} + if limit is not None: + params["limit"] = limit + return self._get("derivatives/futures/historical", params=params) + + def list_options(self, symbol=None): + """ + List all available options or get the option chain for a symbol. + + :param symbol: Asset symbol for option chain, e.g., "BTC-USD" (optional). + """ + path = "derivatives/options" + if symbol: + path += f"/{symbol}" + return self._get(path) + + def get_options_historical(self, symbol, start, end=None, limit=None): + """ + Fetch historical options data for a symbol over a timeframe. + + :param symbol: Asset symbol, e.g., "BTC-USD". + :param start: ISO8601 start datetime string. + :param end: ISO8601 end datetime string (optional). + :param limit: Maximum number of records to return (optional). + """ + params = {"start": start} + if end: + params["end"] = end + if limit is not None: + params["limit"] = limit + return self._get(f"derivatives/options/{symbol}/historical", params=params) diff --git a/src/fetchers/coindesk_client/doc.txt b/src/fetchers/coindesk_client/doc.txt new file mode 100644 index 0000000000000000000000000000000000000000..b60272b10a7b4123222aac9dcea4d53edeeac522 --- /dev/null +++ b/src/fetchers/coindesk_client/doc.txt @@ -0,0 +1,122 @@ +Below is the complete sidebar navigation structure under Data API → Introduction, with each endpoint’s link text and URL path. 
+ +## Introduction + +* [Introduction](https://developers.coindesk.com/documentation/data-api/introduction) + +## Indices & Ref. Rates + +* [Latest Tick](https://developers.coindesk.com/documentation/data-api/index_cc_v1_latest_tick) +* [Historical OHLCV+](https://developers.coindesk.com/documentation/data-api/index_cc_v1_historical_days) +* [DA Fixings](https://developers.coindesk.com/documentation/data-api/index_cc_v1_historical_days_ccda) +* [Index Updates](https://developers.coindesk.com/documentation/data-api/index_cc_v2_historical_messages_hour) +* [Index Composition](https://developers.coindesk.com/documentation/data-api/index_cc_v1_historical_days_composition) +* [Instrument Metadata](https://developers.coindesk.com/documentation/data-api/index_cc_v1_latest_instrument_metadata) +* [Markets](https://developers.coindesk.com/documentation/data-api/index_cc_v1_markets) +* [Markets + Instruments](https://developers.coindesk.com/documentation/data-api/index_cc_v1_markets_instruments) +* [Forex Rates](https://developers.coindesk.com/documentation/data-api/index_cc_v1_latest_tick_forex) +* [EOD Markets + Instruments](https://developers.coindesk.com/documentation/data-api/index_cc_v1_markets_instruments_unmapped_eod) +* [EOD Historical OHLCV+ Day](https://developers.coindesk.com/documentation/data-api/index_cc_v1_historical_days_eod) +* [Index Reconstitution](https://developers.coindesk.com/documentation/data-api/index_v1_reconstitution) + +## Spot + +* [Latest Tick](https://developers.coindesk.com/documentation/data-api/spot_v1_latest_tick) +* [Historical OHLCV+](https://developers.coindesk.com/documentation/data-api/spot_v1_historical_days) +* [Trades](https://developers.coindesk.com/documentation/data-api/spot_v2_historical_trades_hour) +* [Order Book](https://developers.coindesk.com/documentation/data-api/spot_v1_historical_orderbook_l2_metrics_minute) +* [Instrument Metadata](https://developers.coindesk.com/documentation/data-api/spot_v1_latest_instrument_metadata) +* [Markets](https://developers.coindesk.com/documentation/data-api/spot_v1_markets) +* [Markets + Instruments](https://developers.coindesk.com/documentation/data-api/spot_v1_markets_instruments) + +## Futures + +* [Latest Tick](https://developers.coindesk.com/documentation/data-api/futures_v1_latest_tick) +* [Historical OHLCV+](https://developers.coindesk.com/documentation/data-api/futures_v1_historical_days) +* [Trades](https://developers.coindesk.com/documentation/data-api/futures_v2_historical_trades_hour) +* [Order Book](https://developers.coindesk.com/documentation/data-api/futures_v2_historical_orderbook_l2_metrics_minute) +* [Latest Tick (OI)](https://developers.coindesk.com/documentation/data-api/futures_v1_latest_open_interest_tick) +* [Historical OHLC+ (OI)](https://developers.coindesk.com/documentation/data-api/futures_v1_historical_open_interest_days) +* [Updates (OI)](https://developers.coindesk.com/documentation/data-api/futures_v2_historical_open_interest_messages_hour) +* [Latest Tick (FR)](https://developers.coindesk.com/documentation/data-api/futures_v1_latest_funding_rate_tick) +* [Historical OHLC+ (FR)](https://developers.coindesk.com/documentation/data-api/futures_v1_historical_funding_rate_days) +* [Updates (FR)](https://developers.coindesk.com/documentation/data-api/futures_v2_historical_funding_rate_messages_hour) +* [Instrument Metadata](https://developers.coindesk.com/documentation/data-api/futures_v1_latest_instrument_metadata) +* 
[Markets](https://developers.coindesk.com/documentation/data-api/futures_v1_markets) +* [Markets + Instruments](https://developers.coindesk.com/documentation/data-api/futures_v1_markets_instruments) + +## Options + +* [Latest Tick](https://developers.coindesk.com/documentation/data-api/options_v1_latest_tick) +* [Historical OHLCV+](https://developers.coindesk.com/documentation/data-api/options_v1_historical_days) +* [Trades](https://developers.coindesk.com/documentation/data-api/options_v2_historical_trades_hour) +* [Order Book](https://developers.coindesk.com/documentation/data-api/options_v1_historical_orderbook_l2_metrics_minute) +* [Latest Tick (OI)](https://developers.coindesk.com/documentation/data-api/options_v1_latest_open_interest_tick) +* [Historical OHLC+ (OI)](https://developers.coindesk.com/documentation/data-api/options_v1_historical_open_interest_days) +* [Updates (OI)](https://developers.coindesk.com/documentation/data-api/options_v2_historical_open_interest_messages_hour) +* [Instrument Metadata](https://developers.coindesk.com/documentation/data-api/options_v1_latest_instrument_metadata) +* [Markets](https://developers.coindesk.com/documentation/data-api/options_v1_markets) +* [Markets + Instruments](https://developers.coindesk.com/documentation/data-api/options_v1_markets_instruments) + +## Derivatives Indices + +* [Latest Tick](https://developers.coindesk.com/documentation/data-api/index_v1_latest_tick) +* [Historical OHLC+](https://developers.coindesk.com/documentation/data-api/index_v1_historical_days) +* [Index Updates](https://developers.coindesk.com/documentation/data-api/index_v2_historical_messages_hour) +* [Instrument Metadata](https://developers.coindesk.com/documentation/data-api/index_v1_latest_instrument_metadata) +* [Markets](https://developers.coindesk.com/documentation/data-api/index_v1_markets) +* [Markets + Instruments](https://developers.coindesk.com/documentation/data-api/index_v1_markets_instruments) + +## On-Chain DEX + +* [Latest Tick (Swap)](https://developers.coindesk.com/documentation/data-api/onchain_v1_amm_latest_swap_tick) +* [Historical OHLCV+ (Swap)](https://developers.coindesk.com/documentation/data-api/onchain_v1_amm_historical_swap_days) +* [Swaps](https://developers.coindesk.com/documentation/data-api/onchain_v2_amm_historical_swap_messages_hour) +* [Liquidity Updates](https://developers.coindesk.com/documentation/data-api/onchain_v2_amm_historical_liquidity_update_messages_hour) +* [Instrument Metadata](https://developers.coindesk.com/documentation/data-api/onchain_v1_amm_latest_instrument_metadata) +* [Markets](https://developers.coindesk.com/documentation/data-api/onchain_v1_amm_markets) +* [Markets + Instruments](https://developers.coindesk.com/documentation/data-api/onchain_v1_amm_markets_instruments) + +## On-Chain Core + +* [ETH Blocks](https://developers.coindesk.com/documentation/data-api/onchain_v1_block_2) +* [BSC Blocks](https://developers.coindesk.com/documentation/data-api/onchain_v1_block_8) +* [BTC Blocks](https://developers.coindesk.com/documentation/data-api/onchain_v1_block_1) +* [BASE Blocks](https://developers.coindesk.com/documentation/data-api/onchain_v1_block_2410) +* [ARB Blocks](https://developers.coindesk.com/documentation/data-api/onchain_v1_block_808) +* [ETH Address](https://developers.coindesk.com/documentation/data-api/onchain_v1_address_metadata_2) +* [Assets By Chain](https://developers.coindesk.com/documentation/data-api/onchain_v3_summary_by_chain) +* [Asset By 
Address](https://developers.coindesk.com/documentation/data-api/onchain_v2_data_by_address)
+* [Historical Supply](https://developers.coindesk.com/documentation/data-api/onchain_v2_historical_supply_days)
+
+## Asset
+
+* [Metadata](https://developers.coindesk.com/documentation/data-api/asset_v2_metadata)
+* [Top List](https://developers.coindesk.com/documentation/data-api/asset_v1_top_list)
+* [Search](https://developers.coindesk.com/documentation/data-api/asset_v1_search)
+* [Summary List](https://developers.coindesk.com/documentation/data-api/asset_v1_summary_list)
+* [Events](https://developers.coindesk.com/documentation/data-api/asset_v1_events)
+* [Historical Social](https://developers.coindesk.com/documentation/data-api/asset_v1_historical_code_repository_days)
+
+## News
+
+* [Latest Articles](https://developers.coindesk.com/documentation/data-api/news_v1_article_list)
+* [Sources](https://developers.coindesk.com/documentation/data-api/news_v1_source_list)
+* [Categories](https://developers.coindesk.com/documentation/data-api/news_v1_category_list)
+* [Single Article](https://developers.coindesk.com/documentation/data-api/news_v1_article_get)
+* [Search](https://developers.coindesk.com/documentation/data-api/news_v1_search)
+
+## Overview
+
+* [MktCap Latest Tick](https://developers.coindesk.com/documentation/data-api/overview_v1_latest_marketcap_all_tick)
+* [MktCap Historical OHLCV](https://developers.coindesk.com/documentation/data-api/overview_v1_historical_marketcap_all_assets_days)
+
+## Utilities
+
+* [Rate Limit Verification](https://developers.coindesk.com/documentation/data-api/admin_v2_rate_limit)
+* [Version](https://developers.coindesk.com/documentation/data-api/info_v1_version)
+* [OpenAPI](https://developers.coindesk.com/documentation/data-api/info_v1_openapi)
+
+## Deprecated
+
+*(toggle to view deprecated endpoints)*
diff --git a/src/fetchers/coindesk_client/index.py b/src/fetchers/coindesk_client/index.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ea695c28a4a0de0ac9162ebc3f90c2d0c761e4
--- /dev/null
+++ b/src/fetchers/coindesk_client/index.py
@@ -0,0 +1,552 @@
+"""
+Optimized CoinDesk API Client with Smart Market Discovery and Endpoint Compatibility
+Enhanced version with improved error handling and market validation
+"""
+
+import argparse
+import json
+import os
+from client import BaseClient, APIError
+from typing import Union, List, Optional, Dict, Tuple
+from datetime import datetime, timedelta
+import time
+from dataclasses import dataclass
+from enum import Enum
+import sys
+
+def safe_print(*args, **kwargs):
+    """Prints unicode safely even if the terminal encoding is not UTF-8."""
+    text = " ".join(str(arg) for arg in args)
+    try:
+        sys.stdout.buffer.write((text + '\n').encode('utf-8', errors='replace'))
+    except Exception:
+        # Fallback to plain print if all else fails
+        print(text.encode('ascii', errors='replace').decode('ascii'), **kwargs)
+
+import config
+
+import pathlib
+
+def ensure_data_dir():
+    """Ensure the data directory exists."""
+    data_dir = pathlib.Path("data/coindesk/indexes")
+    data_dir.mkdir(parents=True, exist_ok=True)
+    return data_dir
+
+def save_json_result(filename: str, data: dict):
+    """Save data as JSON to the data/coindesk/indexes directory."""
+    data_dir = ensure_data_dir()
+    file_path = data_dir / filename
+    with open(file_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+
+class EndpointStatus(Enum):
+    SUPPORTED = "supported"
+    UNSUPPORTED = "unsupported"
+    UNKNOWN
= "unknown" + +@dataclass +class MarketInfo: + """Market information with endpoint compatibility""" + market_id: str + name: str + endpoints: Dict[str, EndpointStatus] + instruments: List[str] + last_checked: datetime + +class IndexClient(BaseClient): + """ + Enhanced Index & Reference Rates endpoints for CoinDesk Data API. + Includes smart market discovery and endpoint compatibility checking. + """ + + def __init__(self): + super().__init__() + self._market_cache = {} + self._endpoint_compatibility = {} + + def list_markets(self) -> dict: + """List all available markets (index families).""" + return self.get("/index/cc/v1/markets") + + def list_markets_instruments(self, + market: str, + instruments: Optional[List[str]] = None, + instrument_status: str = "ACTIVE" + ) -> dict: + """ + List instruments for a given market. If instruments is None, + retrieves *all* mapped instruments from the API. + """ + params = { + "market": market, + "instrument_status": instrument_status + } + if instruments: + params["instruments"] = ",".join(instruments) + return self.get("/index/cc/v1/markets/instruments", params=params) + + def get_latest_tick(self, market: str, instruments: List[str], + apply_mapping: bool = True) -> dict: + """ + Latest OHLCV+ tick data. + + Args: + market: Index family identifier (e.g., 'sda', 'cdifti') + instruments: List of instrument tickers (e.g., ['XBX-USD', 'ETX-USD']) + apply_mapping: Whether to apply instrument mapping + """ + if not instruments: + raise ValueError("The 'instruments' parameter is required") + + params = { + 'market': market, + 'instruments': ','.join(instruments), + 'apply_mapping': str(apply_mapping).lower() + } + + return self.get("/index/cc/v1/latest/tick", params=params) + + def get_historical_days(self, market: str, instrument: str, limit: int = 30, + aggregate: int = 1, fill: bool = True, + apply_mapping: bool = True, response_format: str = "JSON") -> dict: + """ + Historical OHLCV+ by day. + + Args: + market: Index family identifier (e.g., 'sda', 'cdifti') + instrument: Single instrument ticker (e.g., 'XBX-USD') + limit: Number of days to retrieve + aggregate: Aggregation period + fill: Whether to fill missing data + apply_mapping: Whether to apply instrument mapping + response_format: Response format + """ + if not instrument: + raise ValueError("The 'instrument' parameter is required") + + params = { + 'market': market, + 'instrument': instrument, + 'limit': limit, + 'aggregate': aggregate, + 'fill': str(fill).lower(), + 'apply_mapping': str(apply_mapping).lower(), + 'response_format': response_format + } + + return self.get("/index/cc/v1/historical/days", params=params) + + def get_latest_instrument_metadata(self, market: str, instruments: List[str], + apply_mapping: bool = True) -> dict: + """ + Latest instrument metadata. + + Args: + market: Index family identifier (e.g., 'sda', 'cdifti') + instruments: List of instrument tickers (e.g., ['XBX-USD', 'ETX-USD']) + apply_mapping: Whether to apply instrument mapping + """ + if not instruments: + raise ValueError("The 'instruments' parameter is required") + + params = { + 'market': market, + 'instruments': ','.join(instruments), + 'apply_mapping': str(apply_mapping).lower() + } + + return self.get("/index/cc/v1/latest/instrument/metadata", params=params) + + def list_eod_markets_instruments(self, market: str, instruments: List[str] = None, + instrument_status: str = "ACTIVE") -> dict: + """ + List EOD (unmapped) instruments - most reliable for instrument discovery. 
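+
+        Illustrative call (market and instrument mirror the EOD example in d.txt;
+        a valid COINDESK_API_KEY is assumed):
+
+            >>> client = IndexClient()
+            >>> resp = client.list_eod_markets_instruments(market="cdifti", instruments=["BTIUSF-USD"])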
+ + Args: + market: Index family identifier (e.g., 'cdifti') + instruments: Optional list of instruments to filter + instrument_status: Status filter (default: 'ACTIVE') + """ + params = { + 'market': market, + 'instrument_status': instrument_status + } + if instruments: + params['instruments'] = ','.join(instruments) + + return self.get("/index/cc/v1/markets/instruments/unmapped/eod", params=params) + + def get_historical_days_eod(self, market: str, instrument: str, limit: int = 5, + response_format: str = "JSON") -> dict: + """ + EOD historical OHLCV+ by day. + + Args: + market: Index family identifier (e.g., 'cdifti') + instrument: Single instrument ticker + limit: Number of days to retrieve + response_format: Response format + """ + params = { + 'market': market, + 'instrument': instrument, + 'limit': limit, + 'response_format': response_format + } + + return self.get("/index/cc/v1/historical/days/eod", params=params) + + def check_endpoint_compatibility(self, market: str) -> Dict[str, EndpointStatus]: + """ + Check which endpoints are supported for a specific market. + + Args: + market: Market identifier to check + + Returns: + Dictionary mapping endpoint names to their support status + """ + if market in self._endpoint_compatibility: + return self._endpoint_compatibility[market] + + endpoints = {} + test_instruments = ["BTC-USD", "ETH-USD", "XBX-USD"] # Common test instruments + + # Test EOD instruments endpoint + try: + self.list_eod_markets_instruments(market=market) + endpoints["eod_instruments"] = EndpointStatus.SUPPORTED + except APIError as e: + endpoints["eod_instruments"] = EndpointStatus.UNSUPPORTED if e.status_code == 400 else EndpointStatus.UNKNOWN + except Exception: + endpoints["eod_instruments"] = EndpointStatus.UNKNOWN + + # Test mapped instruments endpoint (requires valid instruments) + try: + # First try to get some instruments + instruments = self.discover_instruments_for_market(market, silent=True) + if instruments: + self.list_markets_instruments(market=market, instruments=instruments[:2]) + endpoints["mapped_instruments"] = EndpointStatus.SUPPORTED + else: + endpoints["mapped_instruments"] = EndpointStatus.UNKNOWN + except APIError as e: + endpoints["mapped_instruments"] = EndpointStatus.UNSUPPORTED if e.status_code == 400 else EndpointStatus.UNKNOWN + except Exception: + endpoints["mapped_instruments"] = EndpointStatus.UNKNOWN + + # Test tick data endpoint + try: + instruments = self.discover_instruments_for_market(market, silent=True) + if instruments: + self.get_latest_tick(market=market, instruments=instruments[:2]) + endpoints["tick_data"] = EndpointStatus.SUPPORTED + else: + endpoints["tick_data"] = EndpointStatus.UNKNOWN + except APIError as e: + endpoints["tick_data"] = EndpointStatus.UNSUPPORTED if e.status_code in [400, 404] else EndpointStatus.UNKNOWN + except Exception: + endpoints["tick_data"] = EndpointStatus.UNKNOWN + + # Test historical data endpoint + try: + instruments = self.discover_instruments_for_market(market, silent=True) + if instruments: + self.get_historical_days(market=market, instrument=instruments[0], limit=1) + endpoints["historical_data"] = EndpointStatus.SUPPORTED + else: + endpoints["historical_data"] = EndpointStatus.UNKNOWN + except APIError as e: + endpoints["historical_data"] = EndpointStatus.UNSUPPORTED if e.status_code in [400, 404] else EndpointStatus.UNKNOWN + except Exception: + endpoints["historical_data"] = EndpointStatus.UNKNOWN + + # Test metadata endpoint + try: + instruments = 
self.discover_instruments_for_market(market, silent=True) + if instruments: + self.get_latest_instrument_metadata(market=market, instruments=instruments[:2]) + endpoints["metadata"] = EndpointStatus.SUPPORTED + else: + endpoints["metadata"] = EndpointStatus.UNKNOWN + except APIError as e: + endpoints["metadata"] = EndpointStatus.UNSUPPORTED if e.status_code in [400, 404] else EndpointStatus.UNKNOWN + except Exception: + endpoints["metadata"] = EndpointStatus.UNKNOWN + + self._endpoint_compatibility[market] = endpoints + return endpoints + + def discover_markets_with_compatibility(self) -> List[MarketInfo]: + """ + Discover all markets with their endpoint compatibility and instruments. + + Returns: + List of MarketInfo objects with full compatibility information + """ + safe_print("🔍 Discovering markets with endpoint compatibility...") + + try: + resp = self.list_markets() + raw_markets = resp.get('Data', []) + + if not raw_markets: + safe_print("❌ No markets found in API response") + return [] + + market_infos = [] + + for entry in raw_markets: + if isinstance(entry, dict): + market_id = entry.get('market') + market_name = entry.get('name') + else: + market_id = entry + market_name = None + + if not market_id: + continue + + safe_print(f"\n📊 Analyzing market: {market_id} ({market_name or 'Unknown'})") + + # Check endpoint compatibility + endpoints = self.check_endpoint_compatibility(market_id) + + # Get instruments if possible + instruments = self.discover_instruments_for_market(market_id) + + # Create market info + market_info = MarketInfo( + market_id=market_id, + name=market_name or market_id, + endpoints=endpoints, + instruments=instruments, + last_checked=datetime.now() + ) + + market_infos.append(market_info) + + # Print compatibility summary + supported_count = sum(1 for status in endpoints.values() if status == EndpointStatus.SUPPORTED) + total_count = len(endpoints) + safe_print(f" ✅ Supported endpoints: {supported_count}/{total_count}") + safe_print(f" 🔧 Available instruments: {len(instruments)}") + + return market_infos + + except Exception as e: + safe_print(f"❌ Error discovering markets: {e}") + return [] + + def discover_instruments_for_market(self, market: str, silent: bool = False) -> List[str]: + """ + Discover available instruments for a specific market using multiple approaches. 
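+
+        Illustrative usage (a valid COINDESK_API_KEY is assumed; the market id is
+        one of the index families used elsewhere in this module):
+
+            >>> client = IndexClient()
+            >>> instruments = client.discover_instruments_for_market("cadli", silent=True)
+            >>> isinstance(instruments, list)
+            True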
+ + Args: + market: Market identifier (e.g., 'sda', 'cdifti') + silent: If True, suppress output messages + + Returns: + List of available instrument tickers + """ + if not silent: + safe_print(f"🔍 Discovering instruments for market '{market}'...") + + # 1) EOD endpoint + try: + eod = self.list_eod_markets_instruments(market=market) + data = eod.get("Data", {}).get(market, {}).get("instruments", {}) + if data: + instruments = list(data.keys()) + if not silent: + safe_print(f" ✅ {len(instruments)} via EOD") + return instruments + except Exception as e: + if not silent: + safe_print(f" ⚠️ EOD failed: {e}") + + # 2) Metadata fallback + common = ["BTC-USD", "ETH-USD", "XBX-USD", "ETX-USD"] + try: + meta = self.get_latest_instrument_metadata(market, common) + if meta.get("Data"): + instruments = list(meta["Data"].keys()) + if not silent: + safe_print(f" ✅ {len(instruments)} via metadata") + return instruments + except Exception as e: + if not silent: + safe_print(f" ⚠️ Metadata failed: {e}") + + # 3) General mapped instruments fallback + try: + mapped = self.list_markets_instruments(market=market) + data = mapped.get("Data", {}).get(market, {}) + if data: + instruments = list(data.keys()) + if not silent: + safe_print(f" ✅ {len(instruments)} via general mapped endpoint") + return instruments + except Exception as e: + if not silent: + safe_print(f" ⚠️ General mapped failed: {e}") + + if not silent: + safe_print(f" ❌ No instruments for {market}") + return [] + + def get_market_summary(self, market: str) -> Dict: + """ + Get a comprehensive summary of a market's capabilities. + + Args: + market: Market identifier + + Returns: + Dictionary with market summary information + """ + endpoints = self.check_endpoint_compatibility(market) + instruments = self.discover_instruments_for_market(market, silent=True) + + supported_endpoints = [name for name, status in endpoints.items() if status == EndpointStatus.SUPPORTED] + + return { + "market_id": market, + "total_instruments": len(instruments), + "sample_instruments": instruments[:5], + "supported_endpoints": supported_endpoints, + "endpoint_details": endpoints, + "is_functional": len(supported_endpoints) > 0 and len(instruments) > 0 + } + + +def test_market_comprehensively(client: IndexClient, market: str): + """ + Run comprehensive tests on a market with smart endpoint selection. 
+
+    Args:
+        client: IndexClient instance
+        market: Market identifier to test
+    """
+    safe_print(f"\n{'='*60}")
+    safe_print(f"🧪 COMPREHENSIVE MARKET TEST: {market}")
+    safe_print(f"{'='*60}")
+
+    # Get market summary
+    summary = client.get_market_summary(market)
+
+    safe_print(f"📊 Market Summary:")
+    safe_print(f" Market ID: {summary['market_id']}")
+    safe_print(f" Total Instruments: {summary['total_instruments']}")
+    safe_print(f" Functional: {'✅' if summary['is_functional'] else '❌'}")
+    safe_print(f" Supported Endpoints: {', '.join(summary['supported_endpoints'])}")
+
+    if not summary['is_functional']:
+        safe_print("⚠️ Market is not functional - skipping detailed tests")
+        return
+
+    instruments = summary['sample_instruments'][:3]  # Use first 3 for testing
+    safe_print(f"🔧 Testing with instruments: {instruments}")
+
+    # Test each supported endpoint
+    endpoint_tests = {
+        "eod_instruments": lambda: client.list_eod_markets_instruments(market=market),
+        "mapped_instruments": lambda: client.list_markets_instruments(market=market, instruments=instruments),
+        "tick_data": lambda: client.get_latest_tick(market=market, instruments=instruments),
+        "historical_data": lambda: client.get_historical_days(market=market, instrument=instruments[0], limit=3),
+        "metadata": lambda: client.get_latest_instrument_metadata(market=market, instruments=instruments)
+    }
+
+    results = {}
+
+    for endpoint_name, test_func in endpoint_tests.items():
+        if endpoint_name in summary['supported_endpoints']:
+            safe_print(f"\n🧪 Testing {endpoint_name}...")
+            try:
+                response = test_func()
+                data_count = len(response.get('Data', []))
+                results[endpoint_name] = "✅ SUCCESS"
+                safe_print(f" ✅ SUCCESS - Retrieved {data_count} data points")
+                safe_print(f" 📋 Response keys: {list(response.keys())}")
+            except Exception as e:
+                results[endpoint_name] = f"❌ FAILED: {str(e)[:100]}"
+                safe_print(f" ❌ FAILED: {str(e)[:100]}")
+        else:
+            results[endpoint_name] = "⏭️ SKIPPED (unsupported)"
+            safe_print(f"\n⏭️ Skipping {endpoint_name} (unsupported)")
+
+    # Print test summary
+    safe_print(f"\n📋 Test Results Summary:")
+    for endpoint, result in results.items():
+        safe_print(f" {endpoint}: {result}")
+
+    safe_print(f"\n{'='*60}")
+
+
+
+def fetch_all_functional_markets():
+    """
+    Fetch latest tick and 30-day history for BTC-USD, SOL-USD, ETH-USD
+    across all functional markets.
+    Save results in data/coindesk/indexes.
+ """ + import config + from client import APIError + + if not config.API_KEY: + safe_print("❌ Error: COINDESK_API_KEY not set.") + return + + client = IndexClient() + safe_print("🚀 Fetching data for all functional markets and BTC/SOL/ETH...") + + markets = [ + "cadli", "ccix", "ccxrp", "ccxrpperp", + "cd_mc", "cdi_b", "cdi_mda", "cdor", "sda" + ] + instruments = ["BTC-USD", "SOL-USD", "ETH-USD"] + + for m in markets: + safe_print(f"\n📊 Market: {m}") + market_results = {} + for inst in instruments: + # Latest tick + try: + tick = client.get_latest_tick(market=m, instruments=[inst]) + data = tick.get("Data", {}).get(inst, {}) + safe_print(f" 🔸 {inst} latest price: {data.get('price', 'n/a')}") + market_results[f"{inst}_latest_tick"] = tick + except APIError as e: + safe_print(f" ⚠️ {inst} tick failed (status {e.status_code})") + market_results[f"{inst}_latest_tick"] = {"error": f"APIError {e.status_code}"} + except Exception as e: + safe_print(f" ⚠️ {inst} tick error: {e}") + market_results[f"{inst}_latest_tick"] = {"error": str(e)} + + # 30-day historical + try: + hist = client.get_historical_days( + market=m, + instrument=inst, + limit=30, + aggregate=1, + fill=True + ) + days = hist.get("Data", {}).get("values", []) + safe_print(f" • {len(days)} days of history (first: {days[0] if days else 'n/a'})") + market_results[f"{inst}_history"] = hist + except APIError as e: + safe_print(f" ⚠️ {inst} history failed (status {e.status_code})") + market_results[f"{inst}_history"] = {"error": f"APIError {e.status_code}"} + except Exception as e: + safe_print(f" ⚠️ {inst} history error: {e}") + market_results[f"{inst}_history"] = {"error": str(e)} + + # Save results for this market + save_json_result(f"{m}.json", market_results) + +if __name__ == "__main__": + # main() + # To run the fetch-all-markets script, uncomment below: + fetch_all_functional_markets() \ No newline at end of file diff --git a/src/fetchers/coindesk_client/main.py b/src/fetchers/coindesk_client/main.py new file mode 100644 index 0000000000000000000000000000000000000000..cdeffba68ea29f6538615c84a4c56947ac149044 --- /dev/null +++ b/src/fetchers/coindesk_client/main.py @@ -0,0 +1,360 @@ +""" +main.py – Fetch CoinDesk On-Chain **and** AMM (Uniswap‑style) data +================================================================= +Patched 2025‑07‑13 +------------------ +* **Fixed** positional/keyword mismatch for `get_block`. +* **Flatten + sanitize** CoinDesk AMM responses so Parquet writes succeed. +* **Direct overwrite** for list/dict‑rich endpoints to prevent merge type errors. 
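+
+Typical invocation (flags are defined by the argparse parser at the bottom of this
+module; the values shown are simply the module defaults):
+
+    python main.py --symbols "ETH-0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2" \
+        --days 7 --amm-market uniswapv2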
+""" +from __future__ import annotations + +import sys +import os +import argparse +import logging +import datetime as _dt +import json as _json +from typing import List, Optional, Any, Dict + +from dotenv import load_dotenv +import pandas as pd + +# --------------------------------------------------------------------------- +# Tier-locked endpoint skip flag +# --------------------------------------------------------------------------- +SKIP_TIER_LOCKED = os.getenv("COINDESK_SKIP_TIER_LOCKED", "true").lower() in ("1", "true", "yes") + +# --------------------------------------------------------------------------- +# Path bootstrap – ensure project root is import‑able +# --------------------------------------------------------------------------- +SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +PROJECT_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "..", "..", "..")) +if PROJECT_ROOT not in sys.path: + sys.path.insert(0, PROJECT_ROOT) + +# --------------------------------------------------------------------------- +# Local imports (resolved after path bootstrap) +# --------------------------------------------------------------------------- +from onchain import OnChainClient, normalize_data # noqa: E402 +from src.data_cloud.cloud_utils import StorageHandler # noqa: E402 +from src.fetchers.coindesk_client.coindesk_utils import save_and_merge_parquet # noqa: E402 + +# --------------------------------------------------------------------------- +# Logging +# --------------------------------------------------------------------------- +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- +CHAIN_ASSET_MAP: Dict[str, int] = { + "ETH": 2, + "BSC": 8, + "BTC": 1, + "BASE": 2410, + "ARB": 808, +} + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _flatten_records(resp: Any, id_field: str = "id") -> pd.DataFrame: + """Flatten dict‑of‑dict → rows DataFrame; else defer to normalize_data().""" + if isinstance(resp, dict) and all(isinstance(v, dict) for v in resp.values()): + return pd.DataFrame([{id_field: k, **v} for k, v in resp.items()]) + return normalize_data(resp) + + +def _sanitize_for_parquet(df: pd.DataFrame) -> pd.DataFrame: + """Convert any nested dict/list columns to JSON strings for Arrow compatibility.""" + for col in df.columns: + if df[col].dtype == "object": + df[col] = df[col].apply(lambda x: _json.dumps(x) if isinstance(x, (dict, list)) else str(x)) + return df + + +def _save_merge(storage: StorageHandler, filename: str, df: pd.DataFrame, *, date_col: str, days: int): + """Sanitize then merge new df into history via save_and_merge_parquet().""" + if df.empty: + logger.debug("→ %s empty, skip merge", filename) + return + df = _sanitize_for_parquet(df) + save_and_merge_parquet(storage, filename, df, date_col=date_col, days=days) + logger.info("✔ Merged %s (%d rows)", filename, len(df)) + + +def _save_overwrite(storage: StorageHandler, filename: str, df: pd.DataFrame): + """Sanitize then overwrite local Parquet—bypass merge to avoid mixed types.""" + if df.empty: + logger.debug("→ %s empty, skip overwrite", filename) + return + df = _sanitize_for_parquet(df) + local_dir = storage.local_base + path = 
os.path.join(local_dir, filename) + os.makedirs(os.path.dirname(path), exist_ok=True) + df.to_parquet(path, index=False) + logger.info("✔ Overwrote %s (%d rows)", filename, len(df)) + +# --------------------------------------------------------------------------- +# On‑chain batch +# --------------------------------------------------------------------------- + +def fetch_onchain_all( + onchain: OnChainClient, + storage: StorageHandler, + symbols: List[str], + days_old: int, + block_configs: List[dict], +): + # Address metadata – overwrite to prevent nested-list merges + for sym in symbols: + chain_sym, address = sym.split("-", 1) + chain_id = CHAIN_ASSET_MAP.get(chain_sym) + + try: + logger.info("→ Address metadata %s on %s", address, chain_sym) + resp = onchain.get_address_metadata(chain_id, address).get("Data", {}) + df = pd.DataFrame([resp]) + _save_overwrite(storage, f"{sym}_address_metadata.parquet", df) + except Exception: + logger.exception("✗ Address metadata %s", sym) + + # Asset‑by‑address – overwrite for list‑rich fields + try: + logger.info("→ Asset‑by‑address %s on %s", address, chain_sym) + resp = onchain.get_data_by_address( + chain_asset=chain_sym, + address=address, + asset_lookup_priority="SYMBOL", + quote_asset="USD", + ).get("Data", {}) + df = normalize_data(resp) + _save_overwrite(storage, f"{sym}_data_by_address.parquet", df) + except Exception as e: + if getattr(getattr(e, "response", None), "status_code", None) == 404: + logger.warning("→ Asset‑by‑address unsupported for %s", sym) + else: + logger.exception("✗ Asset‑by‑address %s", sym) + + # Historical supply – safe merge + for chain_sym in {s.split("-", 1)[0] for s in symbols}: + # ── Historical supply (premium) ── + if SKIP_TIER_LOCKED: + logger.info("← Skipping historical supply for %s (tier-locked)", chain_sym) + else: + try: + logger.info("→ Supply days %s", chain_sym) + resp = onchain.get_historical_supply_days( + asset=chain_sym, + asset_lookup_priority="SYMBOL", + quote_asset="USD", + ).get("Data", {}) + df = normalize_data(resp) + _save_merge(storage, f"{chain_sym}_historical_supply_days.parquet", df, date_col="timestamp", days=days_old) + except Exception as e: + if getattr(getattr(e, "response", None), "status_code", None) == 401: + logger.warning("→ Supply tier-locked for %s", chain_sym) + else: + logger.exception("✗ Supply days %s", chain_sym) + + # Summary by chain – overwrite nested struct + for chain_sym in {s.split("-", 1)[0] for s in symbols}: + try: + logger.info("→ Chain summary %s", chain_sym) + resp = onchain.get_summary_by_chain( + chain_asset=chain_sym, + asset_lookup_priority="SYMBOL", + ).get("Data", {}) + df = pd.DataFrame([resp]) + _save_overwrite(storage, f"{chain_sym}_chain_summary.parquet", df) + except Exception: + logger.exception("✗ Chain summary %s", chain_sym) + + # Block data – safe merge + for cfg in block_configs: + ca, bn, groups = cfg["chain_asset"], cfg["block_number"], cfg["groups"] + try: + logger.info("→ Block %s:%s", ca, bn) + resp = onchain.get_block(ca, bn, groups=groups).get("Data", {}) + df = pd.DataFrame([resp]) + _save_merge(storage, f"block_{ca}_{bn}.parquet", df, date_col="timestamp", days=days_old) + except Exception: + logger.exception("✗ Block %s:%s", ca, bn) + +# --------------------------------------------------------------------------- +# AMM batch +# --------------------------------------------------------------------------- + +def fetch_amm_all( + onchain: OnChainClient, + storage: StorageHandler, + *, + market: str, + instruments: List[str], + days_old: 
int, + pairs: Optional[List[str]] = None, +): + logger.info("=== AMM %s – %s ===", market, ", ".join(instruments)) + + # Latest tick – safe merge + try: + tick = onchain.get_latest_swap_tick(market=market, instruments=instruments).get("Data", {}) + df = _flatten_records(tick, "instrument") + _save_merge(storage, f"{market}_latest_swap_tick.parquet", df, date_col="timestamp", days=days_old) + except Exception: + logger.exception("✗ Latest tick %s", market) + + # Historical OHLCV – safe merge + for inst in instruments: + try: + hist = onchain.get_historical_swap_days( + market=market, + instrument=inst, + limit=30, + aggregate=1, + fill=True, + ).get("Data", {}) + df = normalize_data(hist) + _save_merge(storage, f"{inst}_historical_swap_days.parquet", df, date_col="timestamp", days=days_old) + except Exception: + logger.exception("✗ OHLCV %s", inst) + + # Hourly messages – safe merge with warning + hour_ts = int(_dt.datetime.utcnow().replace(minute=0, second=0, microsecond=0).timestamp()) + for inst in instruments: + # ── Swap messages (premium) ── + if SKIP_TIER_LOCKED: + logger.info("← Skipping swap-messages for %s (tier-locked)", inst) + else: + try: + swaps = onchain.get_swap_messages_hour(market=market, instrument=inst, hour_ts=hour_ts).get("Data", {}) + df = normalize_data(swaps) + _save_merge(storage, f"{inst}_swap_messages_{hour_ts}.parquet", df, date_col="timestamp", days=days_old) + except Exception as e: + if getattr(getattr(e, "response", None), "status_code", None) == 401: + logger.warning("→ swap-messages tier-locked for %s", inst) + else: + logger.exception("✗ swap messages %s", inst) + try: + liq = onchain.get_liquidity_update_messages_hour(market=market, instrument=inst, hour_ts=hour_ts).get("Data", {}) + df = normalize_data(liq) + _save_merge(storage, f"{inst}_liquidity_updates_{hour_ts}.parquet", df, date_col="timestamp", days=days_old) + except Exception as e: + if SKIP_TIER_LOCKED: + logger.info("← Skipping liquidity-updates for %s (tier-locked)", inst) + elif getattr(getattr(e, "response", None), "status_code", None) == 401: + logger.warning("→ liquidity-updates tier-locked for %s", inst) + else: + logger.exception("✗ liquidity updates %s", inst) + + # Instrument metadata – safe merge + try: + meta = onchain.get_latest_instrument_metadata(market=market, instruments=instruments).get("Data", {}) + df = _flatten_records(meta, "instrument") + _save_merge(storage, f"{market}_instrument_metadata.parquet", df, date_col="timestamp", days=days_old) + except Exception: + logger.exception("✗ Instrument metadata %s", market) + + # Market overview – safe merge + try: + mkts = onchain.get_amm_markets(market=market).get("Data", {}) + df = _flatten_records(mkts, "market") + _save_merge(storage, f"{market}_markets.parquet", df, date_col="timestamp", days=days_old) + except Exception: + logger.exception("✗ Markets %s", market) + + # Optional pairs listing – safe merge + if pairs: + try: + lst = onchain.get_amm_markets_instruments(market=market, instruments=pairs).get("Data", {}) + df = _flatten_records(lst, "pair") + _save_merge(storage, f"{market}_markets_instruments.parquet", df, date_col="timestamp", days=days_old) + except Exception: + logger.exception("✗ Markets+instruments %s", market) + +# --------------------------------------------------------------------------- +# Orchestrator & CLI +# --------------------------------------------------------------------------- + +def fetch_all(config: Dict[str, Any] | None = None): + load_dotenv() + cfg = config or {} + + # Fix: check both 
'api_key' and 'api-key' (CLI uses --api-key), fallback to env + api_key = ( + cfg.get("api_key") + or cfg.get("api-key") + or os.getenv("COINDESK_API_KEY") + ) + print("Using API key:", api_key) + host = cfg.get("host") or os.getenv("COINDESK_API_HOST", "data-api.coindesk.com") + base_url = f"https://{host}/" + + days_old = int(cfg.get("days") or os.getenv("COINDESK_DAYS_OLD", 7)) + + symbols_arg = cfg.get("symbols") or os.getenv("COINDESK_SYMBOLS", "ETH-0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2") + symbols = [s.strip() for s in symbols_arg.split(",") if s.strip()] + + amm_market = cfg.get("amm_market") or os.getenv("COINDESK_AMM_MARKET", "uniswapv2") + amm_instruments_arg = cfg.get("amm_instruments") or os.getenv("COINDESK_AMM_INSTRUMENTS", "0x0d4a11d5eeaac28ec3f61d100daf4d40471f1852_2,0xb4e16d0168e52d35cacd2c6185b44281ec28c9dc_2") + amm_instruments = [s.strip() for s in amm_instruments_arg.split(",") if s.strip()] + + amm_pairs_arg = cfg.get("amm_pairs") or os.getenv("COINDESK_AMM_PAIRS", "WETH-USDC,WETH-USDT") + amm_pairs = [p.strip() for p in amm_pairs_arg.split(",") if p.strip()] + + block_configs = [ + {"chain_asset": 2, "block_number": 19501436, "groups": ["ID", "METADATA", "TRANSACTIONS"]}, + {"chain_asset": 8, "block_number": 33459930, "groups": ["ID", "METADATA", "TRANSACTIONS"]}, + {"chain_asset": 1, "block_number": 840946, "groups": ["ID", "METADATA", "TRANSACTIONS"]}, + {"chain_asset": 2410, "block_number": 17014740, "groups": ["ID", "METADATA", "TRANSACTIONS"]}, + {"chain_asset": 808, "block_number": 284999999,"groups": ["ID", "METADATA", "TRANSACTIONS"]}, + ] + + onchain = OnChainClient(api_key=api_key, base_url=base_url) + storage = StorageHandler( + endpoint_url=None, + access_key=None, + secret_key=None, + bucket_name=None, + local_base="data/coindesk/onchain", + ) + + # ------------------------------------------------------------------ + # Execute batches + # ------------------------------------------------------------------ + logger.info("=== Fetching on-chain data ===") + fetch_onchain_all(onchain, storage, symbols, days_old, block_configs) + + logger.info("=== Fetching AMM (%s) data ===", amm_market) + fetch_amm_all( + onchain, + storage, + market=amm_market, + instruments=amm_instruments, + days_old=days_old, + pairs=amm_pairs, + ) + +# --------------------------------------------------------------------------- +# CLI wrapper +# --------------------------------------------------------------------------- + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Fetch CoinDesk On-Chain & AMM data") + parser.add_argument("--symbols", help="comma-separated chain-symbol addresses (e.g. 'ETH-0x...,BTC-...')") + parser.add_argument("--days", type=int, help="merge window in days (default 7)") + parser.add_argument("--api-key", help="CoinDesk API key") + parser.add_argument("--host", help="API host override") + # AMM extras ------------------------------------------------------ + parser.add_argument("--amm-market", help="AMM market (e.g. 
'uniswapv2')") + parser.add_argument("--amm-instruments", help="comma-separated instrument addresses") + parser.add_argument("--amm-pairs", help="comma-separated token pairs for markets+instruments") + + args = parser.parse_args() + cfg = {k: v for k, v in vars(args).items() if v is not None} + + # Fallbacks to env handled inside fetch_all + fetch_all(cfg) diff --git a/src/fetchers/coindesk_client/marketcap.py b/src/fetchers/coindesk_client/marketcap.py new file mode 100644 index 0000000000000000000000000000000000000000..8faaac2c225ed90afd8c40f3b4c507f12c6f99fa --- /dev/null +++ b/src/fetchers/coindesk_client/marketcap.py @@ -0,0 +1,33 @@ +""" +marketcap.py – Market capitalization endpoints for CoinDesk API client. + +- get_latest_marketcap(): Fetch the latest market capitalization snapshot. +- get_historical_marketcap(symbol, start, end=None, limit=None): Retrieve historical market cap data for a given asset. +""" + +from client import BaseClient + +class MarketCapClient(BaseClient): + def get_latest_marketcap(self) -> dict: + """ + GET /overview/v1/latest/marketcap/all/tick + Returns the latest tick-level market-capitalisation snapshot for all assets. + """ + return self._get("overview/v1/latest/marketcap/all/tick") + + def get_historical_marketcap(self, symbol, start, end=None, limit=None): + """ + Fetch historical market capitalization for a specific asset over a timeframe. + + :param symbol: Asset symbol, e.g., "BTC-USD". + :param start: ISO8601 start datetime string. + :param end: ISO8601 end datetime string (optional). + :param limit: Maximum number of records to return (optional). + :return: JSON response with historical market cap data. + """ + params = {"start": start} + if end: + params["end"] = end + if limit is not None: + params["limit"] = limit + return self._get(f"marketcap/{symbol}/history", params=params) diff --git a/src/fetchers/coindesk_client/onchain.py b/src/fetchers/coindesk_client/onchain.py new file mode 100644 index 0000000000000000000000000000000000000000..ddd13221e0383383d94ca139e0bbe2b8c329d739 --- /dev/null +++ b/src/fetchers/coindesk_client/onchain.py @@ -0,0 +1,303 @@ +""" +onchain.py – CoinDesk Data API On-Chain & AMM endpoints. + +This client wraps the publicly-documented CoinDesk **/onchain/** routes, now including +Automated Market Maker (AMM) queries for Uniswap-style DEXs. + +Provided functionality +---------------------- +* Processed block data (multi-chain) +* Address metadata & asset summaries +* Historical supply-day metrics +* **NEW – AMM endpoints** + · Latest swap tick (price/volume snapshot) + · Historical OHLCV+ for swaps (daily aggregation) + · Raw swap messages (per-hour granularity) + · Liquidity-update messages (per-hour granularity) + · Instrument metadata + · Market & instrument discovery + +All helper methods return the raw `requests.Response` JSON. You can pass the output +through `normalize_data()` to obtain a tidy *pandas* `DataFrame`. + +Example +~~~~~~~ +>>> client = OnChainClient(api_key="YOUR_COIN_DESK_KEY") +>>> df = normalize_data( +... client.get_latest_swap_tick( +... market="uniswapv2", +... instruments=[ +... "0x0d4a11d5eeaac28ec3f61d100daf4d40471f1852_2", # WETH/USDT +... "0xb4e16d0168e52d35cacd2c6185b44281ec28c9dc_2", # WETH/USDC +... ] +... ) +... 
) +>>> df.head() + +Dependencies +------------ +* `pandas` (tabular manipulation) +* `requests` (via BaseClient) +""" +from __future__ import annotations + +from typing import Any, List, Optional, Dict +import pandas as pd + +from client import BaseClient # ← must expose ._get() + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def normalize_data(raw: Any) -> pd.DataFrame: + """Best-effort conversion of API *raw* JSON into a :class:`pandas.DataFrame`. Handles + the three typical response shapes returned by CoinDesk: + + * **list[dict]** – directly convertible via :pyclass:`pandas.DataFrame` + * **dict[str, list]** where all lists are equal length – idem + * **dict[str, Any]** heterogeneous – wrapped in a single-row DataFrame + """ + if isinstance(raw, list): + return pd.DataFrame(raw) + + if isinstance(raw, dict): + try: + return pd.DataFrame(raw) + except ValueError: # unequal length sequences → single row + return pd.DataFrame([raw]) + + # Fallback – unknown shape + return pd.DataFrame() + + +# --------------------------------------------------------------------------- +# Main client +# --------------------------------------------------------------------------- + +class OnChainClient(BaseClient): + """Typed thin wrapper around the CoinDesk On-Chain REST API.""" + + # --------------------------------------------------------------------- + # Core (already present) ------------------------------------------------ + # --------------------------------------------------------------------- + + def get_block(self, chain_asset: int, block_number: int, *, groups: List[str]): + """Processed block data for *chain_asset* at *block_number*. + + ``groups`` is a list such as ``["ID", "METADATA", "TRANSACTIONS"]``. + Maps to ``/onchain/v1/block/{chain_asset}``. + """ + return self._get( + f"onchain/v1/block/{chain_asset}", + params={"block_number": block_number, "groups": ",".join(groups)}, + ) + + def get_address_metadata(self, chain_asset: int, address: str): + """Rich metadata for an *address* on *chain_asset*.""" + return self._get( + f"onchain/v1/address/metadata/{chain_asset}", params={"address": address} + ) + + def get_summary_by_chain(self, chain_asset: str, *, asset_lookup_priority: str = "SYMBOL"): + """Summary view of assets for a blockchain network.""" + return self._get( + "onchain/v3/summary/by/chain", + params={ + "chain_asset": chain_asset, + "asset_lookup_priority": asset_lookup_priority, + }, + ) + + def get_data_by_address( + self, + chain_asset: str, + address: str, + *, + asset_lookup_priority: str = "SYMBOL", + quote_asset: str = "USD", + ): + """Look-up asset data (balance, value, etc.) 
by *address*.""" + return self._get( + "onchain/v2/data/by/address", + params={ + "chain_asset": chain_asset, + "address": address, + "asset_lookup_priority": asset_lookup_priority, + "quote_asset": quote_asset, + }, + ) + + def get_historical_supply_days( + self, + asset: str, + *, + asset_lookup_priority: str = "SYMBOL", + quote_asset: Optional[str] = None, + ): + """Daily historical supply for *asset* – available for major networks.""" + params: Dict[str, str] = { + "asset": asset, + "asset_lookup_priority": asset_lookup_priority, + } + if quote_asset: + params["quote_asset"] = quote_asset + return self._get("onchain/v2/historical/supply/days", params=params) + + # --------------------------------------------------------------------- + # AMM (new) ------------------------------------------------------------ + # --------------------------------------------------------------------- + + # Helpers – convert booleans to lower-case strings required by API + _bool = staticmethod(lambda x: str(bool(x)).lower()) + + def get_latest_swap_tick( + self, + *, + market: str, + instruments: List[str], + instrument_lookup_strategy: str = "ALL_OPTIONS", + apply_mapping: bool = True, + ): + """Latest tick (price, volume, liquidity) for one or many *instruments* on an AMM + *market* (e.g. ``"uniswapv2"``). + + **Endpoint** ``/onchain/v1/amm/latest/swap/tick`` + """ + return self._get( + "onchain/v1/amm/latest/swap/tick", + params={ + "market": market, + "instruments": ",".join(instruments), + "instrument_lookup_strategy": instrument_lookup_strategy, + "apply_mapping": self._bool(apply_mapping), + }, + ) + + def get_historical_swap_days( + self, + *, + market: str, + instrument: str, + limit: int = 30, + aggregate: int = 1, + fill: bool = True, + instrument_lookup_strategy: str = "ALL_OPTIONS", + apply_mapping: bool = True, + ): + """Daily OHLCV+ history for a swap *instrument* (e.g. LP address _tokenId). + + **Endpoint** ``/onchain/v1/amm/historical/swap/days`` + """ + return self._get( + "onchain/v1/amm/historical/swap/days", + params={ + "market": market, + "instrument": instrument, + "limit": limit, + "aggregate": aggregate, + "fill": self._bool(fill), + "instrument_lookup_strategy": instrument_lookup_strategy, + "apply_mapping": self._bool(apply_mapping), + }, + ) + + def get_swap_messages_hour( + self, + *, + market: str, + instrument: str, + hour_ts: int, + instrument_lookup_strategy: str = "ALL_OPTIONS", + apply_mapping: bool = True, + ): + """Raw swap messages (mints/burns/swaps) for a given *hour_ts* (UNIX seconds). + + **Endpoint** ``/onchain/v2/amm/historical/swap-messages/hour`` + """ + return self._get( + "onchain/v2/amm/historical/swap-messages/hour", + params={ + "market": market, + "instrument": instrument, + "hour_ts": hour_ts, + "instrument_lookup_strategy": instrument_lookup_strategy, + "apply_mapping": self._bool(apply_mapping), + }, + ) + + def get_liquidity_update_messages_hour( + self, + *, + market: str, + instrument: str, + hour_ts: int, + instrument_lookup_strategy: str = "ALL_OPTIONS", + apply_mapping: bool = True, + ): + """Liquidity add/remove messages for the specified *hour_ts*. 
+ + **Endpoint** ``/onchain/v2/amm/historical/liquidity-update-messages/hour`` + """ + return self._get( + "onchain/v2/amm/historical/liquidity-update-messages/hour", + params={ + "market": market, + "instrument": instrument, + "hour_ts": hour_ts, + "instrument_lookup_strategy": instrument_lookup_strategy, + "apply_mapping": self._bool(apply_mapping), + }, + ) + + def get_latest_instrument_metadata( + self, + *, + market: str, + instruments: List[str], + instrument_lookup_strategy: str = "ALL_OPTIONS", + apply_mapping: bool = True, + ): + """Token-pair metadata (decimals, symbols, etc.) for *instruments*. + + **Endpoint** ``/onchain/v1/amm/latest/instrument/metadata`` + """ + return self._get( + "onchain/v1/amm/latest/instrument/metadata", + params={ + "market": market, + "instruments": ",".join(instruments), + "instrument_lookup_strategy": instrument_lookup_strategy, + "apply_mapping": self._bool(apply_mapping), + }, + ) + + # ------------------------------------------------------------------ + # Market discovery + # ------------------------------------------------------------------ + + def get_amm_markets(self, *, market: str): + """List details about an AMM *market* (e.g. pools count, TVL).""" + return self._get("onchain/v1/amm/markets", params={"market": market}) + + def get_amm_markets_instruments( + self, + *, + market: str, + instruments: List[str], + instrument_status: str = "ACTIVE", + instrument_lookup_strategy: str = "ALL_OPTIONS", + ): + """Enumerate instruments on an AMM *market* filtered by *instrument_status*. + + **Endpoint** ``/onchain/v1/amm/markets/instruments`` + """ + return self._get( + "onchain/v1/amm/markets/instruments", + params={ + "market": market, + "instruments": ",".join(instruments), + "instrument_status": instrument_status, + "instrument_lookup_strategy": instrument_lookup_strategy, + }, + ) diff --git a/src/fetchers/coindesk_client/sentiment.py b/src/fetchers/coindesk_client/sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..7feba589be26f675cfa33953b0197ba98ee4e096 --- /dev/null +++ b/src/fetchers/coindesk_client/sentiment.py @@ -0,0 +1,32 @@ +""" +sentiment.py – Sentiment data endpoints for CoinDesk API client. + +- get_asset_sentiment(symbol): Fetch the latest sentiment score for a given asset. +- get_historical_sentiment(symbol, days, limit=None): Retrieve sentiment history over N days. +""" + +from client import BaseClient + +class SentimentClient(BaseClient): + def get_asset_sentiment(self, symbol): + """ + Fetch the latest sentiment score for the specified symbol. + + :param symbol: Asset symbol, e.g., "BTC-USD". + :return: JSON response with sentiment score. + """ + return self._get(f"sentiment/{symbol}") + + def get_historical_sentiment(self, symbol, days, limit=None): + """ + Fetch sentiment history for a symbol over the past `days` days. + + :param symbol: Asset symbol, e.g., "BTC-USD". + :param days: Number of days of history to retrieve. + :param limit: Maximum number of records to return (optional). + :return: JSON response with historical sentiment data. 
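+
+        Example (an illustrative sketch – the constructor signature and the exact
+        endpoint behaviour are assumptions based on this client's conventions):
+
+            >>> client = SentimentClient(api_key="YOUR_COINDESK_KEY")
+            >>> history = client.get_historical_sentiment("BTC-USD", days=7, limit=100)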
+ """ + params = {"days": days} + if limit is not None: + params["limit"] = limit + return self._get(f"sentiment/{symbol}/historical", params=params) diff --git a/src/fetchers/coindesk_client/spot.py b/src/fetchers/coindesk_client/spot.py new file mode 100644 index 0000000000000000000000000000000000000000..9420cd6867149de809c954e3c33906c2690d75ad --- /dev/null +++ b/src/fetchers/coindesk_client/spot.py @@ -0,0 +1,83 @@ +# spot.py + +import logging +import requests +from typing import Union, List, Dict, Any +from requests import HTTPError, Session +from client import BaseClient + +logger = logging.getLogger(__name__) + + +class SpotClient(BaseClient): + """ + Spot market endpoints for CCData (CryptoCompare / CoinDesk) Data API. + + - list_markets_instruments(market): all supported instrument codes for a spot market. + - list_markets(market, groups): all spot markets, optionally filtered. + - get_latest_tick(market, instruments): latest tick data for one or more instruments. + """ + + def __init__(self, api_key: str, base_url: str = None, timeout: int = 10): + super().__init__(api_key=api_key, base_url=base_url) + self.timeout = timeout + # Use a Session for connection pooling & retries + self.session = Session() + adapter = requests.adapters.HTTPAdapter(max_retries=3) + self.session.mount("https://", adapter) + + def list_markets_instruments(self, market: str) -> Dict[str, Any]: + """ + GET /spot/v1/markets/instruments + :param market: Exchange slug (e.g. "binance") + :returns: {"Data": [ {instrument, ...}, … ]} + """ + params = {"market": market} + return self._get("spot/v1/markets/instruments", params=params, timeout=self.timeout) + + def list_markets(self, market: str = None, groups: str = "BASIC") -> Dict[str, Any]: + """ + GET /spot/v1/markets + :param market: optional exchange slug to filter by + :param groups: filter group name (e.g. "BASIC", "ADVANCED") + :returns: {"Data": [ {market info…}, … ]} + """ + params: Dict[str, Any] = {"groups": groups} + if market: + params["market"] = market + return self._get("spot/v1/markets", params=params, timeout=self.timeout) + + def get_latest_tick( + self, + market: str, + instruments: Union[str, List[str]] + ) -> Dict[str, Any]: + """ + GET /spot/v1/latest/tick + :param market: Exchange slug (e.g. "binance") + :param instruments: Single ID or list (e.g. "BTC-USDT" or ["BTC-USDT","ETH-USDT"]) + :returns: {"Data": [ { ...tick fields... }, … ]} + """ + # allow list-of-str or comma-string + if isinstance(instruments, (list, tuple)): + instr_param = ",".join(instruments) + else: + instr_param = instruments + + params = {"market": market, "instruments": instr_param} + + try: + resp = self._get("spot/v1/latest/tick", params=params, timeout=self.timeout) + except HTTPError as e: + logger.warning( + "Failed to fetch latest tick(%s) on %s: %s", + instr_param, market, e + ) + return {"Data": []} + + data = resp.get("Data", []) + # ensure we always return a list + if isinstance(data, dict): + data = [data] + + return {"Data": data} diff --git a/src/fetchers/coindesk_client/utils.py b/src/fetchers/coindesk_client/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..eb79a3024aad9d8d3fee3606d8f69a61b26a9960 --- /dev/null +++ b/src/fetchers/coindesk_client/utils.py @@ -0,0 +1,33 @@ +""" +utils.py – Common helpers for CoinDesk API client. 
+ +- Parsing and formatting helpers (e.g., date conversion) +- Logging setup +- Retry/backoff utilities (for transient errors) +""" + +import logging +from datetime import datetime +import time + +def parse_date(date_str): + return datetime.fromisoformat(date_str) + +def setup_logger(name): + logger = logging.getLogger(name) + if not logger.handlers: + handler = logging.StreamHandler() + formatter = logging.Formatter('[%(asctime)s] %(levelname)s %(name)s: %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) + logger.setLevel(logging.INFO) + return logger + +def retry(func, retries=3, delay=2): + for attempt in range(retries): + try: + return func() + except Exception as e: + if attempt == retries - 1: + raise + time.sleep(delay) diff --git a/src/fetchers/crawl4ai/crawl_news.py b/src/fetchers/crawl4ai/crawl_news.py new file mode 100644 index 0000000000000000000000000000000000000000..671ce296f599fcbf6937d9316ca05a483c42c734 --- /dev/null +++ b/src/fetchers/crawl4ai/crawl_news.py @@ -0,0 +1,205 @@ +""" +crawl_news.py – Crawls a list of RSS feeds, grabs full-text when needed, +merges with any existing Parquet in Filebase S3 and uploads the fresh file. + +✱ 2025-07-11 – switched backend to Filebase S3 + • Uses boto3 pointed at Filebase's S3-compatible endpoint + • No local caching of seen URLs: state lives in S3 under seen_urls.txt + +Requirements: + • FILEBASE_ENDPOINT env var, e.g. https://s3.filebase.com + • FILEBASE_ACCESS_KEY and FILEBASE_SECRET_KEY env vars + • FILEBASE_BUCKET env var with your bucket name +""" + +import os +import sys +import asyncio +import tempfile +from datetime import datetime +from io import BytesIO +from pathlib import Path + +from dotenv import load_dotenv +import feedparser +import trafilatura +import pandas as pd +import rich.console +from crawl4ai import AsyncWebCrawler + +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) +from data_cloud.cloud_utils import StorageHandler + +# ─── Configuration ──────────────────────────────────────────────────────────── +load_dotenv() + +FEED_URLS = [ + "https://www.marketwatch.com/rss/topstories", + "https://thedefiant.io/feed/", + "https://www.coindesk.com/arc/outboundfeeds/rss/?outputType=xml", + "https://cointelegraph.com/rss", + "https://cryptopotato.com/feed/", + "https://cryptoslate.com/feed/", + "https://cryptonews.com/news/feed/", + "https://smartliquidity.info/feed/", + "https://www.cnbc.com/id/10000664/device/rss/rss.html", + "https://time.com/nextadvisor/feed/", +] +MAX_AGE_DAYS = 1 +MIN_SUMMARY_LEN = 200 +MIN_CRAWL_LEN = 100 +CRAWL_CONCURRENCY = 4 + +S3_NEWS_PATH = "news/crawled_news/news-latest.parquet" +S3_SEEN_PATH = "news/crawled_news/seen_urls.txt" + +# Filebase S3 creds & endpoint ------------------------------------------------ +FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT") +FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY") +FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY") +FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET") + +if not (FILEBASE_ENDPOINT and FILEBASE_ACCESS_KEY and FILEBASE_SECRET_KEY and FILEBASE_BUCKET): + print("[ERROR] FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, and FILEBASE_BUCKET must be set") + sys.exit(1) + +# Silence logs ---------------------------------------------------------------- +rich.console.Console.print = lambda *a, **k: None +os.environ.update({ + "RICH_NO_COLOR": "1", + "RICH_DISABLE": "1", + "CRAWL4AI_LOG_LEVEL": "CRITICAL", +}) + +# ─── Main routine 
───────────────────────────────────────────────────────────── +async def main() -> None: + # Setup storage handler + storage = StorageHandler( + endpoint_url=FILEBASE_ENDPOINT, + access_key=FILEBASE_ACCESS_KEY, + secret_key=FILEBASE_SECRET_KEY, + bucket_name=FILEBASE_BUCKET, + local_base="data" + ) + + # Load seen-URL cache from S3 only, do not fallback to local or create locally + seen_urls: set[str] = set() + try: + seen_data = storage.s3.get_object(Bucket=storage.bucket, Key=S3_SEEN_PATH)['Body'].read() + text = seen_data.decode() + seen_urls = {line.strip() for line in text.splitlines() if line.strip()} + print(f"[INFO] Loaded {len(seen_urls)} seen URLs from S3") + except Exception: + print(f"[INFO] No seen URLs found in S3. Treating as empty.") + seen_urls = set() + + # Fetch & parse RSS feeds ------------------------------------------------- + to_crawl, immediate = [], [] + now_utc = datetime.utcnow() + for url in FEED_URLS: + feed = feedparser.parse(url) + new_count = 0 + for e in feed.entries: + ts = e.get("published_parsed") or e.get("updated_parsed") + if not ts: + continue + link = e.link + if link in seen_urls: + continue + new_count += 1 + + content = e.get("content") + if content: + txt = "".join(p.value for p in content).strip() + if len(txt) >= MIN_CRAWL_LEN: + immediate.append({"url": link, "text": txt, "timestamp": now_utc.isoformat()}) + seen_urls.add(link) + continue + + summ = e.get("summary", "").strip() + if len(summ) >= MIN_SUMMARY_LEN: + immediate.append({"url": link, "text": summ, "timestamp": now_utc.isoformat()}) + seen_urls.add(link) + else: + to_crawl.append(link) + + print(f"• Feed {url} -> {new_count} new items") + + # Selective crawl for short summaries ------------------------------------ + crawled = [] + if to_crawl: + print(f"[INFO] Crawling {len(to_crawl)} pages…") + async with AsyncWebCrawler( + seeds=to_crawl, + max_pages=len(to_crawl), + concurrency=CRAWL_CONCURRENCY, + obey_robots_txt=True, + ) as crawler: + pages = await asyncio.gather(*(crawler.arun(u) for u in to_crawl)) + for sub in pages: + for page in sub: + if page.url not in seen_urls: + txt = trafilatura.extract(page.html, favor_recall=True) + if txt and len(txt.strip()) >= MIN_CRAWL_LEN: + crawled.append({"url": page.url, "text": txt.strip(), "timestamp": now_utc.isoformat()}) + seen_urls.add(page.url) + + # Merge, filter & dedupe -------------------------------------------------- + new_results = immediate + crawled + if not new_results: + print("[WARNING] No new articles to process") + return + + df_new = pd.DataFrame(new_results) + df_new["timestamp"] = pd.to_datetime(df_new["timestamp"], utc=True) + + # Load existing Parquet (cloud or local) + df_old = pd.DataFrame() + try: + parquet_bytes = storage.download(S3_NEWS_PATH) + with tempfile.NamedTemporaryFile(suffix=".parquet", delete=False) as tmp: + tmp.write(parquet_bytes) + tmp_path = tmp.name + df_old = pd.read_parquet(tmp_path) + os.remove(tmp_path) + print(f"[INFO] Loaded {len(df_old)} existing articles from {storage.get_last_mode()}") + except Exception: + print(f"[INFO] No existing Parquet found in cloud or local storage.") + + df = pd.concat([df_old, df_new], ignore_index=True) + cutoff = pd.Timestamp.utcnow() - pd.Timedelta(days=MAX_AGE_DAYS) + df = df[df.timestamp >= cutoff] + df = df.sort_values("timestamp").drop_duplicates("url", keep="last") + print(f"[DEBUG] old rows: {len(df_old)}, new rows: {len(df_new)}, merged: {len(df)}") + + # Upload updated Parquet to S3 only + parquet_buf = BytesIO() + df.to_parquet(parquet_buf, 
index=False) + data = parquet_buf.getvalue() + if not data: + raise RuntimeError("Refusing to upload empty Parquet") + storage.s3.put_object(Bucket=storage.bucket, Key=S3_NEWS_PATH, Body=data, ContentType="application/octet-stream") + print(f"[OK] Parquet updated: S3:{S3_NEWS_PATH}") + + # Persist seen URLs to S3 only + seen_body = "\n".join(sorted(seen_urls)) + "\n" + storage.s3.put_object(Bucket=storage.bucket, Key=S3_SEEN_PATH, Body=seen_body.encode(), ContentType="text/plain") + print(f"[OK] Seen URLs updated: S3:{S3_SEEN_PATH}") + + # Upload all files in data/crawled-news to S3 under news/ (no local fallback) + local_news_dir = os.path.join("data", "crawled-news") + s3_news_prefix = "news/crawled_news/" + for root, _, files in os.walk(local_news_dir): + for fname in files: + local_path = os.path.join(root, fname) + rel_path = os.path.relpath(local_path, local_news_dir) + s3_key = s3_news_prefix + rel_path.replace("\\", "/") + with open(local_path, "rb") as f: + file_bytes = f.read() + storage.s3.put_object(Bucket=storage.bucket, Key=s3_key, Body=file_bytes, ContentType="application/octet-stream") + print(f"[OK] Uploaded {local_path} -> S3:{s3_key}") + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/src/fetchers/crypto_bubbles/fetch_crypto_bubbles.py b/src/fetchers/crypto_bubbles/fetch_crypto_bubbles.py new file mode 100644 index 0000000000000000000000000000000000000000..b29d33ed6ae87833797b62fd132dd0c6d976dd47 --- /dev/null +++ b/src/fetchers/crypto_bubbles/fetch_crypto_bubbles.py @@ -0,0 +1,194 @@ +""" +fetch_crypto_bubbles.py – Fetches CryptoBubbles data, converts to Parquet and JSON report, +then uploads both directly to Filebase S3 instead of local storage. + +✱ 2025-07-11 – switched backend from local filesystem to Filebase S3 + • Uses boto3 against FILEBASE_ENDPOINT + • No local disk writes; everything streams directly to S3 + +Requirements: + • FILEBASE_ENDPOINT env var, e.g. 
https://s3.filebase.com + • FILEBASE_ACCESS_KEY and FILEBASE_SECRET_KEY env vars + • FILEBASE_BUCKET env var with your bucket name + • dotenv for loading env vars from .env (optional) +""" + +import os +import sys +import json +import datetime as _dt +import argparse +from io import BytesIO + +from collections import defaultdict +import numpy as np +import pandas as pd +import requests + + +# Ensure src is in sys.path for direct script execution +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))) +from data_cloud.cloud_utils import StorageHandler +from dotenv import load_dotenv + +# ─── Configuration ──────────────────────────────────────────────────────────── +load_dotenv() + +URL = os.getenv("CRYPTOBUBBLES_URL", "https://cryptobubbles.net/backend/data/bubbles1000.usd.json") + +# Filebase S3 credentials +FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT") +FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY") +FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY") +FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET") + +if not all([FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, FILEBASE_BUCKET]): + print("[ERROR] FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, and FILEBASE_BUCKET must be set") + sys.exit(1) + +# boto3 S3 client config +from botocore.config import Config +CFG = Config( + signature_version="s3v4", + s3={"addressing_style": "path"}, +) + + + +# ─── Data fetch & processing ───────────────────────────────────────────────── + +def fetch_json(url: str = URL, timeout: int = 15): + resp = requests.get(url, timeout=timeout) + resp.raise_for_status() + payload = resp.json() + return payload.get("data", payload) if isinstance(payload, dict) else payload + + +def to_dataframe(raw): + return pd.json_normalize(raw) + + +def categorize_columns(df: pd.DataFrame): + groups = defaultdict(list) + for col in df.columns: + if "." 
in col: + prefix, _ = col.split('.', 1) + groups[prefix].append(col) + else: + groups['base'].append(col) + + nice = { + 'base': 'Base Features', + 'symbols': 'Symbols', + 'performance': 'Performance', + 'rankDiffs': 'Rank Differences', + 'exchangePrices': 'Exchange Prices', + 'links': 'Links', + } + + fc = {} + for key, cols in groups.items(): + name = nice.get(key, key.capitalize()) + fc[name] = {'count': len(cols), 'features': cols} + return fc + + +def generate_report(df, configuration): + now = _dt.datetime.utcnow().isoformat() + mem_mb = df.memory_usage(deep=True).sum() / 1024**2 + dataset_info = { + 'shape': [df.shape[0], df.shape[1]], + 'memory_usage_mb': mem_mb, + 'time_range': {'start': None, 'end': None}, + } + + fc = categorize_columns(df) + + missing = df.isna().sum().to_dict() + total_cells = df.shape[0] * df.shape[1] + non_missing = df.count().sum() + completeness = non_missing / total_cells * 100 + col_quals = [(df.shape[0] - m) / df.shape[0] for m in missing.values()] + avg_quality = float(np.mean(col_quals)) + + data_quality = { + 'completeness': completeness, + 'missing_values_by_column': missing, + 'avg_quality_score': avg_quality, + } + + report = { + 'timestamp': now, + 'dataset_info': dataset_info, + 'feature_categories': fc, + 'data_quality': data_quality, + 'feature_importance': {}, + 'configuration': configuration, + } + return report + +# ─── Main ───────────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description='Fetch CryptoBubbles, upload to Filebase') + parser.add_argument('--prefix', default='crypto-bubbles', help='S3 key prefix') + args = parser.parse_args() + + prefix = args.prefix.rstrip('/') + today = _dt.date.today().isoformat() + + raw = fetch_json() + df = to_dataframe(raw) + + # configuration placeholder + configuration = { + 'enable_advanced_indicators': True, + 'enable_feature_selection': True, + 'enable_anomaly_detection': True, + 'max_correlation_threshold': 0.95, + 'min_feature_importance': 0.001, + 'outlier_detection_method': 'iqr', + 'feature_scaling': True, + } + + report = generate_report(df, configuration) + + # prepare Parquet bytes + buf = BytesIO() + df.to_parquet(buf, index=False) + parquet_data = buf.getvalue() + + # prepare JSON report bytes + report_json = json.dumps(report, indent=2).encode() + + + # Use StorageHandler for unified cloud/local upload + storage = StorageHandler( + endpoint_url=None, + access_key=None, + secret_key=None, + bucket_name=None, + local_base="data" + ) + + key_parquet = f"{prefix}/crypto_bubbles_{today}.parquet" + key_report = f"{prefix}/crypto_bubbles_report_{today}.json" + + # Upload Parquet + try: + storage.upload(key_parquet, parquet_data, content_type='application/octet-stream') + print(f"[OK] Uploaded Parquet -> {storage.get_last_mode()}:{key_parquet}") + except Exception as e: + print(f"[ERROR] Failed uploading Parquet: {e}", file=sys.stderr) + + # Upload JSON report + try: + storage.upload(key_report, report_json, content_type='application/json') + print(f"[OK] Uploaded report -> {storage.get_last_mode()}:{key_report}") + except Exception as e: + print(f"[ERROR] Failed uploading report: {e}", file=sys.stderr) + +if __name__ == '__main__': + main() diff --git a/src/fetchers/cryptocompare/client.py b/src/fetchers/cryptocompare/client.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/fetchers/cryptocompare/forum_trending.py 
b/src/fetchers/cryptocompare/forum_trending.py new file mode 100644 index 0000000000000000000000000000000000000000..ff94a9070b0aacfd376461797eb196405b1cdd53 --- /dev/null +++ b/src/fetchers/cryptocompare/forum_trending.py @@ -0,0 +1,26 @@ +""" +forum_trending.py – fetch_forum_trending(extraParams, ...) +""" + +from .client import CryptoCompareClient + + +FORUM_BASE_URL = "https://www.cryptocompare.com/" + +class ForumTrending: + def __init__(self): + self.client = CryptoCompareClient() + + def fetch_forum_trending(self, extraParams=None, **kwargs): + """ + Fetch trending forum topics. + API: https://www.cryptocompare.com/api/forum/get/trending/ + """ + params = {"extraParams": extraParams} if extraParams else {} + params.update(kwargs) + # Use requests directly for this endpoint, as it is not on min-api + import requests + url = FORUM_BASE_URL + "api/forum/get/trending/" + resp = requests.get(url, params=params) + resp.raise_for_status() + return resp.json() diff --git a/src/fetchers/cryptocompare/histohour.py b/src/fetchers/cryptocompare/histohour.py new file mode 100644 index 0000000000000000000000000000000000000000..ad2057abdfd25c8407397b9c68441471e9289891 --- /dev/null +++ b/src/fetchers/cryptocompare/histohour.py @@ -0,0 +1,17 @@ +""" +histohour.py – fetch_histohour(symbol, limit, aggregate, ...) + +API sample: +https://min-api.cryptocompare.com/data/v2/histohour?aggregate=1&e=CCCAGG&extraParams=https:%2F%2Fwww.cryptocompare.com&fsym=BTC&limit=24&tryConversion=false&tsym=USD +""" + +from .client import CryptoCompareClient + +class HistoHour: + def __init__(self): + self.client = CryptoCompareClient() + + def fetch_histohour(self, fsym, tsym, limit=24, aggregate=1, **kwargs): + params = {"fsym": fsym, "tsym": tsym, "limit": limit, "aggregate": aggregate} + params.update(kwargs) + return self.client.get("v2/histohour", params=params) diff --git a/src/fetchers/cryptocompare/recommended.py b/src/fetchers/cryptocompare/recommended.py new file mode 100644 index 0000000000000000000000000000000000000000..7526a18a5a1cce24ac55a5266b6d2b5eb3111d45 --- /dev/null +++ b/src/fetchers/cryptocompare/recommended.py @@ -0,0 +1,17 @@ +""" +recommended.py – fetch_recommended_all(tsym, ...) + +API sample: +http://min-api.cryptocompare.com/data/recommended/all?tsym=USD +""" + +from .client import CryptoCompareClient + +class Recommended: + def __init__(self): + self.client = CryptoCompareClient() + + def fetch_recommended_all(self, tsym, **kwargs): + params = {"tsym": tsym} + params.update(kwargs) + return self.client.get("top/recommended", params=params) diff --git a/src/fetchers/cryptocompare/top_toptier_volume.py b/src/fetchers/cryptocompare/top_toptier_volume.py new file mode 100644 index 0000000000000000000000000000000000000000..97d245dab570ce2f65a5ec9c4b9f77bac67710ee --- /dev/null +++ b/src/fetchers/cryptocompare/top_toptier_volume.py @@ -0,0 +1,17 @@ +""" +top_toptier_volume.py – fetch_top_toptier_volume(assetClass, ...) 
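+
+Example (a sketch only; assumes a working, configured CryptoCompareClient – the
+client.py module in this package is currently empty):
+
+    TopTopTierVolume().fetch_top_toptier_volume("ALL", tsym="USD", limit=100, page=0)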
+ +API sample: +https://min-api.cryptocompare.com/data/top/totaltoptiervol?ascending=true&assetClass=ALL&extraParams=https:%2F%2Fwww.cryptocompare.com&limit=100&page=0&tsym=USD +""" + +from .client import CryptoCompareClient + +class TopTopTierVolume: + def __init__(self): + self.client = CryptoCompareClient() + + def fetch_top_toptier_volume(self, assetClass, **kwargs): + params = {"assetClass": assetClass} + params.update(kwargs) + return self.client.get("top/totaltoptiervolfull", params=params) diff --git a/src/fetchers/cryptocompare/utils.py b/src/fetchers/cryptocompare/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4b19f02f194ff85f718207091b4bc72ca0275a75 --- /dev/null +++ b/src/fetchers/cryptocompare/utils.py @@ -0,0 +1,36 @@ +""" +utils.py – Shared helpers for CryptoCompare API client. +- Timestamp conversion, caching, and rate-limiting +""" + +import time +from functools import wraps + +def timestamp_to_iso(ts): + return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(ts)) + +# Simple in-memory cache (for demonstration) +_cache = {} +def cache_result(func): + @wraps(func) + def wrapper(*args, **kwargs): + key = (func.__name__, args, tuple(sorted(kwargs.items()))) + if key in _cache: + return _cache[key] + result = func(*args, **kwargs) + _cache[key] = result + return result + return wrapper + +# Simple rate limiter (1 call/sec) +def rate_limited(func): + last_called = [0] + @wraps(func) + def wrapper(*args, **kwargs): + elapsed = time.time() - last_called[0] + if elapsed < 1: + time.sleep(1 - elapsed) + result = func(*args, **kwargs) + last_called[0] = time.time() + return result + return wrapper diff --git a/src/fetchers/finnhub/generate_finnhub_features.py b/src/fetchers/finnhub/generate_finnhub_features.py new file mode 100644 index 0000000000000000000000000000000000000000..34f731e294b8a08dfda1192a3fe28f9b1a645b10 --- /dev/null +++ b/src/fetchers/finnhub/generate_finnhub_features.py @@ -0,0 +1,169 @@ +# """generate_finnhub_features.py + +# Automatic feature generator for a local Finnhub data dump (Parquet files). + +# Usage +# ----- +# python generate_finnhub_features.py --data /path/to/finnhub \ +# --out-features features_all.parquet \ +# --out-report feature_report.json + +# The script walks through **all** Parquet files contained in the directory +# structure exported from Finnhub, concatenates them on the **timestamp** index +# (if present) or on the DataFrame index otherwise, prefixes every column with a +# stable identifier built from its file path to guarantee *no data loss*, and +# computes a lightweight metadata report inspired by AdvisorAI's format. + +# Key design principles +# --------------------- +# * No column is dropped – every raw field ends-up in the final output. +# * Column names are namespaced with `__`. +# * When multiple DataFrames contain an explicit timestamp or date column, they +# are converted to pandas `datetime64[ns]` and merged on the outer union of +# timestamps to preserve every record. +# * Numeric features are left untouched; you may append your own engineered +# columns in `extend_features()` without altering the originals. 
+# * The JSON report contains: +# - basic shape / memory stats +# - global time range from the merged index +# - missing-value analysis (per column null counts & completeness %) +# - feature category counts (simple heuristic) +# """ + +# from __future__ import annotations + +# import argparse +# import json +# from pathlib import Path +# from typing import Dict, List, Tuple + +# import pandas as pd +# import numpy as np + + +# TIMESTAMP_CANDIDATES = {"timestamp", "time", "date", "datetime", "t", "ts", "priced_at"} + + +# def find_parquet_files(root: Path) -> List[Path]: +# return [p for p in root.rglob("*.parquet") if p.is_file()] + + +# def build_prefix(file_path: Path, root: Path) -> str: +# rel = file_path.relative_to(root) +# no_ext = rel.as_posix().replace("/", "_").rsplit(".", 1)[0] +# return f"{no_ext}__" + + +# def load_and_prefix(file_path: Path, root: Path) -> Tuple[pd.DataFrame, str]: +# df = pd.read_parquet(file_path) +# prefix = build_prefix(file_path, root) +# df = df.rename(columns={c: f"{prefix}{c}" for c in df.columns}) +# # Identify/standardise timestamp column (if any) +# for c in list(df.columns): +# base = c.split("__")[-1].lower() +# if base in TIMESTAMP_CANDIDATES: +# # convert numeric seconds or string dates to datetime +# df[f"{prefix}__ts"] = pd.to_datetime(df[c], errors="coerce", unit="s") +# df = df.drop(columns=[c]) +# # Do NOT set index, just keep as column +# break +# # Always reset index to avoid merge errors +# df = df.reset_index(drop=True) +# return df, prefix + + +# def merge_frames(frames: List[pd.DataFrame]) -> pd.DataFrame: +# # All frames have RangeIndex, so concat columns +# return pd.concat(frames, axis=1) + + +# def extend_features(df: pd.DataFrame) -> pd.DataFrame: +# """Add engineered features without touching original columns.""" +# numeric_cols = df.select_dtypes(include=[np.number]).columns +# pct_df = df[numeric_cols].pct_change(fill_method=None) +# pct_df.columns = [f"{c}_pct_change1" for c in pct_df.columns] +# return pd.concat([df, pct_df], axis=1) + + +# def feature_category(col: str) -> str: +# c = col.lower() +# if any(k in c for k in ("open", "close", "high", "low", "price", "volume")): +# return "Price / Volume" +# if any(k in c for k in ("pe", "cash", "debt", "income", "margin")): +# return "Fundamentals" +# if any(k in c for k in ("rsi", "macd", "ema", "sma", "bb", "stoch")): +# return "Technical" +# if any(k in c for k in ("news", "sentiment", "social")): +# return "Sentiment" +# return "Other" + + +# def build_report(df: pd.DataFrame) -> Dict: +# # Use merged datetime index for time range +# idx = df.index +# start = str(idx.min()) if not idx.empty else None +# end = str(idx.max()) if not idx.empty else None + +# report: Dict = { +# "timestamp": pd.Timestamp.utcnow().isoformat(), +# "dataset_info": { +# "shape": list(df.shape), +# "memory_usage_mb": round(df.memory_usage(deep=True).sum() / 1024**2, 3), +# "time_range": {"start": start, "end": end}, +# }, +# } + +# # Feature categories +# cats: Dict[str, List[str]] = {} +# for col in df.columns: +# cat = feature_category(col) +# cats.setdefault(cat, []).append(col) +# report["feature_categories"] = {c: {"count": len(v), "features": v[:10]} for c, v in cats.items()} + +# # Data quality +# missing = df.isna().sum().to_dict() +# completeness = 100 - (sum(missing.values()) / df.size * 100) +# report["data_quality"] = {"completeness": completeness, "missing_values_by_column": missing} + +# return report + + +# def save_outputs(df: pd.DataFrame, features_path: Path, report_path: Path): 
+# features_path.parent.mkdir(parents=True, exist_ok=True) +# report_path.parent.mkdir(parents=True, exist_ok=True) +# # Ensure all columns are unique by appending suffixes to duplicates +# cols = pd.Series(df.columns) +# for dup in cols[cols.duplicated()].unique(): +# dups = cols[cols == dup].index.tolist() +# for i, idx in enumerate(dups): +# if i == 0: +# continue +# cols[idx] = f"{dup}_{i}" +# df.columns = cols +# df.to_parquet(features_path) +# with report_path.open("w") as f: +# json.dump(build_report(df), f, indent=2) + + +# def main(): +# # Hardcoded paths for direct script execution +# data_dir = Path("data/finnhub") +# out_features = Path("data/finnhub/merged_features.parquet") +# out_report = Path("data/finnhub/feature_report.json") + +# frames = [] +# for fp in find_parquet_files(data_dir): +# df, _ = load_and_prefix(fp, data_dir) +# frames.append(df) + +# if not frames: +# raise RuntimeError("No Parquet files found in the specified data directory.") + +# merged = merge_frames(frames) +# merged = extend_features(merged) +# save_outputs(merged, out_features, out_report) +# print(f"OK: Features saved to {out_features}, report to {out_report}") + + +# if __name__ == "__main__": +# main() diff --git a/src/fetchers/finnhub/incomp/__init__.py b/src/fetchers/finnhub/incomp/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/fetchers/finnhub/incomp/alternative_data/__init__.py b/src/fetchers/finnhub/incomp/alternative_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53571122bcc88ff94431af500db337493749d2a5 --- /dev/null +++ b/src/fetchers/finnhub/incomp/alternative_data/__init__.py @@ -0,0 +1,17 @@ +""" +alternative_data/__init__.py – Package exports for the alternative_data sub-package. +""" + +from .esg_scores import get_esg_scores +from .insider_sentiment import get_insider_sentiment +from .insider_transactions import get_insider_transactions +from .lobbying import get_lobbying +from .social_sentiment import get_social_sentiment + +__all__ = [ + "get_esg_scores", + "get_insider_sentiment", + "get_insider_transactions", + "get_lobbying", + "get_social_sentiment", +] diff --git a/src/fetchers/finnhub/incomp/alternative_data/esg_scores.py b/src/fetchers/finnhub/incomp/alternative_data/esg_scores.py new file mode 100644 index 0000000000000000000000000000000000000000..d4b12a55e505aa16b2ed9a72f94eafd6594d65e1 --- /dev/null +++ b/src/fetchers/finnhub/incomp/alternative_data/esg_scores.py @@ -0,0 +1,25 @@ +""" +alternative_data/esg_scores.py – Retrieve ESG (Environmental, Social, and Governance) scores for a given ticker. +""" + +from typing import Dict +from ..client import FinnhubClient + +def get_esg_scores(client: FinnhubClient, symbol: str) -> Dict: + """ + Fetch ESG scores for the specified symbol. 
+ + :param client: An instance of FinnhubClient + :param symbol: Stock ticker (e.g., "AAPL") + :return: A dict with ESG metrics, for example: + { + 'symbol': 'AAPL', + 'year': 2024, + 'esgScore': 73.5, + 'environmentScore': 68.2, + 'socialScore': 75.1, + 'governanceScore': 78.9 + } + """ + params = {"symbol": symbol} + return client.get("stock/esg", params=params) diff --git a/src/fetchers/finnhub/incomp/alternative_data/insider_sentiment.py b/src/fetchers/finnhub/incomp/alternative_data/insider_sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..2d43e73da86bf869d7d298dd9038f41fc1f2f67e --- /dev/null +++ b/src/fetchers/finnhub/incomp/alternative_data/insider_sentiment.py @@ -0,0 +1,31 @@ +def get_insider_sentiment(client, symbol, from_date=None, to_date=None): + """ + Fetch insider sentiment using the provided FinnhubClient. + :param client: FinnhubClient instance + :param symbol: Stock symbol + :param from_date: Start date (YYYY-MM-DD) + :param to_date: End date (YYYY-MM-DD) + :return: Insider sentiment data + """ + params = {"symbol": symbol} + if from_date: + params["from"] = from_date + if to_date: + params["to"] = to_date + return client.get("stock/insider-sentiment", params=params) +""" +insider_sentiment.py – GET /stock/insider-sentiment +""" +from ..client import FinnhubClient + +class InsiderSentiment: + def __init__(self): + self.client = FinnhubClient() + + def get_insider_sentiment(self, symbol, from_date=None, to_date=None): + params = {"symbol": symbol} + if from_date: + params["from"] = from_date + if to_date: + params["to"] = to_date + return self.client.get("stock/insider-sentiment", params=params) diff --git a/src/fetchers/finnhub/incomp/alternative_data/insider_transactions.py b/src/fetchers/finnhub/incomp/alternative_data/insider_transactions.py new file mode 100644 index 0000000000000000000000000000000000000000..b1bf8b37f1c0c1c089c113e434cbe0727b53b899 --- /dev/null +++ b/src/fetchers/finnhub/incomp/alternative_data/insider_transactions.py @@ -0,0 +1,31 @@ +def get_insider_transactions(client, symbol, from_date=None, to_date=None): + """ + Fetch insider transactions using the provided FinnhubClient. + :param client: FinnhubClient instance + :param symbol: Stock symbol + :param from_date: Start date (YYYY-MM-DD) + :param to_date: End date (YYYY-MM-DD) + :return: Insider transactions data + """ + params = {"symbol": symbol} + if from_date: + params["from"] = from_date + if to_date: + params["to"] = to_date + return client.get("stock/insider-transactions", params=params) +""" +insider_transactions.py – GET /stock/insider-transactions +""" +from ..client import FinnhubClient + +class InsiderTransactions: + def __init__(self): + self.client = FinnhubClient() + + def get_insider_transactions(self, symbol, from_date=None, to_date=None): + params = {"symbol": symbol} + if from_date: + params["from"] = from_date + if to_date: + params["to"] = to_date + return self.client.get("stock/insider-transactions", params=params) diff --git a/src/fetchers/finnhub/incomp/alternative_data/lobbying.py b/src/fetchers/finnhub/incomp/alternative_data/lobbying.py new file mode 100644 index 0000000000000000000000000000000000000000..074e6da66a032f6a95573f716fffc016a5e8ad71 --- /dev/null +++ b/src/fetchers/finnhub/incomp/alternative_data/lobbying.py @@ -0,0 +1,35 @@ +""" +alternative_data/lobbying.py – Retrieve lobbying disclosure data for a given ticker. 
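+
+Illustrative usage (a sketch only; assumes a configured FinnhubClient instance
+from the parent package's client module):
+
+    get_lobbying(client, "AAPL", "2025-01-01", "2025-06-30")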
+""" + +from typing import Dict +from ..client import FinnhubClient + +def get_lobbying(client: FinnhubClient, symbol: str, start_date: str, end_date: str) -> Dict: + """ + Fetch registered lobbying activities for the specified symbol between start_date and end_date. + + :param client: An instance of FinnhubClient + :param symbol: Stock ticker or company symbol (e.g., "AAPL") + :param start_date: Start date in YYYY-MM-DD format + :param end_date: End date in YYYY-MM-DD format + :return: A dict containing lobbying records, typically: + { + 'symbol': 'AAPL', + 'data': [ + { + 'disclosureDate': '2025-05-15', + 'client': 'Big Tech Lobbyists LLC', + 'amount': 250000, + 'subject': 'Regulatory Affairs' + }, + ... + ] + } + """ + params = { + "symbol": symbol, + "from": start_date, + "to": end_date, + } + return client.get("stock/lobbying", params=params) diff --git a/src/fetchers/finnhub/incomp/alternative_data/social_sentiment.py b/src/fetchers/finnhub/incomp/alternative_data/social_sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..ea1141143ffb8412f9f53ba9f04a38cf0753e62d --- /dev/null +++ b/src/fetchers/finnhub/incomp/alternative_data/social_sentiment.py @@ -0,0 +1,40 @@ +""" +alternative_data/social_sentiment.py – Retrieve social media sentiment metrics for a given ticker. +""" + +from typing import Dict +from ..client import FinnhubClient + +def get_social_sentiment( + client: FinnhubClient, + symbol: str, + start_date: str, + end_date: str, + source: str = "reddit" +) -> Dict: + """ + Fetch social sentiment data for the specified symbol between start_date and end_date. + + :param client: An instance of FinnhubClient + :param symbol: Stock ticker (e.g., "AAPL") + :param start_date: Start date in YYYY-MM-DD format + :param end_date: End date in YYYY-MM-DD format + :param source: Sentiment source, either "reddit" or "twitter" (default: "reddit") + :return: A dict containing the social sentiment data, typically: + { + "symbol": "AAPL", + "from": "2025-06-01", + "to": "2025-06-07", + "data": [ + {"date": "2025-06-01", "mention": 123, "sentiment": 0.45}, + ... + ] + } + """ + params = { + "symbol": symbol, + "from": start_date, + "to": end_date, + "source": source + } + return client.get("stock/social-sentiment", params=params) diff --git a/src/fetchers/finnhub/incomp/client.py b/src/fetchers/finnhub/incomp/client.py new file mode 100644 index 0000000000000000000000000000000000000000..3ef6d022ccf3b6e36822da8db22ed2a8866f9846 --- /dev/null +++ b/src/fetchers/finnhub/incomp/client.py @@ -0,0 +1,42 @@ +# """ +# client.py – Manages base URL, API key, and rate-limiting for Finnhub API (60 calls/min). +# """ + +# import os +# from dotenv import load_dotenv +# import requests +# from .utils import rate_limited + +# load_dotenv() + +# # Load your API key and optional base URL from environment +# API_KEY = os.getenv("FINHUB_API_KEY") +# BASE_URL = os.getenv("FINHUB_BASE_URL", "https://finnhub.io/api/v1/") + +# # Default headers for every request +# HEADERS = { +# "X-Finnhub-Token": API_KEY, +# "Content-Type": "application/json" +# } + +# class FinnhubClient: +# def __init__(self): +# """ +# Initialize a session with the default headers. +# """ +# self.session = requests.Session() +# self.session.headers.update(HEADERS) + +# @rate_limited() +# def get(self, endpoint: str, params: dict = None) -> dict: +# """ +# Perform a GET request to the given Finnhub endpoint, respecting rate limits. + +# :param endpoint: API path (e.g. 
"quote", "stock/candle") +# :param params: Query parameters as a dict +# :return: Parsed JSON response as a dict +# """ +# url = BASE_URL.rstrip("/") + "/" + endpoint.lstrip("/") +# response = self.session.get(url, params=params) +# response.raise_for_status() +# return response.json() diff --git a/src/fetchers/finnhub/incomp/economic_data/__init__.py b/src/fetchers/finnhub/incomp/economic_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a249e5e02de8322aea2a50a241088a29f64e13fe --- /dev/null +++ b/src/fetchers/finnhub/incomp/economic_data/__init__.py @@ -0,0 +1,11 @@ +""" +economic_data/__init__.py – Package exports for the economic_data sub-package. +""" + +from .calendar import get_economic_calendar +from .indicators import get_economic_indicators + +__all__ = [ + "get_economic_calendar", + "get_economic_indicators", +] diff --git a/src/fetchers/finnhub/incomp/economic_data/calendar.py b/src/fetchers/finnhub/incomp/economic_data/calendar.py new file mode 100644 index 0000000000000000000000000000000000000000..3d706f368ca3f6f483e900ded89194e3236e5192 --- /dev/null +++ b/src/fetchers/finnhub/incomp/economic_data/calendar.py @@ -0,0 +1,33 @@ +def get_economic_calendar(client, start_date=None, end_date=None, country=None): + """ + Fetch economic calendar events using the provided FinnhubClient. + :param client: FinnhubClient instance + :param start_date: Start date (YYYY-MM-DD) + :param end_date: End date (YYYY-MM-DD) + :param country: Country code (optional) + :return: Economic calendar events + """ + params = {} + if start_date: + params["from"] = start_date + if end_date: + params["to"] = end_date + if country: + params["country"] = country + return client.get("economic/calendar", params=params) +""" +calendar.py – GET /economic/calendar +""" +from ..client import FinnhubClient + +class EconomicCalendar: + def __init__(self): + self.client = FinnhubClient() + + def get_calendar(self, _from=None, to=None): + params = {} + if _from: + params["from"] = _from + if to: + params["to"] = to + return self.client.get("economic/calendar", params=params) diff --git a/src/fetchers/finnhub/incomp/economic_data/indicators.py b/src/fetchers/finnhub/incomp/economic_data/indicators.py new file mode 100644 index 0000000000000000000000000000000000000000..df09dcfc508673eb8fb11000acd3755d77d0824a --- /dev/null +++ b/src/fetchers/finnhub/incomp/economic_data/indicators.py @@ -0,0 +1,37 @@ +""" +economic_data/indicators.py – Fetches time series data for specified macroeconomic indicators via the Finnhub API. +""" + +from typing import Dict, Any +from ..client import FinnhubClient + +def get_economic_indicators( + client: FinnhubClient, + indicator: str, + start_date: str, + end_date: str +) -> Dict[str, Any]: + """ + Retrieve macroeconomic indicator data between start_date and end_date. + + :param client: An instance of FinnhubClient + :param indicator: The code of the macro indicator (e.g., 'US_GDP', 'CPI', 'UnemploymentRate') + :param start_date: Start date in YYYY-MM-DD format + :param end_date: End date in YYYY-MM-DD format + :return: A dict containing the time series for the requested indicator, typically: + { + 'indicator': 'US_GDP', + 'from': '2020-01-01', + 'to': '2025-06-30', + 'data': [ + {'date': '2020-01-01', 'value': 21000.0}, + ... 
+ ] + } + """ + params = { + "indicator": indicator, + "from": start_date, + "to": end_date + } + return client.get("economic/indicator", params=params) diff --git a/src/fetchers/finnhub/incomp/fundamentals/__init__.py b/src/fetchers/finnhub/incomp/fundamentals/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f54e7b6c4c5e170e1d0e3712312310005eab42ed --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/__init__.py @@ -0,0 +1,15 @@ +""" +fundamentals/__init__.py – Package exports for the fundamentals sub-package. +""" + +from .basic_financials import BasicFinancials +from .corporate_actions import CorporateActions +from .earnings import Earnings +from .financials_reported import FinancialsReported + +__all__ = [ + "BasicFinancials", + "CorporateActions", + "Earnings", + "FinancialsReported", +] diff --git a/src/fetchers/finnhub/incomp/fundamentals/basic_financials.py b/src/fetchers/finnhub/incomp/fundamentals/basic_financials.py new file mode 100644 index 0000000000000000000000000000000000000000..7bcd78c5bc8fe5ecba1320b24bb543652453c3e8 --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/basic_financials.py @@ -0,0 +1,23 @@ +""" +basic_financials.py – GET /company-basic-financials +""" + +from typing import Optional, Dict, Any +from ..client import FinnhubClient + +class BasicFinancials: + def __init__(self): + self.client = FinnhubClient() + + def get_basic_financials(self, symbol: str, metric: Optional[str] = None) -> Dict[str, Any]: + """ + Fetch basic financial metrics for the specified symbol. + + :param symbol: Stock ticker (e.g., "AAPL") + :param metric: Specific metric to fetch (e.g., 'pe', 'eps'); if None, returns all available metrics + :return: A dict containing the requested financial metrics. + """ + params = {"symbol": symbol} + if metric: + params["metric"] = metric + return self.client.get("stock/metric", params=params) diff --git a/src/fetchers/finnhub/incomp/fundamentals/corporate_actions.py b/src/fetchers/finnhub/incomp/fundamentals/corporate_actions.py new file mode 100644 index 0000000000000000000000000000000000000000..1852be8c0a952ca653b5f460332039e1975aeef5 --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/corporate_actions.py @@ -0,0 +1,89 @@ +""" +fundamentals/corporate_actions.py – Retrieve corporate actions: dividends, splits, and price targets. +""" + +from typing import Optional, Dict, Any +from ..client import FinnhubClient + +class CorporateActions: + def __init__(self): + """ + Initialize CorporateActions with a FinnhubClient instance. + """ + self.client = FinnhubClient() + + def get_dividends( + self, + symbol: str, + start_date: Optional[str] = None, + end_date: Optional[str] = None + ) -> Dict[str, Any]: + """ + Fetch dividend history for the specified symbol between start_date and end_date. + + :param symbol: Stock ticker (e.g., "AAPL") + :param start_date: Start date in YYYY-MM-DD format + :param end_date: End date in YYYY-MM-DD format + :return: A dict containing dividend data, for example: + { + 'symbol': 'AAPL', + 'data': [ + {'paymentDate': '2025-06-01', 'amount': 0.23, 'recordDate': '2025-05-20'}, + ... 
+ ] + } + """ + params = {"symbol": symbol} + if start_date: + params["from"] = start_date + if end_date: + params["to"] = end_date + return self.client.get("stock/dividend", params=params) + + def get_splits( + self, + symbol: str, + start_date: Optional[str] = None, + end_date: Optional[str] = None + ) -> Dict[str, Any]: + """ + Fetch stock split history for the specified symbol between start_date and end_date. + + :param symbol: Stock ticker (e.g., "AAPL") + :param start_date: Start date in YYYY-MM-DD format + :param end_date: End date in YYYY-MM-DD format + :return: A dict containing split data, for example: + { + 'symbol': 'AAPL', + 'data': [ + {'date': '2020-08-31', 'splitRatio': '4:1'}, + ... + ] + } + """ + params = {"symbol": symbol} + if start_date: + params["from"] = start_date + if end_date: + params["to"] = end_date + return self.client.get("stock/split", params=params) + + def get_price_target(self, symbol: str) -> Dict[str, Any]: + """ + Fetch the price target data for the specified symbol. + + :param symbol: Stock ticker (e.g., "AAPL") + :return: A dict containing price target information, for example: + { + 'symbol': 'AAPL', + 'buy': 165.0, + 'hold': 155.0, + 'sell': 145.0, + 'average': 155.0, + 'high': 170.0, + 'low': 140.0, + 'lastUpdated': '2025-06-15' + } + """ + params = {"symbol": symbol} + return self.client.get("stock/price-target", params=params) diff --git a/src/fetchers/finnhub/incomp/fundamentals/earnings.py b/src/fetchers/finnhub/incomp/fundamentals/earnings.py new file mode 100644 index 0000000000000000000000000000000000000000..8345d111da2c9d3827af1062e65d36e336e130c2 --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/earnings.py @@ -0,0 +1,38 @@ +""" +fundamentals/earnings.py – Fetch historical earnings data for a given ticker. +""" + +from typing import Dict, Any, List, Optional +from ..client import FinnhubClient + +class Earnings: + def __init__(self): + self.client = FinnhubClient() + + def get_earnings( + self, + symbol: str, + start_date: Optional[str] = None, + end_date: Optional[str] = None + ) -> List[Dict[str, Any]]: + """ + Retrieve earnings data for the specified symbol. + + :param symbol: Stock ticker (e.g., "AAPL") + :param start_date: (optional) Start date in YYYY-MM-DD format + :param end_date: (optional) End date in YYYY-MM-DD format + :return: A list of earnings records, each dict typically containing: + { + 'period': '2025-06-30', + 'actual': 1.30, + 'estimate': 1.25, + 'surprise': 0.05, + 'surprisePercent': 4.0 + } + """ + params = {"symbol": symbol} + if start_date: + params["from"] = start_date + if end_date: + params["to"] = end_date + return self.client.get("stock/earnings", params=params) diff --git a/src/fetchers/finnhub/incomp/fundamentals/financials_reported.py b/src/fetchers/finnhub/incomp/fundamentals/financials_reported.py new file mode 100644 index 0000000000000000000000000000000000000000..fa0e573840f1b2c6fc1cf75cfef43047e6ec4c10 --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/financials_reported.py @@ -0,0 +1,46 @@ +""" +fundamentals/financials_reported.py – Retrieve reported financial statements for a given ticker. +""" + +from typing import Dict, Any +from ..client import FinnhubClient + +class FinancialsReported: + def __init__(self): + """ + Initialize FinancialsReported with a FinnhubClient instance. 
+ """ + self.client = FinnhubClient() + + def get_financials_reported( + self, + symbol: str, + freq: str = "annual" + ) -> Dict[str, Any]: + """ + Fetch reported financial statements for the specified symbol. + + :param symbol: Stock ticker (e.g., "AAPL") + :param freq: Frequency of reports: "annual" or "quarter" + :return: A dict containing financial statements, typically: + { + 'symbol': 'AAPL', + 'metric': 'ic', # e.g., 'ic' for income statement, 'bs' for balance sheet, 'cf' for cash flow + 'report': [ + { + 'reportDate': '2025-03-31', + 'ic': { + 'revenue': 89000.0, + 'grossProfit': 38000.0, + ... + } + }, + ... + ] + } + """ + params = { + "symbol": symbol, + "freq": freq + } + return self.client.get("stock/financials-reported", params=params) diff --git a/src/fetchers/finnhub/incomp/fundamentals/key_metrics.py b/src/fetchers/finnhub/incomp/fundamentals/key_metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..8b75a294eefbf76d41dfaab483ac378073b93abe --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/key_metrics.py @@ -0,0 +1,11 @@ +""" +key_metrics.py – GET /stock/metric +""" +from ..client import FinnhubClient + +class KeyMetrics: + def __init__(self): + self.client = FinnhubClient() + + def get_key_metrics(self, symbol): + return self.client.get("stock/metric", params={"symbol": symbol}) diff --git a/src/fetchers/finnhub/incomp/fundamentals/ownership.py b/src/fetchers/finnhub/incomp/fundamentals/ownership.py new file mode 100644 index 0000000000000000000000000000000000000000..a44ec196b1c13bedd12adc0bacab7d22918d1d4b --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/ownership.py @@ -0,0 +1,11 @@ +""" +ownership.py – GET /stock/institutional-ownership +""" +from ..client import FinnhubClient + +class Ownership: + def __init__(self): + self.client = FinnhubClient() + + def get_ownership(self, symbol): + return self.client.get("stock/institutional-ownership", params={"symbol": symbol}) diff --git a/src/fetchers/finnhub/incomp/fundamentals/peers.py b/src/fetchers/finnhub/incomp/fundamentals/peers.py new file mode 100644 index 0000000000000000000000000000000000000000..337f459f93df0d816295bfdf1f89f691b26c78d3 --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/peers.py @@ -0,0 +1,11 @@ +""" +peers.py – GET /stock/peers +""" +from ..client import FinnhubClient + +class Peers: + def __init__(self): + self.client = FinnhubClient() + + def get_peers(self, symbol): + return self.client.get("stock/peers", params={"symbol": symbol}) diff --git a/src/fetchers/finnhub/incomp/fundamentals/profile.py b/src/fetchers/finnhub/incomp/fundamentals/profile.py new file mode 100644 index 0000000000000000000000000000000000000000..a0bea41d254a6f356217c92e5bcc7ea12812972b --- /dev/null +++ b/src/fetchers/finnhub/incomp/fundamentals/profile.py @@ -0,0 +1,11 @@ +""" +profile.py – GET /stock/profile2 +""" +from ..client import FinnhubClient + +class Profile: + def __init__(self): + self.client = FinnhubClient() + + def get_profile(self, symbol): + return self.client.get("stock/profile2", params={"symbol": symbol}) diff --git a/src/fetchers/finnhub/incomp/market_data/__init__.py b/src/fetchers/finnhub/incomp/market_data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..502d1b6e79a517034e8fde1e4754d1cb7a73e418 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/__init__.py @@ -0,0 +1,15 @@ +""" +market_data/__init__.py – Package exports for the market_data sub-package. 
+""" + +from .symbol_listings import get_symbol_list +from .stock_candle import get_stock_candles +from .crypto_candle import get_crypto_candles +from .forex_candle import get_forex_candles + +__all__ = [ + "get_symbol_list", + "get_stock_candles", + "get_crypto_candles", + "get_forex_candles", +] diff --git a/src/fetchers/finnhub/incomp/market_data/crypto_candle.py b/src/fetchers/finnhub/incomp/market_data/crypto_candle.py new file mode 100644 index 0000000000000000000000000000000000000000..b29e2be493ad8620ffc06d2af6650740410f3ce0 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/crypto_candle.py @@ -0,0 +1,40 @@ +""" +market_data/crypto_candle.py – Retrieve historical OHLC (open, high, low, close) and volume data for cryptocurrencies. +""" + +from typing import Dict, Any +from ..client import FinnhubClient + +def get_crypto_candles( + client: FinnhubClient, + symbol: str, + resolution: str, + start_timestamp: int, + end_timestamp: int +) -> Dict[str, Any]: + """ + Fetch candlestick (OHLC + volume) data for a given crypto symbol over a time range. + + :param client: An instance of FinnhubClient + :param symbol: Crypto symbol (e.g., "BINANCE:BTCUSDT") + :param resolution: Time resolution. Supported values: "1", "5", "15", "30", "60", "D", "W", "M" + :param start_timestamp: UNIX timestamp (in seconds) for the start of the range + :param end_timestamp: UNIX timestamp (in seconds) for the end of the range + :return: A dict with keys: + { + 'c': [close prices], + 'h': [high prices], + 'l': [low prices], + 'o': [open prices], + 'v': [volumes], + 't': [timestamps], + 's': 'ok' + } + """ + params = { + "symbol": symbol, + "resolution": resolution, + "from": start_timestamp, + "to": end_timestamp + } + return client.get("crypto/candle", params=params) diff --git a/src/fetchers/finnhub/incomp/market_data/crypto_trades.py b/src/fetchers/finnhub/incomp/market_data/crypto_trades.py new file mode 100644 index 0000000000000000000000000000000000000000..f56bea05d41cf4712edc3401e85d9eafdccd48aa --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/crypto_trades.py @@ -0,0 +1,11 @@ +""" +crypto_trades.py – GET /crypto/trades +""" +from ..client import FinnhubClient + +class CryptoTrades: + def __init__(self): + self.client = FinnhubClient() + + def get_crypto_trades(self, symbol): + return self.client.get("crypto/trades", params={"symbol": symbol}) diff --git a/src/fetchers/finnhub/incomp/market_data/exchange_listings.py b/src/fetchers/finnhub/incomp/market_data/exchange_listings.py new file mode 100644 index 0000000000000000000000000000000000000000..bbf9522adbb285cd71aeef1f5f6dc7ddfb314ea1 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/exchange_listings.py @@ -0,0 +1,14 @@ +""" +exchange_listings.py – GET /stock/exchange, GET /crypto/exchange +""" +from ..client import FinnhubClient + +class ExchangeListings: + def __init__(self): + self.client = FinnhubClient() + + def get_stock_exchanges(self): + return self.client.get("stock/exchange") + + def get_crypto_exchanges(self): + return self.client.get("crypto/exchange") diff --git a/src/fetchers/finnhub/incomp/market_data/forex_candle.py b/src/fetchers/finnhub/incomp/market_data/forex_candle.py new file mode 100644 index 0000000000000000000000000000000000000000..bccbb83d19b9b139f8659f298e7a7691db3b4635 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/forex_candle.py @@ -0,0 +1,40 @@ +""" +market_data/forex_candle.py – Retrieve historical OHLC (open, high, low, close) and volume data for forex pairs. 
+""" + +from typing import Dict, Any +from ..client import FinnhubClient + +def get_forex_candles( + client: FinnhubClient, + symbol: str, + resolution: str, + start_timestamp: int, + end_timestamp: int +) -> Dict[str, Any]: + """ + Fetch candlestick (OHLC + volume) data for a given forex symbol over a time range. + + :param client: An instance of FinnhubClient + :param symbol: Forex pair symbol (e.g., "OANDA:EUR_USD") + :param resolution: Time resolution. Supported values: "1", "5", "15", "30", "60", "D", "W", "M" + :param start_timestamp: UNIX timestamp (in seconds) for the start of the range + :param end_timestamp: UNIX timestamp (in seconds) for the end of the range + :return: A dict with keys: + { + 'c': [close prices], + 'h': [high prices], + 'l': [low prices], + 'o': [open prices], + 'v': [volumes], + 't': [timestamps], + 's': 'ok' + } + """ + params = { + "symbol": symbol, + "resolution": resolution, + "from": start_timestamp, + "to": end_timestamp + } + return client.get("forex/candle", params=params) diff --git a/src/fetchers/finnhub/incomp/market_data/forex_rates.py b/src/fetchers/finnhub/incomp/market_data/forex_rates.py new file mode 100644 index 0000000000000000000000000000000000000000..0334c2bd832eb1d6ebbb923185e77c4d8a9d3db3 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/forex_rates.py @@ -0,0 +1,11 @@ +""" +forex_rates.py – GET /forex/rates +""" +from ..client import FinnhubClient + +class ForexRates: + def __init__(self): + self.client = FinnhubClient() + + def get_forex_rates(self): + return self.client.get("forex/rates") diff --git a/src/fetchers/finnhub/incomp/market_data/quote.py b/src/fetchers/finnhub/incomp/market_data/quote.py new file mode 100644 index 0000000000000000000000000000000000000000..77dc29c46c44cd9fe00505965f0d2263d59e45d8 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/quote.py @@ -0,0 +1,11 @@ +""" +quote.py – GET /quote +""" +from ..client import FinnhubClient + +class Quote: + def __init__(self): + self.client = FinnhubClient() + + def get_quote(self, symbol): + return self.client.get("quote", params={"symbol": symbol}) diff --git a/src/fetchers/finnhub/incomp/market_data/stock_candle.py b/src/fetchers/finnhub/incomp/market_data/stock_candle.py new file mode 100644 index 0000000000000000000000000000000000000000..124da28b0e657ae97798c93d59b516d9fe236cc2 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/stock_candle.py @@ -0,0 +1,40 @@ +""" +market_data/stock_candle.py – Retrieve historical OHLC (open, high, low, close) and volume data for stocks. +""" + +from typing import Dict, Any, Optional +from ..client import FinnhubClient + +def get_stock_candles( + client: FinnhubClient, + symbol: str, + resolution: str, + start_timestamp: int, + end_timestamp: int +) -> Dict[str, Any]: + """ + Fetch candlestick (OHLC + volume) data for a given stock symbol over a time range. + + :param client: An instance of FinnhubClient + :param symbol: Stock ticker (e.g., "AAPL") + :param resolution: Time resolution. 
Supported values: "1", "5", "15", "30", "60", "D", "W", "M" + :param start_timestamp: UNIX timestamp (in seconds) for the start of the range + :param end_timestamp: UNIX timestamp (in seconds) for the end of the range + :return: A dict with keys: + { + 'c': [close prices], + 'h': [high prices], + 'l': [low prices], + 'o': [open prices], + 'v': [volumes], + 't': [timestamps], + 's': 'ok' + } + """ + params = { + "symbol": symbol, + "resolution": resolution, + "from": start_timestamp, + "to": end_timestamp + } + return client.get("stock/candle", params=params) diff --git a/src/fetchers/finnhub/incomp/market_data/symbol_listings.py b/src/fetchers/finnhub/incomp/market_data/symbol_listings.py new file mode 100644 index 0000000000000000000000000000000000000000..4f00ddbf874c13ffc366918ce594c99ad444a2e8 --- /dev/null +++ b/src/fetchers/finnhub/incomp/market_data/symbol_listings.py @@ -0,0 +1,19 @@ +""" +market_data/symbol_listings.py – Fetches the list of tradable symbols from Finnhub. +""" + +from typing import List, Dict +from ..client import FinnhubClient + +def get_symbol_list(client: FinnhubClient, exchange: str = "US") -> List[Dict]: + """ + Retrieve all symbols available on a given exchange. + + :param client: An instance of FinnhubClient + :param exchange: Exchange code (e.g., "US", "NASDAQ", "BINANCE") + :return: A list of symbol metadata dicts, each containing keys like + 'symbol', 'description', 'type', etc. + """ + params = {"exchange": exchange} + symbols = client.get("stock/symbol", params=params) + return symbols diff --git a/src/fetchers/finnhub/incomp/news/__init__.py b/src/fetchers/finnhub/incomp/news/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3fdf81efef21177694a6ff3d4c0b833fff0de84e --- /dev/null +++ b/src/fetchers/finnhub/incomp/news/__init__.py @@ -0,0 +1,11 @@ +""" +news/__init__.py – Package exports for the news sub-package. +""" + +from .company_news import CompanyNews +from .general_news import GeneralNews + +__all__ = [ + "CompanyNews", + "GeneralNews", +] diff --git a/src/fetchers/finnhub/incomp/news/company_news.py b/src/fetchers/finnhub/incomp/news/company_news.py new file mode 100644 index 0000000000000000000000000000000000000000..11a5719ddf1f7292be87588591d000b8ed2a36be --- /dev/null +++ b/src/fetchers/finnhub/incomp/news/company_news.py @@ -0,0 +1,45 @@ +""" +news/company_news.py – Fetch company-specific news articles from Finnhub. +""" + +from typing import List, Dict, Optional +from ..client import FinnhubClient + +class CompanyNews: + def __init__(self): + """ + Initialize CompanyNews with a FinnhubClient instance. + """ + self.client = FinnhubClient() + + def get_company_news( + self, + symbol: str, + start_date: Optional[str] = None, + end_date: Optional[str] = None + ) -> List[Dict]: + """ + Retrieve news articles for a specific company. + + :param symbol: Stock ticker (e.g., "AAPL") + :param start_date: Filter news from this date (YYYY-MM-DD) + :param end_date: Filter news up to this date (YYYY-MM-DD) + :return: A list of news article dicts, each containing keys such as: + { + 'category': 'general', + 'datetime': 1623777600, + 'headline': 'Apple releases new product...', + 'id': 12345, + 'image': 'https://...', + 'related': 'AAPL', + 'source': 'CNBC', + 'summary': 'Apple announced...', + 'url': 'https://...' 
+ } + """ + params: Dict[str, str] = {"symbol": symbol} + if start_date: + params["from"] = start_date + if end_date: + params["to"] = end_date + return self.client.get("company-news", params=params) diff --git a/src/fetchers/finnhub/incomp/news/general_news.py b/src/fetchers/finnhub/incomp/news/general_news.py new file mode 100644 index 0000000000000000000000000000000000000000..d911db49cd8a03170a7e09bbfac93382938f74f7 --- /dev/null +++ b/src/fetchers/finnhub/incomp/news/general_news.py @@ -0,0 +1,44 @@ +""" +news/general_news.py – Fetch general market news articles from Finnhub. +""" + +from typing import List, Dict, Optional +from ..client import FinnhubClient + +class GeneralNews: + def __init__(self): + """ + Initialize GeneralNews with a FinnhubClient instance. + """ + self.client = FinnhubClient() + + def get_general_news( + self, + category: Optional[str] = None, + min_id: Optional[int] = None + ) -> List[Dict]: + """ + Retrieve general news articles. + + :param category: (optional) News category filter, one of: + 'general', 'forex', 'crypto', 'merger', 'sentiment', + 'ipo', 'private equity', 'public offerings', etc. + :param min_id: (optional) Return articles with ID greater than this value + :return: A list of news article dicts, each containing keys such as: + { + 'category': 'general', + 'datetime': 1623777600, + 'headline': 'Market rallies on positive earnings...', + 'id': 12345, + 'image': 'https://...', + 'source': 'Reuters', + 'summary': 'Stocks rallied today...', + 'url': 'https://...' + } + """ + params: Dict[str, str] = {} + if category: + params["category"] = category + if min_id is not None: + params["minId"] = str(min_id) + return self.client.get("news", params=params) diff --git a/src/fetchers/finnhub/incomp/utils.py b/src/fetchers/finnhub/incomp/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ea194e45f20f066ed71e978334775594e79f3a6a --- /dev/null +++ b/src/fetchers/finnhub/incomp/utils.py @@ -0,0 +1,30 @@ +""" +utils.py – Utility functions, including rate limiting decorator. +""" + +import threading +import time +from functools import wraps + +def rate_limited(max_per_minute: int = 60): + """ + Decorator that limits a function to at most `max_per_minute` calls per minute. + Uses a simple token bucket algorithm. 
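# Illustrative sketch (editor annotation, not patch content): applying the decorator
# defined here to a fetch helper. Despite the docstring's "token bucket" wording, the
# implementation simply enforces a minimum gap of 60 / max_per_minute seconds between
# calls (serialized by the lock), which is usually what free-tier API limits need.
@rate_limited(max_per_minute=30)
def fetch_quote(client, symbol):
    # Callers arriving too quickly sleep inside the wrapper until their slot opens.
    return client.get("quote", params={"symbol": symbol})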
+ """ + interval = 60.0 / max_per_minute + lock = threading.Lock() + last_time = {"t": 0.0} + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + with lock: + elapsed = time.time() - last_time["t"] + wait = interval - elapsed + if wait > 0: + time.sleep(wait) + result = func(*args, **kwargs) + last_time["t"] = time.time() + return result + return wrapper + return decorator diff --git a/src/fetchers/finnhub/main.py b/src/fetchers/finnhub/main.py new file mode 100644 index 0000000000000000000000000000000000000000..c9766a5cf9816798947e678cf7ce1e744464c098 --- /dev/null +++ b/src/fetchers/finnhub/main.py @@ -0,0 +1,476 @@ +import finnhub +import pandas as pd +import pyarrow as pa +import pyarrow.parquet as pq +import os +import json +import time +from datetime import datetime, timedelta +from dotenv import load_dotenv +from typing import List, Dict, Any, Optional +# import logging + +# Load environment variables +load_dotenv() + +class FinnhubDataFetcher: + + def __init__(self): + # Parse multiple API keys from environment variable + api_keys_string = os.getenv('FINNHUB_API_KEY') + if not api_keys_string: + raise ValueError("FINNHUB_API_KEY not found in environment variables") + + # Support both comma-separated and single API key formats + self.api_keys = [key.strip() for key in api_keys_string.split(',') if key.strip()] + if not self.api_keys: + raise ValueError("No valid API keys found in FINNHUB_API_KEY") + + print(f"[INFO] Loaded {len(self.api_keys)} Finnhub API key(s)") + + self.current_key_index = 0 + self.rate_limit_switches = 0 + + self.stock_symbols = os.getenv('STOCK_SYMBOLS', 'AAPL,NVDA,TSLA,GOOGL,MSFT,AMZN').split(',') + self.stock_symbols = [symbol.strip() for symbol in self.stock_symbols] + + self.crypto_symbols = os.getenv('CRYPTO_SYMBOLS', 'BTC,ETH,ADA,DOT,LINK,UNI,AVAX,MATIC').split(',') + self.crypto_symbols = [symbol.strip() for symbol in self.crypto_symbols] + + self.crypto_pairs = os.getenv('CRYPTO_PAIRS', 'BINANCE:BTCUSDT,BINANCE:ETHUSDT,BINANCE:ADAUSDT,COINBASE:BTC-USD,COINBASE:ETH-USD').split(',') + self.crypto_pairs = [pair.strip() for pair in self.crypto_pairs] + + self.forex_pairs = os.getenv('FOREX_PAIRS', 'OANDA:EUR_USD,OANDA:GBP_USD,OANDA:USD_JPY,OANDA:AUD_USD,OANDA:USD_CAD').split(',') + self.forex_pairs = [pair.strip() for pair in self.forex_pairs] + + # Initialize client with first API key + from finnhub import Client + self.client = self._create_client() + self.base_dir = "data/finnhub" + + # Create base directory structure + self.create_directory_structure() + + # Date ranges for historical data + self.end_date = int(datetime.now().timestamp()) + self.start_date = int((datetime.now() - timedelta(days=365)).timestamp()) + self.date_str_from = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d') + self.date_str_to = datetime.now().strftime('%Y-%m-%d') + + def _create_client(self): + """Create a Finnhub client with the current API key""" + from finnhub import Client + current_key = self.api_keys[self.current_key_index] + print(f"[INFO] Using API key #{self.current_key_index + 1}: {current_key[:8]}...") + return Client(api_key=current_key) + + def _switch_api_key(self): + """Switch to the next available API key""" + if len(self.api_keys) == 1: + print("[WARNING] Only one API key available, cannot switch") + return False + + old_index = self.current_key_index + self.current_key_index = (self.current_key_index + 1) % len(self.api_keys) + self.rate_limit_switches += 1 + + print(f"[SWITCH] Switching from API key #{old_index + 1} to 
#{self.current_key_index + 1} (switch #{self.rate_limit_switches})") + + # Create new client with the new API key + self.client = self._create_client() + + # Add a longer delay after switching keys + time.sleep(2.0) + return True + + def _is_rate_limit_error(self, error_message): + """Check if the error indicates a rate limit issue""" + rate_limit_indicators = [ + "429", + "rate limit", + "too many requests", + "api limit", + "quota exceeded", + "limit exceeded" + ] + error_str = str(error_message).lower() + return any(indicator in error_str for indicator in rate_limit_indicators) + + + def create_directory_structure(self): + """Create directory structure for organized data storage""" + subdirs = [ + 'stock_data', 'company_info', 'financials', 'earnings', 'news', + 'crypto', 'forex', 'market_data', + 'ownership', 'ratings', 'regulatory' + ] + for subdir in subdirs: + dir_path = os.path.join(self.base_dir, subdir) + os.makedirs(dir_path, exist_ok=True) + + def save_data(self, data: Any, filename: str, subdir: str, symbol: str = None): + """ + Robust Parquet writer: + • DataFrame → direct to_parquet + • dict → one-row DataFrame → parquet + • list-of-dicts → DataFrame → parquet + • other lists → single-column table + • other objects → single-row, single-column table + """ + if symbol: + base = os.path.join(self.base_dir, subdir, f"{symbol}_{filename}") + else: + base = os.path.join(self.base_dir, subdir, filename) + path = f"{base}.parquet" + + try: + if isinstance(data, pd.DataFrame): + data = data.replace({'N/A': None, '-': None}) + for col in data.columns: + if data[col].dtype == object: + data[col] = pd.to_numeric(data[col], errors='coerce') + data.to_parquet(path, index=False, engine="pyarrow") + return + + # ———————————————— + # Special-case: any dict whose “data” is a list-of-lists → tabular form + if isinstance(data, dict) and 'data' in data and isinstance(data['data'], list): + cols = None + # Finnhub sometimes calls labels 'metricType' or 'metric' + if 'metricType' in data: + cols = data['metricType'] + elif 'metric' in data: + cols = data['metric'] + # only proceed if we have column names + if cols and all(isinstance(row, (list, tuple)) for row in data['data']): + df = pd.DataFrame(data['data'], columns=cols) + # sanitize placeholders + df = df.replace({'N/A': None, '-': None}) + # coerce object cols to numeric where possible + for c in df.columns: + if df[c].dtype == object: + df[c] = pd.to_numeric(df[c], errors='coerce') + df.to_parquet(path, index=False, engine="pyarrow") + return + # ———————————————— + + if isinstance(data, dict): + df = pd.DataFrame([{k: (None if v == "N/A" else v) for k, v in data.items()}]) + df.to_parquet(path, index=False, engine="pyarrow") + return + + if isinstance(data, list): + if all(isinstance(item, dict) for item in data): + df = pd.DataFrame([{k: (None if v == "N/A" else v) for k, v in item.items()} for item in data]) + df.to_parquet(path, index=False, engine="pyarrow") + else: + tbl = pa.Table.from_pydict({"value": [None if v == "N/A" else v for v in data]}) + pq.write_table(tbl, path) + return + + tbl = pa.Table.from_pydict({"value": [str(data) if data != "N/A" else None]}) + pq.write_table(tbl, path) + + except Exception as e: + print(f"Error saving {filename}: {e}") + + def rate_limit_delay(self, delay: float = 0.1): + """Add delay to respect rate limits""" + time.sleep(delay) + + def safe_api_call(self, func, *args, **kwargs): + """Safely call Finnhub API with retries and rate limit handling""" + max_retries = 3 + base_delay = 1.2 + keys_tried 
= set() + + for attempt in range(max_retries): + try: + # If we've tried all keys, wait longer and reset + if len(keys_tried) >= len(self.api_keys): + print(f"[WARNING] All {len(self.api_keys)} API keys exhausted, waiting 30 seconds...") + time.sleep(30) + keys_tried.clear() + self.current_key_index = 0 + self.client = self._create_client() + + result = func(*args, **kwargs) + + # If successful, apply rate limit delay and return + if result: + if attempt > 0: + print(f"[SUCCESS] API call succeeded on attempt {attempt + 1}") + self.rate_limit_delay() + return result + else: + print("[WARNING] API returned empty result") + + except Exception as e: + error_msg = str(e) + keys_tried.add(self.current_key_index) + + # Check if it's a rate limit error + if self._is_rate_limit_error(error_msg): + print(f"[RATE_LIMIT] API key #{self.current_key_index + 1} hit rate limit: {error_msg}") + + # Try to switch to next API key + if self._switch_api_key(): + continue # Retry with new API key + else: + print("[ERROR] No more API keys available for switching") + + print(f"[ERROR] API call attempt {attempt + 1}/{max_retries} failed: {error_msg}") + + if attempt < max_retries - 1: + delay = base_delay * (2 ** attempt) + print(f"[RETRY] Waiting {delay:.1f} seconds before retry...") + time.sleep(delay) + else: + print(f"[FAILED] All {max_retries} attempts failed") + + return None + + def get_api_key_status(self): + """Get status information about API key usage""" + return { + "total_keys": len(self.api_keys), + "current_key": self.current_key_index + 1, + "rate_limit_switches": self.rate_limit_switches, + "current_key_preview": self.api_keys[self.current_key_index][:8] + "..." + } + + def print_api_key_status(self): + """Print API key usage status""" + status = self.get_api_key_status() + print(f"\n[API_STATUS] Using {status['total_keys']} API keys") + print(f"[API_STATUS] Current: Key #{status['current_key']} ({status['current_key_preview']})") + print(f"[API_STATUS] Rate limit switches: {status['rate_limit_switches']}") + if status['rate_limit_switches'] > 0: + print(f"[API_STATUS] Effective rate limit handling active") + print() + + def fetch_stock_data(self): + """Fetch stock-related data""" + for symbol in self.stock_symbols: + quote = self.safe_api_call(self.client.quote, symbol) + if quote: + self.save_data(quote, "current_quote", "stock_data", symbol) + + def fetch_company_info(self): + """Fetch company information""" + for symbol in self.stock_symbols: + profile = self.safe_api_call(self.client.company_profile2, symbol=symbol) + if profile: + self.save_data(profile, "company_profile", "company_info", symbol) + peers = self.safe_api_call(self.client.company_peers, symbol) + if peers: + self.save_data(peers, "company_peers", "company_info", symbol) + + def fetch_financials(self): + """Fetch financial data""" + for symbol in self.stock_symbols: + basic_financials = self.safe_api_call( + self.client.company_basic_financials, symbol, 'all' + ) + if basic_financials: + self.save_data(basic_financials, "basic_financials", "financials", symbol) + reported_financials = self.safe_api_call( + self.client.financials_reported, symbol=symbol, freq='annual' + ) + if reported_financials: + self.save_data(reported_financials, "reported_financials", "financials", symbol) + + def fetch_earnings_data(self): + """Fetch earnings-related data""" + for symbol in self.stock_symbols: + earnings = self.safe_api_call(self.client.company_earnings, symbol, limit=10) + if earnings: + self.save_data(earnings, "earnings_surprises", 
"earnings", symbol) + + def fetch_news_data(self): + """Fetch news and sentiment data""" + for symbol in self.stock_symbols: + company_news = self.safe_api_call( + self.client.company_news, symbol, + _from=self.date_str_from, to=self.date_str_to + ) + if company_news: + self.save_data(company_news, "company_news", "news", symbol) + + def fetch_ownership_data(self): + """Fetch ownership data""" + for symbol in self.stock_symbols: + insider_transactions = self.safe_api_call( + self.client.stock_insider_transactions, symbol, + self.date_str_from, self.date_str_to + ) + if insider_transactions: + self.save_data(insider_transactions, "insider_transactions", "ownership", symbol) + insider_sentiment = self.safe_api_call( + self.client.stock_insider_sentiment, symbol, + self.date_str_from, self.date_str_to + ) + if insider_sentiment: + self.save_data(insider_sentiment, "insider_sentiment", "ownership", symbol) + + def fetch_ratings_data(self): + """Fetch analyst ratings and recommendations""" + for symbol in self.stock_symbols: + recommendations = self.safe_api_call(self.client.recommendation_trends, symbol) + if recommendations: + self.save_data(recommendations, "recommendation_trends", "ratings", symbol) + + def fetch_regulatory_data(self): + """Fetch regulatory and compliance data""" + for symbol in self.stock_symbols: + filings = self.safe_api_call( + self.client.filings, symbol=symbol, + _from=self.date_str_from, to=self.date_str_to + ) + if filings: + self.save_data(filings, "sec_filings", "regulatory", symbol) + # patents = self.safe_api_call( + # self.client.stock_uspto_patent, symbol, + # self.date_str_from, self.date_str_to + # ) + # if patents: + # self.save_data(patents, "uspto_patents", "regulatory", symbol) + visa_apps = self.safe_api_call( + self.client.stock_visa_application, symbol, + self.date_str_from, self.date_str_to + ) + if visa_apps: + self.save_data(visa_apps, "visa_applications", "regulatory", symbol) + lobbying = self.safe_api_call( + self.client.stock_lobbying, symbol, + self.date_str_from, self.date_str_to + ) + if lobbying: + self.save_data(lobbying, "lobbying_data", "regulatory", symbol) + usa_spending = self.safe_api_call( + self.client.stock_usa_spending, symbol, + self.date_str_from, self.date_str_to + ) + if usa_spending: + self.save_data(usa_spending, "usa_spending", "regulatory", symbol) + + def fetch_market_data(self): + """Fetch general market data""" + stock_symbols = self.safe_api_call(self.client.stock_symbols, 'US') + if stock_symbols: + self.save_data(stock_symbols, "stock_symbols_us", "market_data") + ipo_calendar = self.safe_api_call( + self.client.ipo_calendar, _from=self.date_str_from, to=self.date_str_to + ) + if ipo_calendar: + self.save_data(ipo_calendar, "ipo_calendar", "market_data") + market_status = self.safe_api_call(self.client.market_status, exchange='US') + if market_status: + self.save_data(market_status, "market_status", "market_data") + market_holidays = self.safe_api_call(self.client.market_holiday, exchange='US') + if market_holidays: + self.save_data(market_holidays, "market_holidays", "market_data") + general_news = self.safe_api_call(self.client.general_news, 'general', min_id=0) + if general_news: + self.save_data(general_news, "general_news", "market_data") + covid_data = self.safe_api_call(self.client.covid19) + if covid_data: + self.save_data(covid_data, "covid19_data", "market_data") + fda_calendar = self.safe_api_call(self.client.fda_calendar) + if fda_calendar: + self.save_data(fda_calendar, "fda_calendar", "market_data") + 
+ def fetch_crypto_data(self): + """Fetch cryptocurrency data""" + crypto_exchanges = self.safe_api_call(self.client.crypto_exchanges) + if crypto_exchanges: + self.save_data(crypto_exchanges, "crypto_exchanges", "crypto") + exchanges = ['BINANCE', 'COINBASE'] + for exchange in exchanges: + symbols = self.safe_api_call(self.client.crypto_symbols, exchange) + if symbols: + self.save_data(symbols, f"crypto_symbols_{exchange.lower()}", "crypto") + + def fetch_forex_data(self): + """Fetch forex data""" + forex_exchanges = self.safe_api_call(self.client.forex_exchanges) + if forex_exchanges: + self.save_data(forex_exchanges, "forex_exchanges", "forex") + forex_symbols = self.safe_api_call(self.client.forex_symbols, 'OANDA') + if forex_symbols: + self.save_data(forex_symbols, "forex_symbols_oanda", "forex") + + def run_full_fetch(self): + """Run complete data fetch for all APIs""" + try: + self.fetch_stock_data() + self.fetch_company_info() + self.fetch_financials() + self.fetch_earnings_data() + self.fetch_news_data() + self.fetch_ownership_data() + self.fetch_ratings_data() + self.fetch_regulatory_data() + self.fetch_market_data() + self.fetch_crypto_data() + self.fetch_forex_data() + except Exception as e: + raise + +def main(): + import sys + import os + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) + """Main function to run the data fetcher""" + try: + fetcher = FinnhubDataFetcher() + + # Print API key status + fetcher.print_api_key_status() + + fetcher.run_full_fetch() + print("Data fetching completed successfully!") + + # Print final API key status + print("\n[FINAL_STATUS] Finnhub API Key Usage Summary:") + fetcher.print_api_key_status() + + # Upload all files in data/finnhub/news to S3 under news/finnhub_news/ + from data_cloud.cloud_utils import StorageHandler + from dotenv import load_dotenv + load_dotenv() + import os + from pathlib import Path + + FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT") + FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY") + FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY") + FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET") + + storage = StorageHandler( + endpoint_url=FILEBASE_ENDPOINT, + access_key=FILEBASE_ACCESS_KEY, + secret_key=FILEBASE_SECRET_KEY, + bucket_name=FILEBASE_BUCKET, + local_base="data" + ) + + local_news_dir = os.path.join("data", "finnhub", "news") + s3_news_prefix = "news/finnhub_news/" + for root, _, files in os.walk(local_news_dir): + for fname in files: + local_path = os.path.join(root, fname) + rel_path = os.path.relpath(local_path, local_news_dir) + s3_key = s3_news_prefix + rel_path.replace("\\", "/") + with open(local_path, "rb") as f: + file_bytes = f.read() + storage.upload(s3_key, file_bytes, content_type="application/octet-stream") + print(f"[OK] Uploaded {local_path} -> S3:{s3_key}") + + except Exception as e: + print(f"Error: {str(e)}") + return 1 + return 0 + +if __name__ == "__main__": + exit(main()) \ No newline at end of file diff --git a/src/fetchers/finnhub/report.py b/src/fetchers/finnhub/report.py new file mode 100644 index 0000000000000000000000000000000000000000..f95e4dd2f8f2af92d1922bd0c9b9d8491bafe442 --- /dev/null +++ b/src/fetchers/finnhub/report.py @@ -0,0 +1,116 @@ +# import os +# import glob +# import json +# import pandas as pd + +# # —— CONFIG —__ +# BASE_DIR = "data/finnhub" +# OUTPUT_FILE = "data/finnhub/finnhub_feature_report.json" + +# # Define your feature‐category buckets by column‐name patterns: +# FEATURE_CATEGORIES = { +# "Price Quotes": [r"^c$", 
r"^o$", r"^h$", r"^l$", r"^pc$", r"^d$", r"^dp$"], +# "Company Profile": [r"^logo$", r"^name$", r"^country$", r"^ticker$"], +# "Financial Metrics": [r"^metric", r"^value$", r"^data$"], +# "Earnings": [r"^surprise", r"^actual", r"^estimate"], +# # add more as needed… +# } + +# def load_all_parquets(base_dir): +# """Read every .parquet under subdirs into one wide DataFrame, then drop all‐NA columns.""" +# dfs = [] +# for sub in os.listdir(base_dir): +# path = os.path.join(base_dir, sub) +# if not os.path.isdir(path): +# continue +# for fn in glob.glob(os.path.join(path, "*.parquet")): +# df = pd.read_parquet(fn) +# dfs.append(df) +# if not dfs: +# return pd.DataFrame() +# full = pd.concat(dfs, axis=1) +# # --- NEW: remove any column that’s entirely NA --- +# full = full.dropna(axis=1, how='all') +# return full + +# def bucket_features(cols, buckets): +# """Assign each column to the first matching bucket.""" +# import re +# result = {k: [] for k in buckets} +# leftovers = [] +# for col in cols: +# placed = False +# for name, patterns in buckets.items(): +# if any(re.match(pat, col) for pat in patterns): +# result[name].append(col) +# placed = True +# break +# if not placed: +# leftovers.append(col) +# if leftovers: +# result["Other"] = leftovers +# return result + +# def generate_report(df): +# # --- NEW: again ensure we dropped fully‐missing columns in case of downstream mutations --- +# df = df.dropna(axis=1, how='all') + +# now = pd.Timestamp.utcnow().isoformat() +# rows, cols = df.shape +# mem_mb = df.memory_usage(deep=True).sum() / 1e6 + +# # time range +# if "interval_timestamp" in df.columns: +# ts = pd.to_datetime(df["interval_timestamp"]) +# time_range = {"start": str(ts.min()), "end": str(ts.max())} +# else: +# time_range = {} + +# # feature categories +# feat_cats = bucket_features(df.columns.tolist(), FEATURE_CATEGORIES) + +# # --- NEW: de-dupe each feature list while preserving order --- +# for cat, flist in feat_cats.items(): +# feat_cats[cat] = list(dict.fromkeys(flist)) + +# feat_summary = { +# cat: {"count": len(cols), "features": cols} +# for cat, cols in feat_cats.items() +# } + +# # data quality +# missing = df.isna().sum().to_dict() +# total_cells = rows * cols +# total_missing = sum(missing.values()) +# completeness = 1 - total_missing / total_cells + +# report = { +# "timestamp": now, +# "dataset_info": { +# "shape": [rows, cols], +# "memory_usage_mb": mem_mb, +# "time_range": time_range +# }, +# "feature_categories": feat_summary, +# "data_quality": { +# "completeness": completeness, +# "avg_quality_score": completeness +# }, +# "feature_importance": {}, # fill in if you run a model +# "configuration": { +# "source": "Finnhub Free Plan", +# "generated_by": "generate_finnhub_report.py" +# } +# } +# return report + +# def main(): +# df = load_all_parquets(BASE_DIR) +# report = generate_report(df) +# with open(OUTPUT_FILE, "w") as f: +# json.dump(report, f, indent=2) +# print(f"Wrote report to {OUTPUT_FILE}") + +# if __name__ == "__main__": +# main() +# main() diff --git a/src/fetchers/finviz_sentiment/app.py b/src/fetchers/finviz_sentiment/app.py new file mode 100644 index 0000000000000000000000000000000000000000..9faf5ae65ddb8d5edeafa59d552f6422561def83 --- /dev/null +++ b/src/fetchers/finviz_sentiment/app.py @@ -0,0 +1,136 @@ +from urllib.request import urlopen, Request +from urllib.error import HTTPError +from bs4 import BeautifulSoup +import pandas as pd +import datetime +from dateutil import parser +from pathlib import Path +import sys +import os +import nltk + +# 
Ensure VADER lexicon is available in a writable location +try: + from src import config as app_config + _nltk_dir = os.path.join(app_config.DATA_DIR, 'nltk_data') +except Exception: + _nltk_dir = os.path.join(os.environ.get('DATA_DIR', '/data'), 'nltk_data') +os.makedirs(_nltk_dir, exist_ok=True) +if _nltk_dir not in nltk.data.path: + nltk.data.path.insert(0, _nltk_dir) +try: + nltk.data.find('vader_lexicon') +except LookupError: + nltk.download('vader_lexicon', download_dir=_nltk_dir) +from nltk.sentiment.vader import SentimentIntensityAnalyzer + +class StockSentimentAnalyzer: + def __init__(self): + self.stock_url = 'https://finviz.com/quote.ashx?t=' + self.crypto_url = 'https://finviz.com/crypto_charts.ashx?t=' + self.headers = { + 'User-Agent': ( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/115.0.0.0 Safari/537.36' + ) + } + self.vader = SentimentIntensityAnalyzer() + + def get_news(self, ticker): + """Fetch the Finviz news table for a ticker, falling back to crypto endpoint.""" + ticker = ticker.upper() + # Try stock quotes endpoint first + try: + req = Request(self.stock_url + ticker, headers=self.headers) + resp = urlopen(req) + except HTTPError as e: + # On 404 (no stock page), retry crypto endpoint with USD suffix + if e.code == 404: + if not ticker.endswith('USD'): + ticker += 'USD' + req = Request(self.crypto_url + ticker, headers=self.headers) + resp = urlopen(req) + else: + raise + html = resp.read() + soup = BeautifulSoup(html, 'lxml') + return soup.find(id='news-table') + + def parse_news(self, news_table): + """Parse rows into DataFrame of date, time, headline.""" + today = datetime.datetime.today().strftime('%b-%d-%y') + rows = [] + for tr in news_table.find_all('tr'): + try: + text = tr.a.get_text() + parts = tr.td.text.split() + if len(parts) == 1: + date_str, time_str = today, parts[0] + else: + date_str, time_str = parts + if date_str.lower() == 'today': + date_str = today + rows.append([date_str, time_str, text]) + except: + continue + df = pd.DataFrame(rows, columns=['date','time','headline']) + if not df.empty: + df['datetime'] = df.apply( + lambda r: self._parse_datetime(r['date'], r['time']), axis=1 + ) + df = df.dropna(subset=['datetime']) + return df + + def _parse_datetime(self, date_str, time_str): + try: + return parser.parse(f"{date_str} {time_str}") + except: + return None + + def score_news(self, df): + """Attach VADER sentiment_score to each headline.""" + if df.empty: + return df + scores = df['headline'].apply(self.vader.polarity_scores).tolist() + scores_df = pd.DataFrame(scores) + out = df.join(scores_df).set_index('datetime') + return out.drop(['date','time'], axis=1).rename(columns={'compound':'sentiment_score'}) + + def get_sentiment_data(self, ticker): + try: + table = self.get_news(ticker) + if table is None: + return None, f"No news table for '{ticker}'" + parsed = self.parse_news(table) + if parsed.empty: + return None, f"No articles for '{ticker}'" + scored = self.score_news(parsed) + if scored.empty: + return None, f"Sentiment scoring failed for '{ticker}'" + return scored, "Success" + except Exception as e: + return None, f"Error occurred: {e}" + +def main(): + tickers = sys.argv[1:] or ["AAPL","TSLA","GOOGL","NVDA","MSFT","BTC","SOL","XRP","ETH","ADA", "COIN"] + analyzer = StockSentimentAnalyzer() + + # Get project root directory (3 levels up from this file) + project_root = Path(__file__).parent.parent.parent.parent + out_dir = project_root / "data" / "finviz" / "sentiment" + 
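# Editor annotation (not patch content): each <TICKER>_sentiment.parquet written below
# contains the parsed 'datetime' and 'headline' columns plus VADER's 'neg', 'neu' and
# 'pos' scores, with 'compound' renamed to 'sentiment_score' (see score_news above).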
out_dir.mkdir(parents=True, exist_ok=True) + + for t in tickers: + df, status = analyzer.get_sentiment_data(t) + if df is not None: + path = out_dir / f"{t.upper()}_sentiment.parquet" + # Ensure 'datetime' is a column before saving + df_reset = df.reset_index() if df.index.name == 'datetime' else df + df_reset.to_parquet(path) + print(f"Saved sentiment data for {t} to {path}") + else: + print(f"Error for {t}: {status}") + +if __name__ == "__main__": + main() diff --git a/src/fetchers/main.py b/src/fetchers/main.py new file mode 100644 index 0000000000000000000000000000000000000000..41af443d666b6d7870bd40a0578d38e1c55c1417 --- /dev/null +++ b/src/fetchers/main.py @@ -0,0 +1,159 @@ +# Entrypoint for all fetchers + +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))) # project root +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) # src +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "fetchers"))) # fetchers +import asyncio + +def run_advisorai_data(): + # print("[DEBUG] sys.path:") + # for p in sys.path: + # print(" ", p) + try: + from advisorai_data.advisorai_data_fetcher import main as advisorai_data_main + advisorai_data_main() + except ModuleNotFoundError as e: + print("[WARN] advisorai_data import failed, trying importlib fallback...") + import importlib.util + fetcher_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "advisorai_data", "advisorai_data_fetcher.py")) + spec = importlib.util.spec_from_file_location("advisorai_data_fetcher", fetcher_path) + advisorai_data_fetcher = importlib.util.module_from_spec(spec) + spec.loader.exec_module(advisorai_data_fetcher) + advisorai_data_fetcher.main() + +# def run_crawl_news(): +# import importlib.util +# import os +# import asyncio +# crawl_news_path = os.path.join(os.path.dirname(__file__), "crawl4ai", "crawl_news.py") +# spec = importlib.util.spec_from_file_location("crawl_news", crawl_news_path) +# crawl_news = importlib.util.module_from_spec(spec) +# spec.loader.exec_module(crawl_news) +# asyncio.run(crawl_news.main()) + +def run_crypto_bubbles(): + from crypto_bubbles.fetch_crypto_bubbles import main as crypto_bubbles_main + crypto_bubbles_main() + +def run_finnhub(): + # Use the installed finnhub package, not the local finnhub module + import importlib.util + import os + finnhub_main_path = os.path.join(os.path.dirname(__file__), "finnhub", "main.py") + spec = importlib.util.spec_from_file_location("finnhub_main", finnhub_main_path) + finnhub_main = importlib.util.module_from_spec(spec) + spec.loader.exec_module(finnhub_main) + finnhub_main.main() + +def run_alpaca_features(): + import importlib + import os + alpaca_path = os.path.join(os.path.dirname(__file__), "alpaca_api", "main.py") + spec = importlib.util.spec_from_file_location("alpaca_features", alpaca_path) + alpaca_features = importlib.util.module_from_spec(spec) + spec.loader.exec_module(alpaca_features) + alpaca_features.main() + +def run_marketaux_news(): + import importlib.util + import os + marketaux_news_path = os.path.join(os.path.dirname(__file__), "marketaux", "news.py") + spec = importlib.util.spec_from_file_location("marketaux_news", marketaux_news_path) + marketaux_news = importlib.util.module_from_spec(spec) + spec.loader.exec_module(marketaux_news) + marketaux_news.main() + +def run_finviz_sentiment(): + import importlib.util + finviz_path = os.path.join(os.path.dirname(__file__), "finviz_sentiment", "app.py") + spec = 
importlib.util.spec_from_file_location("finviz_sentiment_app", finviz_path) + finviz_sentiment_app = importlib.util.module_from_spec(spec) + spec.loader.exec_module(finviz_sentiment_app) + finviz_sentiment_app.main() + +def run_santiment(): + """Run Santiment with frequency control to preserve API limits""" + # Import frequency controller + import sys + import os + controller_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", "santiment_frequency_controller.py")) + sys.path.insert(0, os.path.dirname(controller_path)) + + try: + from santiment_frequency_controller import SantimentFrequencyController + + # Check if Santiment should run + controller = SantimentFrequencyController() + + if not controller.should_run_santiment(max_runs_per_day=2): + print("[SANTIMENT] Skipping run due to frequency control") + status = controller.get_status() + print(f"[SANTIMENT] Runs today: {status['runs_today']}/2") + print(f"[SANTIMENT] Last run: {status['last_run']}") + return + + print("[SANTIMENT] Frequency control allows run - proceeding...") + + # Run Santiment + import importlib.util + santiment_path = os.path.join(os.path.dirname(__file__), "santiment", "main.py") + spec = importlib.util.spec_from_file_location("santiment_main", santiment_path) + santiment_main = importlib.util.module_from_spec(spec) + spec.loader.exec_module(santiment_main) + santiment_main.main() + + # Record the run + controller.record_run() + print("[SANTIMENT] Run completed and recorded") + + except Exception as e: + print(f"[SANTIMENT] Error in frequency control: {e}") + print("[SANTIMENT] Falling back to direct run...") + # Fallback to direct run if frequency control fails + import importlib.util + santiment_path = os.path.join(os.path.dirname(__file__), "santiment", "main.py") + spec = importlib.util.spec_from_file_location("santiment_main", santiment_path) + santiment_main = importlib.util.module_from_spec(spec) + spec.loader.exec_module(santiment_main) + santiment_main.main() + +def run_all(): + run_advisorai_data() + # run_crawl_news() + run_crypto_bubbles() + run_finnhub() + run_alpaca_features() + run_marketaux_news() + run_finviz_sentiment() + run_santiment() + print("[OK] All fetchers completed successfully.") + + +def main(): + # Simple CLI: python main.py [advisorai|crawl_news|crypto_bubbles|finnhub|alpaca_features|marketaux_news|finviz_sentiment|santiment|all] [TICKERS...] 
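# Editor annotation (not patch content): illustrative invocations, assuming the
# project root as the working directory and dependencies installed:
#
#     python src/fetchers/main.py               # no argument or "all": run every fetcher
#     python src/fetchers/main.py finnhub       # run a single fetcher
#     python src/fetchers/main.py santiment     # frequency-controlled Santiment run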
+ if len(sys.argv) < 2 or sys.argv[1] == "all": + run_all() + elif sys.argv[1] == "advisorai": + run_advisorai_data() + # elif sys.argv[1] == "crawl_news": + # run_crawl_news() + elif sys.argv[1] == "crypto_bubbles": + run_crypto_bubbles() + elif sys.argv[1] == "finnhub": + run_finnhub() + elif sys.argv[1] == "alpaca_features": + run_alpaca_features() + elif sys.argv[1] == "marketaux_news": + run_marketaux_news() + elif sys.argv[1] == "finviz_sentiment": + run_finviz_sentiment() + elif sys.argv[1] == "santiment": + run_santiment() + else: + print("Usage: python main.py [advisorai|crawl_news|crypto_bubbles|finnhub|alpaca_features|marketaux_news|finviz_sentiment|santiment|all] [TICKERS...]") + sys.exit(1) + +if __name__ == "__main__": + main() diff --git a/src/fetchers/marketaux/news.py b/src/fetchers/marketaux/news.py new file mode 100644 index 0000000000000000000000000000000000000000..5507bc2ef75ea6a3a3c244c7da73a59b4e25a3c3 --- /dev/null +++ b/src/fetchers/marketaux/news.py @@ -0,0 +1,386 @@ +import os +import time +import logging +import random +from typing import Any, Dict, List, Optional + +import requests +from dotenv import load_dotenv + + +class MarketauxEndpointRestricted(RuntimeError): + """Raised when the requested API endpoint is not available for the current subscription.""" + + +class MarketauxClient: + """ + Client for interacting with the Marketaux API. + + Key features + ------------ + • Graceful handling of rate limits (HTTP 429) with exponential back‑off. + • API key switching for better rate limit management + • Friendly error when an endpoint is restricted by the subscription plan (HTTP 403 + with `endpoint_access_restricted` code). + • Transparent pagination through `_fetch_all`. + """ + + BASE_URL = "https://api.marketaux.com/v1/" + MAX_RETRIES = 3 + MAX_RETRIES_EXHAUSTED = 1 # Lower retry count when all keys are exhausted + BACKOFF_FACTOR = 2 + + def __init__(self, api_token: Optional[str] = None, logger: Optional[logging.Logger] = None): + load_dotenv() # load MARKETAUX_API_TOKEN from .env if present + + # Set up API key switching + self.api_keys = self._load_api_keys(api_token) + self.current_key_index = 0 + self.exhausted_keys = set() + + if not self.api_keys: + raise ValueError("Marketaux API token(s) must be provided or set as MARKETAUX_API_TOKEN") + + self.logger = logger or logging.getLogger(self.__class__.__name__) + self.logger.info(f"Initialized MarketauxClient with {len(self.api_keys)} API key(s)") + + # Validate API key diversity + self._validate_api_key_diversity() + + def _load_api_keys(self, provided_token: Optional[str] = None) -> List[str]: + """Load and validate API keys from environment or provided token.""" + api_keys = [] + + if provided_token: + api_keys.append(provided_token) + + # Try to load from environment variables + env_token = os.getenv("MARKETAUX_API_TOKEN") + if env_token: + # Support comma-separated multiple keys + env_keys = [key.strip() for key in env_token.split(',') if key.strip()] + api_keys.extend(env_keys) + + # Try numbered environment variables + for i in range(1, 10): # Support up to 9 keys + key = os.getenv(f"MARKETAUX_API_TOKEN_{i}") + if key: + api_keys.append(key.strip()) + + # Remove duplicates while preserving order + seen = set() + unique_keys = [] + for key in api_keys: + if key not in seen: + seen.add(key) + unique_keys.append(key) + + return unique_keys + + def _get_current_api_key(self) -> str: + """Get the current API key, switching if necessary.""" + if self.current_key_index >= len(self.api_keys): + 
self.current_key_index = 0 + + return self.api_keys[self.current_key_index] + + def _are_all_keys_exhausted(self) -> bool: + """Check if all API keys have been exhausted.""" + return len(self.exhausted_keys) >= len(self.api_keys) + + def _switch_api_key(self) -> bool: + """Switch to the next available API key.""" + current_key = self._get_current_api_key() + self.exhausted_keys.add(current_key) + + # Find next non-exhausted key + for i in range(len(self.api_keys)): + next_index = (self.current_key_index + 1 + i) % len(self.api_keys) + next_key = self.api_keys[next_index] + + if next_key not in self.exhausted_keys: + self.current_key_index = next_index + self.logger.info(f"Switched to API key #{next_index + 1}") + return True + + # All keys are exhausted + self.logger.error("All API keys have been exhausted") + return False + + def _validate_api_key_diversity(self) -> bool: + """Validate that we have diverse API keys from different accounts.""" + if len(self.api_keys) < 2: + return True # Single key is always valid + + # Check for diversity by comparing key prefixes/suffixes + prefixes = set() + suffixes = set() + + for key in self.api_keys: + if len(key) >= 8: + prefixes.add(key[:4]) + suffixes.add(key[-4:]) + + diversity_score = len(prefixes) + len(suffixes) + total_possible = len(self.api_keys) * 2 + + if diversity_score < total_possible * 0.6: # Less than 60% diversity + self.logger.warning("API keys may be from the same account - limited rate limit benefits") + + return True + + # ------------------------------------------------------------ + # Low‑level HTTP request + # ------------------------------------------------------------ + def _request(self, endpoint: str, params: Dict[str, Any]) -> Dict[str, Any]: + url = f"{self.BASE_URL}{endpoint}" + + # Add current API key to params + params = params.copy() + params["api_token"] = self._get_current_api_key() + + # Determine retry limit based on whether keys are exhausted + max_retries = self.MAX_RETRIES_EXHAUSTED if self._are_all_keys_exhausted() else self.MAX_RETRIES + + for attempt in range(1, max_retries + 1): + response = requests.get(url, params=params, timeout=30) + + # 429 – Rate limit or 402 – Usage limit reached + if response.status_code in [429, 402]: + # Try switching API key first + if self._switch_api_key(): + params["api_token"] = self._get_current_api_key() + error_type = "Rate limit" if response.status_code == 429 else "Usage limit" + self.logger.warning(f"{error_type} hit, switched API key and retrying immediately (attempt {attempt}/{max_retries})") + continue + + # If no more keys available, handle gracefully + error_type = "Rate limit" if response.status_code == 429 else "Usage limit" + if attempt >= max_retries: + # On final attempt, log and skip gracefully instead of crashing + self.logger.warning(f"{error_type} hit, all keys exhausted. Skipping request to avoid crash.") + return {"data": []} # Return empty result instead of crashing + + # Wait before retry + reset_header = response.headers.get("X-Api-Ratelimit-Reset") + wait = 60 + if reset_header and reset_header.isdigit(): + wait = max(5, int(reset_header) - int(time.time())) + self.logger.warning(f"{error_type} hit, all keys exhausted. 
Waiting %s s before retrying (%s/%s)…", wait, attempt, max_retries) + time.sleep(wait * self.BACKOFF_FACTOR ** (attempt - 1)) + continue + + # 403 – Endpoint not available on plan or potentially invalid key + if response.status_code == 403: + try: + payload = response.json() + error_code = payload.get("error", {}).get("code") + + if error_code == "endpoint_access_restricted": + raise MarketauxEndpointRestricted(payload["error"]["message"]) + elif error_code in ["invalid_api_token", "unauthorized"]: + # Try switching API key + if self._switch_api_key(): + params["api_token"] = self._get_current_api_key() + self.logger.warning(f"Invalid API key detected, switched and retrying (attempt {attempt}/{self.MAX_RETRIES})") + continue + else: + raise RuntimeError("All API keys are invalid or exhausted") + except ValueError: + pass # fall through to generic error handler + + # Other errors + if not response.ok: + raise RuntimeError(f"Marketaux API error: {response.status_code} — {response.text}") + + # Success + return response.json() + + # If we exhausted retries, return empty result instead of crashing + if self._are_all_keys_exhausted(): + self.logger.warning("All API keys exhausted, returning empty result to prevent pipeline crash") + return {"data": []} + else: + raise RuntimeError("Exceeded maximum retries for Marketaux API") + + # ------------------------------------------------------------ + # Pagination helper + # ------------------------------------------------------------ + def _fetch_all(self, endpoint: str, params: Optional[Dict[str, Any]] = None, *, paginate: bool = True) -> List[Dict[str, Any]]: + params = params.copy() if params else {} + # Don't add api_token here - it's added in _request method + + all_data: List[Dict[str, Any]] = [] + page = 1 + + while True: + params["page"] = page + result = self._request(endpoint, params) + all_data.extend(result.get("data", [])) + + if not paginate: + break + + meta = result.get("meta", {}) + returned = meta.get("returned", 0) + limit = meta.get("limit", 0) + found = meta.get("found", 0) + + if returned < limit or len(all_data) >= found: + break + page += 1 + + return all_data + + # ------------------------------------------------------------ + # High‑level convenience methods + # ------------------------------------------------------------ + def fetch_news_all( + self, + *, + symbols: Optional[List[str]] = None, + limit: int = 20, + must_have_entities: bool = True, + published_after: Optional[str] = None, + published_before: Optional[str] = None, + language: Optional[str] = None, + sort: str = "published_at.desc", + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch news articles matching the supplied filters.""" + params: Dict[str, Any] = {"limit": limit, "sort": sort} + if symbols: + params["symbols"] = ",".join(symbols) + if must_have_entities: + params["must_have_entities"] = "true" + if published_after: + params["published_after"] = published_after + if published_before: + params["published_before"] = published_before + if language: + params["language"] = language + return self._fetch_all("news/all", params, paginate=paginate) + + def fetch_entity_stats_aggregation( + self, + *, + symbols: Optional[List[str]] = None, + interval: str = "day", + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch aggregated entity stats (daily/hourly).""" + params: Dict[str, Any] = {"interval": interval, "limit": limit} + if symbols: + 
params["symbols"] = ",".join(symbols) + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("entity/stats/aggregation", params, paginate=paginate) + + def fetch_entity_stats_intraday( + self, + *, + symbols: Optional[List[str]] = None, + interval: str = "minute", + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch intraday entity stats (minute/5‑minute granularity).""" + params: Dict[str, Any] = {"interval": interval, "limit": limit} + if symbols: + params["symbols"] = ",".join(symbols) + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("entity/stats/intraday", params, paginate=paginate) + + def fetch_trending_aggregation( + self, + *, + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch aggregated trending data.""" + params: Dict[str, Any] = {"limit": limit} + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("trending/aggregation", params, paginate=paginate) + + def fetch_trending_intraday( + self, + *, + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch intraday trending data.""" + params: Dict[str, Any] = {"limit": limit} + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("trending/intraday", params, paginate=paginate) + +def main(): + import sys + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) + import pprint + + client = MarketauxClient() + pp = pprint.PrettyPrinter(depth=2) + + news = client.fetch_news_all(symbols=["AAPL", "MSFT","NVDA", "GOOGL", "TSLA"], limit=20, paginate=False) + print(f"Fetched {len(news)} news articles from Marketaux.") + + # Save to Parquet file + import pandas as pd + out_dir = os.path.join("data", "marketaux", "news") + os.makedirs(out_dir, exist_ok=True) + out_path = os.path.join(out_dir, "news_latest.parquet") + df = pd.DataFrame(news) + df.to_parquet(out_path, index=False) + print(f"Saved news articles to {out_path}") + + # Upload all files in data/marketaux/news to S3 under news/marketaux_news/ + from data_cloud.cloud_utils import StorageHandler + from dotenv import load_dotenv + load_dotenv() + FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT") + FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY") + FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY") + FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET") + + storage = StorageHandler( + endpoint_url=FILEBASE_ENDPOINT, + access_key=FILEBASE_ACCESS_KEY, + secret_key=FILEBASE_SECRET_KEY, + bucket_name=FILEBASE_BUCKET, + local_base="data" + ) + + local_news_dir = os.path.join("data", "marketaux", "news") + s3_news_prefix = "news/marketaux_news/" + for root, _, files in os.walk(local_news_dir): + for fname in files: + local_path = os.path.join(root, fname) + rel_path = os.path.relpath(local_path, local_news_dir) + s3_key = s3_news_prefix + rel_path.replace("\\", "/") + with open(local_path, "rb") as f: + file_bytes = f.read() + storage.upload(s3_key, file_bytes, content_type="application/octet-stream") + print(f"[OK] Uploaded {local_path} -> S3:{s3_key}") + +if __name__ == "__main__": + main() diff --git 
a/src/fetchers/marketaux/news_original.py b/src/fetchers/marketaux/news_original.py new file mode 100644 index 0000000000000000000000000000000000000000..5b0f48f912c0eaea0d213ca484f860fe7800b17c --- /dev/null +++ b/src/fetchers/marketaux/news_original.py @@ -0,0 +1,253 @@ +import os +import time +import logging +from typing import Any, Dict, List, Optional + +import requests +from dotenv import load_dotenv + + +class MarketauxEndpointRestricted(RuntimeError): + """Raised when the requested API endpoint is not available for the current subscription.""" + + +class MarketauxClient: + """ + Client for interacting with the Marketaux API. + + Key features + ------------ + • Graceful handling of rate limits (HTTP 429) with exponential back‑off. + • Friendly error when an endpoint is restricted by the subscription plan (HTTP 403 + with `endpoint_access_restricted` code). + • Transparent pagination through `_fetch_all`. + """ + + BASE_URL = "https://api.marketaux.com/v1/" + MAX_RETRIES = 3 + BACKOFF_FACTOR = 2 + + def __init__(self, api_token: Optional[str] = None, logger: Optional[logging.Logger] = None): + load_dotenv() # load MARKETAUX_API_TOKEN from .env if present + self.api_token = api_token or os.getenv("MARKETAUX_API_TOKEN") + if not self.api_token: + raise ValueError("Marketaux API token must be provided or set as MARKETAUX_API_TOKEN") + self.logger = logger or logging.getLogger(self.__class__.__name__) + + # ------------------------------------------------------------ + # Low‑level HTTP request + # ------------------------------------------------------------ + def _request(self, endpoint: str, params: Dict[str, Any]) -> Dict[str, Any]: + url = f"{self.BASE_URL}{endpoint}" + for attempt in range(1, self.MAX_RETRIES + 1): + response = requests.get(url, params=params, timeout=30) + + # 429 – Rate limit + if response.status_code == 429: + reset_header = response.headers.get("X-Api-Ratelimit-Reset") + wait = 60 + if reset_header and reset_header.isdigit(): + wait = max(5, int(reset_header) - int(time.time())) + self.logger.warning("Rate‑limit hit. 
Waiting %s s before retrying (%s/%s)…", wait, attempt, self.MAX_RETRIES) + time.sleep(wait * self.BACKOFF_FACTOR ** (attempt - 1)) + continue + + # 403 – Endpoint not available on plan + if response.status_code == 403: + try: + payload = response.json() + if payload.get("error", {}).get("code") == "endpoint_access_restricted": + raise MarketauxEndpointRestricted(payload["error"]["message"]) + except ValueError: + pass # fall through to generic error handler + + # Other errors + if not response.ok: + raise RuntimeError(f"Marketaux API error: {response.status_code} — {response.text}") + + # Success + return response.json() + + # If we exhausted retries + raise RuntimeError("Exceeded maximum retries for Marketaux API") + + # ------------------------------------------------------------ + # Pagination helper + # ------------------------------------------------------------ + def _fetch_all(self, endpoint: str, params: Optional[Dict[str, Any]] = None, *, paginate: bool = True) -> List[Dict[str, Any]]: + params = params.copy() if params else {} + params["api_token"] = self.api_token + + all_data: List[Dict[str, Any]] = [] + page = 1 + + while True: + params["page"] = page + result = self._request(endpoint, params) + all_data.extend(result.get("data", [])) + + if not paginate: + break + + meta = result.get("meta", {}) + returned = meta.get("returned", 0) + limit = meta.get("limit", 0) + found = meta.get("found", 0) + + if returned < limit or len(all_data) >= found: + break + page += 1 + + return all_data + + # ------------------------------------------------------------ + # High‑level convenience methods + # ------------------------------------------------------------ + def fetch_news_all( + self, + *, + symbols: Optional[List[str]] = None, + limit: int = 20, + must_have_entities: bool = True, + published_after: Optional[str] = None, + published_before: Optional[str] = None, + language: Optional[str] = None, + sort: str = "published_at.desc", + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch news articles matching the supplied filters.""" + params: Dict[str, Any] = {"limit": limit, "sort": sort} + if symbols: + params["symbols"] = ",".join(symbols) + if must_have_entities: + params["must_have_entities"] = "true" + if published_after: + params["published_after"] = published_after + if published_before: + params["published_before"] = published_before + if language: + params["language"] = language + return self._fetch_all("news/all", params, paginate=paginate) + + def fetch_entity_stats_aggregation( + self, + *, + symbols: Optional[List[str]] = None, + interval: str = "day", + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch aggregated entity stats (daily/hourly).""" + params: Dict[str, Any] = {"interval": interval, "limit": limit} + if symbols: + params["symbols"] = ",".join(symbols) + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("entity/stats/aggregation", params, paginate=paginate) + + def fetch_entity_stats_intraday( + self, + *, + symbols: Optional[List[str]] = None, + interval: str = "minute", + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch intraday entity stats (minute/5‑minute granularity).""" + params: Dict[str, Any] = {"interval": interval, "limit": limit} + if symbols: + params["symbols"] = 
",".join(symbols) + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("entity/stats/intraday", params, paginate=paginate) + + def fetch_trending_aggregation( + self, + *, + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch aggregated trending data.""" + params: Dict[str, Any] = {"limit": limit} + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("trending/aggregation", params, paginate=paginate) + + def fetch_trending_intraday( + self, + *, + limit: int = 100, + date_from: Optional[str] = None, + date_to: Optional[str] = None, + paginate: bool = True, + ) -> List[Dict[str, Any]]: + """Fetch intraday trending data.""" + params: Dict[str, Any] = {"limit": limit} + if date_from: + params["date_from"] = date_from + if date_to: + params["date_to"] = date_to + return self._fetch_all("trending/intraday", params, paginate=paginate) + +def main(): + import sys + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) + import pprint + + client = MarketauxClient() + pp = pprint.PrettyPrinter(depth=2) + + news = client.fetch_news_all(symbols=["AAPL", "MSFT","NVDA", "GOOGL", "TSLA"], limit=20, paginate=False) + print(f"Fetched {len(news)} news articles from Marketaux.") + + # Save to Parquet file + import pandas as pd + out_dir = os.path.join("data", "marketaux", "news") + os.makedirs(out_dir, exist_ok=True) + out_path = os.path.join(out_dir, "news_latest.parquet") + df = pd.DataFrame(news) + df.to_parquet(out_path, index=False) + print(f"Saved news articles to {out_path}") + + # Upload all files in data/marketaux/news to S3 under news/marketaux_news/ + from data_cloud.cloud_utils import StorageHandler + from dotenv import load_dotenv + load_dotenv() + FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT") + FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY") + FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY") + FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET") + + storage = StorageHandler( + endpoint_url=FILEBASE_ENDPOINT, + access_key=FILEBASE_ACCESS_KEY, + secret_key=FILEBASE_SECRET_KEY, + bucket_name=FILEBASE_BUCKET, + local_base="data" + ) + + local_news_dir = os.path.join("data", "marketaux", "news") + s3_news_prefix = "news/marketaux_news/" + for root, _, files in os.walk(local_news_dir): + for fname in files: + local_path = os.path.join(root, fname) + rel_path = os.path.relpath(local_path, local_news_dir) + s3_key = s3_news_prefix + rel_path.replace("\\", "/") + with open(local_path, "rb") as f: + file_bytes = f.read() + storage.upload(s3_key, file_bytes, content_type="application/octet-stream") + print(f"[OK] Uploaded {local_path} -> S3:{s3_key}") +if __name__ == "__main__": + main() diff --git a/src/fetchers/santiment/main.py b/src/fetchers/santiment/main.py new file mode 100644 index 0000000000000000000000000000000000000000..12bb0f1e1075029185e3eb15464b7a88336e9d7f --- /dev/null +++ b/src/fetchers/santiment/main.py @@ -0,0 +1,1871 @@ +""" +Comprehensive Santiment Data Fetcher +==================================== + +This module provides a complete data fetcher for the Santiment API using the sanpy library. +It maximizes data retrieval by organizing metrics into categories and providing batch operations. 
+ +Features: +- Fetches all available metrics organized by category +- Supports batch operations for efficient API usage +- Handles rate limiting and error management +- Provides data export capabilities +- Supports both single asset and multi-asset queries +- Includes SQL query execution for custom data needs + +Author: AI Assistant +Version: 1.0.0 +""" + +import san +import pandas as pd +import numpy as np +import time +import logging +from datetime import datetime, timedelta +from typing import List, Dict, Optional, Union, Any +import json +import os +from dataclasses import dataclass, field +from concurrent.futures import ThreadPoolExecutor, as_completed + +# Load environment variables +try: + from dotenv import load_dotenv + load_dotenv() +except ImportError: + pass # dotenv not available, continue without it +import warnings + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Resolve data directory base +try: + from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR + except Exception: + CFG_DATA_DIR = "/data" + +from pathlib import Path + +def _resolve_under_data(path_like: str | os.PathLike) -> str: + p = Path(path_like) + if p.is_absolute(): + return str(p) + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return str(Path(CFG_DATA_DIR) / rel) + +@dataclass +class FetchConfig: + """Configuration class for data fetching parameters - OPTIMIZED FOR API CONSERVATION""" + from_date: str = "2024-01-01" # Reduced from 2020 to save API calls + to_date: str = "utc_now" + interval: str = "1d" + include_incomplete_data: bool = False + batch_size: int = 25 # Reduced from 50 to save API calls + max_workers: int = 5 # Reduced from 10 to save API calls + rate_limit_delay: int = 60 + export_format: str = "parquet" # csv, json, parquet + export_directory: str = "data/santiment" + +class SantimentDataFetcher: + """ + Comprehensive Santiment Data Fetcher + + This class provides methods to fetch maximum possible data from Santiment API + using the sanpy library with efficient batch operations and error handling. 
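+
+    Illustrative usage (a minimal sketch, assuming a valid SANTIMENT_API_KEY in the
+    environment; the dates and slugs below are placeholders):
+
+        config = FetchConfig(from_date="2024-06-01", interval="1d")
+        fetcher = SantimentDataFetcher(config=config)
+        data = fetcher.fetch_comprehensive_data(slugs=["bitcoin", "ethereum"])
+        exported = fetcher.export_data(combine_categories=True)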
+ """ + + def __init__(self, api_key: Optional[str] = None, config: Optional[FetchConfig] = None): + """ + Initialize the Santiment Data Fetcher + + Args: + api_key: Santiment API key(s) for accessing restricted data (comma-separated for multiple keys) + config: FetchConfig object with fetching parameters + """ + self.config = config or FetchConfig() + self._normalize_dates() + + # Set up multiple API keys + self._setup_api_keys(api_key) + + # Resolve export directory under DATA_DIR, create and clean up existing files + self.config.export_directory = _resolve_under_data(self.config.export_directory) + os.makedirs(self.config.export_directory, exist_ok=True) + self._cleanup_existing_files() + + # Initialize data storage + self.fetched_data: Dict[str, pd.DataFrame] = {} + self.failed_queries: List[Dict] = [] + + # Define comprehensive metric categories + self.metric_categories = self._define_metric_categories() + + # Get available metrics and projects + self._initialize_metadata() + + # Initialize symbol normalization + self.symbol_normalizer = self._setup_symbol_normalizer() + + def _setup_symbol_normalizer(self): + """ + Set up symbol normalization mapping for consistent asset identification + + Returns: + Dictionary mapping various symbol formats to canonical slugs + """ + # Canonical mapping for major crypto assets + # Maps various symbols/names to the official Santiment slug + symbol_mapping = { + # Bitcoin variants + 'bitcoin': 'bitcoin', + 'btc': 'bitcoin', + 'Bitcoin': 'bitcoin', + 'BTC': 'bitcoin', + + # Ethereum variants + 'ethereum': 'ethereum', + 'eth': 'ethereum', + 'Ethereum': 'ethereum', + 'ETH': 'ethereum', + + # Ripple/XRP variants + 'ripple': 'ripple', + 'xrp': 'ripple', + 'Ripple': 'ripple', + 'XRP': 'ripple', + + # Solana variants + 'solana': 'solana', + 'sol': 'solana', + 'Solana': 'solana', + 'SOL': 'solana', + + # Cardano variants + 'cardano': 'cardano', + 'ada': 'cardano', + 'Cardano': 'cardano', + 'ADA': 'cardano', + + # Polkadot variants + 'polkadot': 'polkadot', + 'dot': 'polkadot', + 'Polkadot': 'polkadot', + 'DOT': 'polkadot', + + # Chainlink variants + 'chainlink': 'chainlink', + 'link': 'chainlink', + 'Chainlink': 'chainlink', + 'LINK': 'chainlink', + + # Litecoin variants + 'litecoin': 'litecoin', + 'ltc': 'litecoin', + 'Litecoin': 'litecoin', + 'LTC': 'litecoin', + + # Bitcoin Cash variants + 'bitcoin-cash': 'bitcoin-cash', + 'bch': 'bitcoin-cash', + 'Bitcoin Cash': 'bitcoin-cash', + 'BCH': 'bitcoin-cash', + + # Stellar variants + 'stellar': 'stellar', + 'xlm': 'stellar', + 'Stellar': 'stellar', + 'XLM': 'stellar', + + # Ethereum Classic variants + 'ethereum-classic': 'ethereum-classic', + 'etc': 'ethereum-classic', + 'Ethereum Classic': 'ethereum-classic', + 'ETC': 'ethereum-classic', + + # EOS variants + 'eos': 'eos', + 'EOS': 'eos', + } + + logger.info(f"Initialized symbol normalizer with {len(symbol_mapping)} mappings") + return symbol_mapping + + def normalize_symbol(self, symbol: str) -> str: + """ + Normalize a symbol to its canonical Santiment slug + + Args: + symbol: Symbol to normalize + + Returns: + Canonical slug + """ + if symbol in self.symbol_normalizer: + canonical = self.symbol_normalizer[symbol] + if symbol != canonical: + logger.debug(f"Normalized '{symbol}' -> '{canonical}'") + return canonical + + # If not found in mapping, return as-is but log warning + logger.warning(f"Unknown symbol '{symbol}' not found in normalization mapping") + return symbol.lower() + + def get_symbol_alternatives(self, symbol: str) -> List[str]: + """ + Get all 
alternative symbols for a given symbol (both directions) + + Args: + symbol: Symbol to find alternatives for + + Returns: + List of alternative symbols including the original + """ + alternatives = [symbol] + + # Create reverse mapping to find alternatives + reverse_mapping = {} + for variant, canonical in self.symbol_normalizer.items(): + if canonical not in reverse_mapping: + reverse_mapping[canonical] = [] + reverse_mapping[canonical].append(variant) + + # If symbol is a canonical, get all its variants + if symbol in reverse_mapping: + alternatives.extend(reverse_mapping[symbol]) + + # If symbol is a variant, get the canonical and other variants + canonical = self.normalize_symbol(symbol) + if canonical in reverse_mapping: + alternatives.extend(reverse_mapping[canonical]) + + # Remove duplicates and return + return list(set(alternatives)) + + def fetch_single_metric_with_alternatives(self, metric: str, slug: str, **kwargs) -> Optional[pd.DataFrame]: + """ + Fetch a single metric for a single asset, trying alternative symbols if the primary fails + + Args: + metric: The metric name + slug: The asset slug (will try alternatives if this fails) + **kwargs: Additional parameters for the API call + + Returns: + DataFrame with the metric data or None if failed + """ + # Get all alternative symbols to try + alternatives = self.get_symbol_alternatives(slug) + logger.debug(f"Trying alternatives for {slug}: {alternatives}") + + # Try each alternative in order (start with the normalized canonical form) + canonical = self.normalize_symbol(slug) + if canonical != slug: + alternatives = [canonical] + [alt for alt in alternatives if alt != canonical] + + for i, alt_slug in enumerate(alternatives): + try: + data = self.fetch_single_metric(metric, alt_slug, **kwargs) + if data is not None and not data.empty: + if i > 0 or alt_slug != slug: # Successfully fetched with alternative + logger.info(f"[ALT_SUCCESS] {metric} for {slug} succeeded using alternative '{alt_slug}'") + # Update slug column to reflect the original requested slug for consistency + data['slug'] = slug + data['alternative_slug_used'] = alt_slug + return data + except Exception as e: + error_msg = str(e) + # Check if this is a metric-level error that won't be fixed by trying other slugs + if any(skip_phrase in error_msg.lower() for skip_phrase in [ + 'not supported for', + 'not implemented for', + 'outside the allowed interval', + 'upgrade to a higher tier' + ]): + logger.warning(f"[METRIC_SKIP] {metric} has fundamental issues, skipping all alternatives: {error_msg}") + break # Don't try other alternatives for this metric + + # If it's just a slug issue, continue trying alternatives + if 'is not an existing slug' in error_msg.lower(): + logger.debug(f"Alternative {alt_slug} failed for {metric}: {e}") + continue + else: + logger.debug(f"Alternative {alt_slug} failed for {metric}: {e}") + continue + + logger.warning(f"[ALT_FAILED] All alternatives failed for {metric} with slug {slug}") + return None + + def normalize_slug_list(self, slugs: List[str]) -> List[str]: + """ + Normalize a list of slugs and remove duplicates + + Args: + slugs: List of slugs to normalize + + Returns: + List of normalized, deduplicated slugs + """ + normalized = [] + seen = set() + + for slug in slugs: + canonical = self.normalize_symbol(slug) + if canonical not in seen: + normalized.append(canonical) + seen.add(canonical) + else: + logger.debug(f"Removed duplicate slug: {slug} (canonical: {canonical})") + + logger.info(f"Normalized {len(slugs)} slugs to 
{len(normalized)} unique canonical slugs") + return normalized + + def _normalize_dates(self): + """ + Convert relative date strings in self.config.from_date / to_date + into absolute YYYY-MM-DD dates that Sanpy can parse. + Supports: + - "ND" (e.g. "30d") → today minus N days + - "utc_now" → today + """ + now = datetime.utcnow() + # from_date: e.g. "30d" + fd = self.config.from_date.strip().lower() + if fd.endswith('d') and fd[:-1].isdigit(): + days = int(fd[:-1]) + from_dt = now - timedelta(days=days) + # Sanpy expects "YYYY-MM-DD" + self.config.from_date = from_dt.strftime('%Y-%m-%d') + + # to_date: sometimes set to "utc_now" + td = self.config.to_date.strip().lower() + if td == 'utc_now': + self.config.to_date = now.strftime('%Y-%m-%d') + + def _setup_api_keys(self, api_key: Optional[str] = None): + """ + Set up multiple API keys for rate limit handling + + Args: + api_key: API key(s) - can be comma-separated for multiple keys + """ + # Parse API keys from parameter or environment + api_key_string = api_key or os.getenv('SANTIMENT_API_KEY') + + if api_key_string: + # Support comma-separated API keys + self.api_keys = [key.strip() for key in api_key_string.split(',') if key.strip()] + logger.info(f"Santiment fetcher initialized with {len(self.api_keys)} API key(s)") + + # Check if all keys are from the same account + if len(self.api_keys) > 1: + logger.info("Multiple API keys detected. Testing key diversity...") + self._validate_api_key_diversity() + else: + self.api_keys = [] + logger.warning("No API key provided - limited to free tier data") + + # Initialize API key management + self.current_key_index = 0 + self.rate_limit_switches = 0 + + # Set initial API key + if self.api_keys: + self._set_current_api_key() + + def _validate_api_key_diversity(self): + """ + Validate that API keys are from different accounts for effective rate limit handling + """ + try: + user_ids = set() + functional_keys = 0 + rate_limited_keys = 0 + + for i, key in enumerate(self.api_keys[:3]): # Test only first 3 to avoid exhausting quota + # Temporarily set this key + san.ApiConfig.api_key = key + + try: + # Make a simple query to get user info + result = san.execute_sql(query="SELECT 1", set_index=None) + + # If successful, key is functional but we can't determine user ID without error + functional_keys += 1 + logger.info(f"API Key #{i+1}: {key[:8]}... appears functional") + + except Exception as e: + error_str = str(e) + if 'user with id' in error_str: + # Extract user ID from error message + import re + match = re.search(r'user with id (\d+)', error_str) + if match: + user_id = match.group(1) + user_ids.add(user_id) + rate_limited_keys += 1 + logger.info(f"API Key #{i+1}: {key[:8]}... belongs to user ID {user_id} (rate limited)") + else: + logger.debug(f"API Key #{i+1}: {key[:8]}... 
- {error_str}") + + # Reset to first key + self.current_key_index = 0 + self._set_current_api_key() + + # Analyze results + if rate_limited_keys > 0 and len(user_ids) == 1: + if functional_keys > 0: + logger.warning("⚠️ WARNING: Cannot determine if all API keys are from different accounts!") + logger.warning(f"⚠️ {rate_limited_keys} key(s) belong to user ID {list(user_ids)[0]}, {functional_keys} key(s) appear functional") + logger.warning("⚠️ If functional keys are from the same account, rate limit switching won't work.") + logger.warning("⚠️ For guaranteed effective rate limiting, use API keys from different Santiment accounts.") + logger.warning("⚠️ Create additional accounts at https://app.santiment.net/") + else: + logger.warning("⚠️ WARNING: All tested API keys belong to the same Santiment account!") + logger.warning("⚠️ Rate limits are applied per account, not per key.") + logger.warning("⚠️ API key switching will not be effective with same-account keys.") + logger.warning("⚠️ Create additional accounts at https://app.santiment.net/") + elif len(user_ids) > 1: + logger.info(f"✅ Good! API keys are from {len(user_ids)} different accounts.") + logger.info("✅ This will provide effective rate limit distribution.") + elif functional_keys == len(self.api_keys): + logger.info("✅ All API keys appear functional.") + logger.info("ℹ️ Cannot determine account diversity without rate limit errors.") + logger.info("ℹ️ Monitor rate limit switches during operation to verify effectiveness.") + + except Exception as e: + logger.debug(f"Could not validate API key diversity: {e}") + logger.info("API key diversity validation skipped - continuing with provided keys") + + def _set_current_api_key(self): + """Set the current API key in san.ApiConfig""" + if self.api_keys: + current_key = self.api_keys[self.current_key_index] + san.ApiConfig.api_key = current_key + logger.info(f"Using API key #{self.current_key_index + 1}: {current_key[:8]}...") + else: + san.ApiConfig.api_key = None + + def _switch_api_key(self): + """Switch to the next available API key""" + if len(self.api_keys) <= 1: + logger.warning("Only one or no API keys available, cannot switch") + return False + + old_index = self.current_key_index + self.current_key_index = (self.current_key_index + 1) % len(self.api_keys) + self.rate_limit_switches += 1 + + logger.info(f"[SWITCH] Switching from API key #{old_index + 1} to #{self.current_key_index + 1} (switch #{self.rate_limit_switches})") + + # Warn if switching too frequently (indicates same account issue) + if self.rate_limit_switches > len(self.api_keys) * 2: + logger.warning("⚠️ High number of API key switches detected!") + logger.warning("⚠️ This suggests all keys may be from the same account.") + logger.warning("⚠️ Consider using API keys from different Santiment accounts.") + + # Set new API key + self._set_current_api_key() + + # Add a delay after switching keys + time.sleep(2.0) + return True + + def _is_rate_limit_error(self, error_message): + """Check if the error indicates a rate limit issue""" + rate_limit_indicators = [ + "429", + "rate limit", + "too many requests", + "api limit", + "quota exceeded", + "limit exceeded", + "rate_limit_exception", + "API Rate Limit Reached", + "rate limit reached" + ] + error_str = str(error_message).lower() + return any(indicator in error_str for indicator in rate_limit_indicators) + + def _cleanup_existing_files(self): + """ + Clean up all existing files in the export directory before starting a new fetch. 
+ This prevents accumulation of old data files from previous runs. + """ + import glob + import shutil + + if not os.path.exists(self.config.export_directory): + return + + try: + # Get all files in the export directory + all_files = glob.glob(os.path.join(self.config.export_directory, "*")) + + if all_files: + logger.info(f"Cleaning up {len(all_files)} existing files in {self.config.export_directory}") + + for file_path in all_files: + try: + if os.path.isfile(file_path): + os.remove(file_path) + logger.debug(f"Removed file: {os.path.basename(file_path)}") + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + logger.debug(f"Removed directory: {os.path.basename(file_path)}") + except Exception as e: + logger.warning(f"Failed to remove {file_path}: {e}") + + logger.info(f"Successfully cleaned up export directory: {self.config.export_directory}") + else: + logger.info(f"Export directory is already clean: {self.config.export_directory}") + + except Exception as e: + logger.error(f"Failed to cleanup export directory {self.config.export_directory}: {e}") + # Don't raise the exception - just log it and continue + + def _define_metric_categories(self) -> Dict[str, List[str]]: + """Define REDUCED categories of Santiment metrics for API conservation.""" + return { + # Essential Financial Metrics Only + 'financial': [ + 'price_usd', 'marketcap_usd', 'volume_usd' + # Reduced from 12 to 3 most important metrics + ], + + # Core Network Activity + 'network_activity': [ + 'daily_active_addresses', 'new_addresses' + # Reduced from 9 to 2 most important metrics + ], + + # Basic Transaction Metrics + 'transactions': [ + 'transaction_count', 'transaction_volume_usd' + # Reduced from 8 to 2 most important metrics + ], + + # Essential Exchange Metrics + 'exchange': [ + 'exchange_inflow', 'exchange_outflow' + # Reduced from 8 to 2 most important metrics + ] + + # Removed: supply, development, social, derivatives, whales + # This reduces API calls by ~70% while keeping core metrics + } + + def _initialize_metadata(self): + """Initialize metadata about available metrics and projects""" + try: + logger.info("Fetching available metrics...") + self.available_metrics = san.available_metrics() + logger.info(f"Found {len(self.available_metrics)} available metrics") + + logger.info("Fetching available projects...") + self.projects_df = san.get("projects/all") + self.available_slugs = self.projects_df['slug'].tolist() + logger.info(f"Found {len(self.available_slugs)} available projects") + + except Exception as e: + logger.error(f"Failed to initialize metadata: {e}") + self.available_metrics = [] + self.available_slugs = [] + + def get_metric_metadata(self, metric: str) -> Dict[str, Any]: + """ + Get metadata for a specific metric + + Args: + metric: The metric name + + Returns: + Dictionary containing metric metadata + """ + try: + metadata = san.metadata( + metric, + arr=["availableSlugs", "defaultAggregation", "humanReadableName", + "isAccessible", "isRestricted", "restrictedFrom", "restrictedTo"] + ) + return metadata + except Exception as e: + logger.warning(f"Failed to get metadata for {metric}: {e}") + return {} + + def fetch_single_metric(self, metric: str, slug: str, **kwargs) -> Optional[pd.DataFrame]: + """ + Fetch a single metric for a single asset + + Args: + metric: The metric name + slug: The asset slug + **kwargs: Additional parameters for the API call + + Returns: + DataFrame with the metric data or None if failed + """ + max_retries = len(self.api_keys) if self.api_keys else 1 + keys_tried = 
set() + + for attempt in range(max_retries): + try: + # If we've tried all keys, reset and wait + if len(keys_tried) >= len(self.api_keys) and self.api_keys: + logger.warning(f"All {len(self.api_keys)} API keys exhausted for {metric}, waiting 30 seconds...") + time.sleep(30) + keys_tried.clear() + self.current_key_index = 0 + self._set_current_api_key() + + params = { + 'slug': slug, + 'from_date': kwargs.get('from_date', self.config.from_date), + 'to_date': kwargs.get('to_date', self.config.to_date), + 'interval': kwargs.get('interval', self.config.interval), + 'include_incomplete_data': kwargs.get('include_incomplete_data', self.config.include_incomplete_data) + } + + # Add any additional selector parameters + if 'selector' in kwargs: + params['selector'] = kwargs['selector'] + + data = san.get(metric, **params) + + if data is not None and not data.empty: + # Add metadata columns + data['metric'] = metric + data['slug'] = slug + if attempt > 0: + logger.info(f"[SUCCESS] {metric} for {slug} succeeded on attempt {attempt + 1}") + return data + + except Exception as e: + error_msg = str(e) + keys_tried.add(self.current_key_index) + + # Check if it's a rate limit error + if self._is_rate_limit_error(error_msg) and self.api_keys: + logger.warning(f"[RATE_LIMIT] API key #{self.current_key_index + 1} hit rate limit for {metric}: {error_msg}") + + # Check if we've tried all keys + if len(keys_tried) >= len(self.api_keys): + logger.error(f"All {len(self.api_keys)} API keys exhausted for {metric}. Skipping.") + break # Exit retry loop since all keys are exhausted + + # Try to switch to next API key + if self._switch_api_key(): + continue # Retry with new API key + else: + logger.error("No more API keys available for switching") + + # Handle rate limit with san library specific check + if hasattr(san, 'is_rate_limit_exception') and san.is_rate_limit_exception(e): + if hasattr(san, 'rate_limit_time_left'): + rate_limit_seconds = san.rate_limit_time_left(e) + logger.warning(f"Santiment rate limit hit. 
Sleeping for {rate_limit_seconds} seconds") + time.sleep(rate_limit_seconds) + else: + # Try switching API key if available + if self.api_keys and self._switch_api_key(): + continue + else: + time.sleep(60) # Default wait + else: + # Check for specific error types that mean we should skip this metric entirely + if any(skip_phrase in error_msg.lower() for skip_phrase in [ + 'not supported for', + 'is not an existing slug', + 'not implemented for', + 'missing_contract', + 'outside the allowed interval', + 'upgrade to a higher tier' + ]): + logger.warning(f"[SKIP] {metric} for {slug} - {error_msg}") + return None # Skip this metric/slug combination entirely + + logger.error(f"Failed to fetch {metric} for {slug}: {error_msg}") + + error_info = { + 'metric': metric, + 'slug': slug, + 'error': error_msg, + 'timestamp': datetime.now().isoformat(), + 'api_key_index': self.current_key_index + } + self.failed_queries.append(error_info) + + return None + + def fetch_multi_asset_metric(self, metric: str, slugs: List[str], **kwargs) -> Optional[pd.DataFrame]: + """ + Fetch a single metric for multiple assets using get_many + + Args: + metric: The metric name + slugs: List of asset slugs + **kwargs: Additional parameters for the API call + + Returns: + DataFrame with the metric data or None if failed + """ + max_retries = len(self.api_keys) if self.api_keys else 1 + keys_tried = set() + + for attempt in range(max_retries): + try: + # If we've tried all keys, reset and wait + if len(keys_tried) >= len(self.api_keys) and self.api_keys: + logger.warning(f"All {len(self.api_keys)} API keys exhausted for {metric}, waiting 30 seconds...") + time.sleep(30) + keys_tried.clear() + self.current_key_index = 0 + self._set_current_api_key() + + params = { + 'slugs': slugs, + 'from_date': kwargs.get('from_date', self.config.from_date), + 'to_date': kwargs.get('to_date', self.config.to_date), + 'interval': kwargs.get('interval', self.config.interval), + 'include_incomplete_data': kwargs.get('include_incomplete_data', self.config.include_incomplete_data) + } + + data = san.get_many(metric, **params) + + if data is not None and not data.empty: + # Reshape data for consistent format + data_melted = data.reset_index().melt( + id_vars=['datetime'], + var_name='slug', + value_name='value' + ) + data_melted['metric'] = metric + data_melted.set_index('datetime', inplace=True) + if attempt > 0: + logger.info(f"[SUCCESS] {metric} for multiple assets succeeded on attempt {attempt + 1}") + return data_melted + + except Exception as e: + error_msg = str(e) + keys_tried.add(self.current_key_index) + + # Check if it's a rate limit error + if self._is_rate_limit_error(error_msg) and self.api_keys: + logger.warning(f"[RATE_LIMIT] API key #{self.current_key_index + 1} hit rate limit for {metric}: {error_msg}") + + # Check if we've tried all keys + if len(keys_tried) >= len(self.api_keys): + logger.error(f"All {len(self.api_keys)} API keys exhausted for {metric}. Skipping.") + break # Exit retry loop since all keys are exhausted + + # Try to switch to next API key + if self._switch_api_key(): + continue # Retry with new API key + else: + logger.error("No more API keys available for switching") + + # Handle rate limit with san library specific check + if hasattr(san, 'is_rate_limit_exception') and san.is_rate_limit_exception(e): + if hasattr(san, 'rate_limit_time_left'): + rate_limit_seconds = san.rate_limit_time_left(e) + logger.warning(f"Santiment rate limit hit. 
Sleeping for {rate_limit_seconds} seconds") + time.sleep(rate_limit_seconds) + else: + # Try switching API key if available + if self.api_keys and self._switch_api_key(): + continue + else: + time.sleep(60) # Default wait + else: + logger.error(f"Failed to fetch {metric} for multiple assets: {error_msg}") + + error_info = { + 'metric': metric, + 'slugs': slugs, + 'error': error_msg, + 'timestamp': datetime.now().isoformat(), + 'api_key_index': self.current_key_index + } + self.failed_queries.append(error_info) + + return None + + def fetch_category_batch(self, category: str, slugs: List[str], use_async_batch: bool = True) -> Dict[str, pd.DataFrame]: + """ + Fetch all metrics in a category using batch operations with symbol alternatives fallback + + Args: + category: The metric category name + slugs: List of asset slugs to fetch for + use_async_batch: Whether to use AsyncBatch (recommended) or Batch + + Returns: + Dictionary mapping metric names to DataFrames + """ + if category not in self.metric_categories: + logger.error(f"Unknown category: {category}") + return {} + + metrics = self.metric_categories[category] + category_data = {} + + # Filter metrics that are actually available + available_metrics_in_category = [m for m in metrics if m in self.available_metrics] + + if not available_metrics_in_category: + logger.warning(f"No available metrics found for category: {category}") + return {} + + logger.info(f"Fetching {len(available_metrics_in_category)} metrics for category: {category}") + + # First try batch operation with normalized slugs + normalized_slugs = self.normalize_slug_list(slugs) + batch_success = self._try_batch_fetch(category, available_metrics_in_category, normalized_slugs, use_async_batch) + category_data.update(batch_success) + + # For failed metrics, try individual fetches with alternatives + failed_metrics = [m for m in available_metrics_in_category if m not in batch_success] + if failed_metrics: + logger.info(f"Retrying {len(failed_metrics)} failed metrics with alternatives") + individual_results = self._fetch_failed_metrics_with_alternatives(failed_metrics, slugs) + category_data.update(individual_results) + + return category_data + + def _try_batch_fetch(self, category: str, metrics: List[str], slugs: List[str], use_async_batch: bool) -> Dict[str, pd.DataFrame]: + """Try batch fetch operation""" + category_data = {} + + try: + if use_async_batch: + batch = san.AsyncBatch() + else: + batch = san.Batch() + + # Add queries to batch + for metric in metrics: + try: + if len(slugs) == 1: + batch.get( + metric, + slug=slugs[0], + from_date=self.config.from_date, + to_date=self.config.to_date, + interval=self.config.interval, + include_incomplete_data=self.config.include_incomplete_data + ) + else: + batch.get_many( + metric, + slugs=slugs, + from_date=self.config.from_date, + to_date=self.config.to_date, + interval=self.config.interval, + include_incomplete_data=self.config.include_incomplete_data + ) + except Exception as e: + logger.warning(f"Failed to add {metric} to batch: {e}") + + # Execute batch + if use_async_batch: + results = batch.execute(max_workers=self.config.max_workers) + else: + results = batch.execute() + + # Process results + for i, (metric, result) in enumerate(zip(metrics, results)): + if result is not None and not result.empty: + if len(slugs) > 1: + # Reshape multi-asset data + result_melted = result.reset_index().melt( + id_vars=['datetime'], + var_name='slug', + value_name='value' + ) + result_melted['metric'] = metric + 
result_melted.set_index('datetime', inplace=True) + category_data[metric] = result_melted + else: + result['metric'] = metric + result['slug'] = slugs[0] + category_data[metric] = result + else: + logger.debug(f"No data received for metric: {metric} in batch") + + except Exception as e: + logger.error(f"Batch execution failed for category {category}: {e}") + + return category_data + + def _fetch_failed_metrics_with_alternatives(self, metrics: List[str], original_slugs: List[str]) -> Dict[str, pd.DataFrame]: + """Fetch failed metrics individually using symbol alternatives""" + individual_data = {} + + for metric in metrics: + logger.info(f"Retrying {metric} with symbol alternatives...") + + if len(original_slugs) == 1: + # Single asset - use alternatives + result = self.fetch_single_metric_with_alternatives(metric, original_slugs[0]) + if result is not None: + individual_data[metric] = result + else: + # Multiple assets - try each with alternatives and combine + all_results = [] + for slug in original_slugs: + result = self.fetch_single_metric_with_alternatives(metric, slug) + if result is not None: + all_results.append(result) + + if all_results: + # Concatenate results - they already have datetime as index + combined_result = pd.concat(all_results, ignore_index=False, sort=False) + # Ensure datetime index is properly set + if not isinstance(combined_result.index, pd.DatetimeIndex): + if 'datetime' in combined_result.columns: + combined_result.set_index('datetime', inplace=True) + individual_data[metric] = combined_result + + return individual_data + + def fetch_special_metrics(self, slugs: List[str]) -> Dict[str, pd.DataFrame]: + """ + Fetch special metrics that have different API signatures + + Args: + slugs: List of asset slugs + + Returns: + Dictionary mapping metric names to DataFrames + """ + special_data = {} + + for slug in slugs: + max_retries = len(self.api_keys) if self.api_keys else 1 + keys_tried = set() + + for attempt in range(max_retries): + try: + # If we've tried all keys, reset and wait + if len(keys_tried) >= len(self.api_keys) and self.api_keys: + logger.warning(f"All {len(self.api_keys)} API keys exhausted for special metrics on {slug}, waiting 30 seconds...") + time.sleep(30) + keys_tried.clear() + self.current_key_index = 0 + self._set_current_api_key() + + # OHLCV data + logger.info(f"Fetching OHLCV data for {slug}") + ohlcv = san.get( + f"ohlcv/{slug}", + from_date=self.config.from_date, + to_date=self.config.to_date, + interval=self.config.interval + ) + if ohlcv is not None and not ohlcv.empty: + ohlcv['metric'] = 'ohlcv' + ohlcv['slug'] = slug + special_data[f'ohlcv_{slug}'] = ohlcv + + # Prices with OHLC format + logger.info(f"Fetching detailed prices for {slug}") + prices = san.get( + "prices", + slug=slug, + from_date=self.config.from_date, + to_date=self.config.to_date, + interval=self.config.interval + ) + if prices is not None and not prices.empty: + prices['metric'] = 'prices_detailed' + prices['slug'] = slug + special_data[f'prices_{slug}'] = prices + + # If we get here, the attempt was successful + break + + except Exception as e: + error_msg = str(e) + keys_tried.add(self.current_key_index) + + # Check if it's a rate limit error + if self._is_rate_limit_error(error_msg) and self.api_keys: + logger.warning(f"[RATE_LIMIT] API key #{self.current_key_index + 1} hit rate limit for special metrics on {slug}: {error_msg}") + + # Check if we've tried all keys + if len(keys_tried) >= len(self.api_keys): + logger.error(f"All {len(self.api_keys)} API keys 
exhausted for special metrics on {slug}. Skipping.") + break # Exit retry loop since all keys are exhausted + + # Try to switch to next API key + if self._switch_api_key(): + continue # Retry with new API key + else: + logger.error("No more API keys available for switching") + + logger.error(f"Failed to fetch special metrics for {slug}: {e}") + break # Exit retry loop for this slug + + return special_data + + def fetch_blockchain_address_data(self, addresses: List[str], slugs: List[str]) -> Dict[str, pd.DataFrame]: + """ + Fetch blockchain address-related data + + Args: + addresses: List of blockchain addresses + slugs: List of asset slugs for context + + Returns: + Dictionary mapping data types to DataFrames + """ + address_data = {} + + for slug in slugs: + for address in addresses: + try: + # Historical balance + balance = san.get( + "historical_balance", + slug=slug, + address=address, + from_date=self.config.from_date, + to_date=self.config.to_date, + interval=self.config.interval + ) + if balance is not None and not balance.empty: + balance['address'] = address + balance['slug'] = slug + address_data[f'historical_balance_{slug}_{address[:8]}'] = balance + + # Top transactions + top_txs = san.get( + "eth_top_transactions", + slug=slug, + from_date=self.config.from_date, + to_date=self.config.to_date, + limit=100, + transaction_type="ALL" + ) + if top_txs is not None and not top_txs.empty: + top_txs['slug'] = slug + address_data[f'eth_top_transactions_{slug}'] = top_txs + + except Exception as e: + logger.error(f"Failed to fetch address data for {address} on {slug}: {e}") + + return address_data + + def execute_custom_sql_queries(self) -> Dict[str, pd.DataFrame]: + """ + Execute custom SQL queries for additional data insights, using dictGetString for asset metadata. 
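+
+        Each bundled query is dispatched through san.execute_sql; an illustrative
+        sketch of that call (the SQL text here is a simplified placeholder, not one
+        of the bundled queries):
+
+            df = san.execute_sql(
+                query="SELECT dt, value FROM daily_metrics_v2 LIMIT 10",
+                set_index="dt",
+            )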
+ + Returns: + Dictionary mapping query names to DataFrames + """ + sql_data = {} + custom_queries = { + 'top_assets_by_volume': """ + SELECT + dictGetString('default.asset_metadata_dict', 'name', asset_id) as asset_name, + dictGetString('default.asset_metadata_dict', 'slug', asset_id) as slug, + SUM(value) as total_volume + FROM daily_metrics_v2 + WHERE metric_id = get_metric_id('volume_usd') + AND dt >= now() - INTERVAL 30 DAY + GROUP BY asset_id + ORDER BY total_volume DESC + LIMIT 50 + """, + 'recent_high_activity_addresses': """ + SELECT + dictGetString('default.asset_metadata_dict', 'name', asset_id) as asset_name, + get_metric_name(metric_id) as metric_name, + dt, + value + FROM daily_metrics_v2 + WHERE metric_id = get_metric_id('daily_active_addresses') + AND dt >= now() - INTERVAL 7 DAY + AND value > 1000 + ORDER BY dt DESC, value DESC + LIMIT 100 + """, + 'exchange_flow_summary': """ + SELECT + dictGetString('default.asset_metadata_dict', 'name', asset_id) as asset_name, + dt, + SUM(CASE WHEN metric_id = get_metric_id('exchange_inflow') THEN value ELSE 0 END) as inflow, + SUM(CASE WHEN metric_id = get_metric_id('exchange_outflow') THEN value ELSE 0 END) as outflow + FROM daily_metrics_v2 + WHERE metric_id IN (get_metric_id('exchange_inflow'), get_metric_id('exchange_outflow')) + AND dt >= now() - INTERVAL 30 DAY + GROUP BY asset_id, dt + ORDER BY dt DESC + LIMIT 1000 + """ + } + for query_name, query in custom_queries.items(): + try: + logger.info(f"Executing SQL query: {query_name}") + result = san.execute_sql(query=query, set_index="dt" if "dt" in query else None) + if result is not None and not result.empty: + sql_data[query_name] = result + logger.info(f"SQL query {query_name} returned {len(result)} rows") + except Exception as e: + logger.error(f"Failed to execute SQL query {query_name}: {e}") + return sql_data + + def fetch_comprehensive_data(self, + slugs: List[str] = None, + categories: List[str] = None, + include_special_metrics: bool = True, + include_sql_queries: bool = True, + addresses: List[str] = None) -> Dict[str, Any]: + """ + Fetch comprehensive data across all categories and metrics + + Args: + slugs: List of asset slugs (if None, uses top assets) + categories: List of categories to fetch (if None, fetches all) + include_special_metrics: Whether to include special format metrics + include_sql_queries: Whether to execute custom SQL queries + addresses: List of blockchain addresses for address-specific data + + Returns: + Dictionary containing all fetched data organized by category + """ + # Set defaults + if slugs is None: + slugs = ['bitcoin', 'ethereum', 'cardano', 'polkadot', 'chainlink', + 'litecoin', 'bitcoin-cash', 'stellar', 'ethereum-classic', 'eos'] + + # Normalize and deduplicate slugs + slugs = self.normalize_slug_list(slugs) + + if categories is None: + categories = list(self.metric_categories.keys()) + + # Limit slugs for free tier + if not san.ApiConfig.api_key: + slugs = slugs[:3] # Limit to 3 assets for free tier + logger.warning("No API key detected. Limiting to 3 assets to avoid rate limits.") + + all_data = {} + start_time = datetime.now() + + logger.info(f"Starting comprehensive data fetch for {len(slugs)} assets across {len(categories)} categories") + + # Check if all API keys are exhausted early + all_keys_exhausted = False + if self.api_keys and self.rate_limit_switches > len(self.api_keys) * 3: + logger.warning("⚠️ All API keys appear to be rate-limited. 
Attempting reduced fetch...") + all_keys_exhausted = True + + # Fetch data by category + for category in categories: + if all_keys_exhausted: + logger.info(f"Skipping category {category} due to API exhaustion") + continue + + logger.info(f"Fetching category: {category}") + category_data = self.fetch_category_batch(category, slugs, use_async_batch=True) + + if category_data: + all_data[category] = category_data + # Store individual DataFrames for later use + for metric_name, df in category_data.items(): + self.fetched_data[f"{category}_{metric_name}"] = df + + # Check if we should stop due to rate limits + if self.rate_limit_switches > len(self.api_keys) * 5: + logger.warning("⚠️ Excessive rate limit switches detected. Stopping data fetch to avoid further exhaustion.") + all_keys_exhausted = True + break + + # Fetch special metrics (only if not exhausted) + if include_special_metrics and not all_keys_exhausted: + logger.info("Fetching special metrics...") + special_data = self.fetch_special_metrics(slugs) + if special_data: + all_data['special_metrics'] = special_data + self.fetched_data.update(special_data) + elif all_keys_exhausted: + logger.info("Skipping special metrics due to API exhaustion") + + # Fetch blockchain address data + if addresses and not all_keys_exhausted: + logger.info("Fetching blockchain address data...") + address_data = self.fetch_blockchain_address_data(addresses, slugs) + if address_data: + all_data['address_data'] = address_data + self.fetched_data.update(address_data) + elif addresses and all_keys_exhausted: + logger.info("Skipping blockchain address data due to API exhaustion") + + # Execute SQL queries (only if not exhausted) + if include_sql_queries and san.ApiConfig.api_key and not all_keys_exhausted: + logger.info("Executing custom SQL queries...") + sql_data = self.execute_custom_sql_queries() + if sql_data: + all_data['sql_queries'] = sql_data + self.fetched_data.update(sql_data) + elif all_keys_exhausted: + logger.info("Skipping SQL queries due to API exhaustion") + + end_time = datetime.now() + duration = end_time - start_time + + logger.info(f"Comprehensive data fetch completed in {duration}") + logger.info(f"Successfully fetched {len(self.fetched_data)} datasets") + logger.info(f"Failed queries: {len(self.failed_queries)}") + + # Add exhaustion notice to summary + if all_keys_exhausted: + logger.warning("⚠️ Data fetch completed with API rate limit exhaustion - some data may be missing") + + # Generate summary + summary = self._generate_fetch_summary(all_data, duration) + summary['all_keys_exhausted'] = all_keys_exhausted + summary['rate_limit_switches'] = self.rate_limit_switches + all_data['fetch_summary'] = summary + + return all_data + + def _generate_fetch_summary(self, data: Dict[str, Any], duration: timedelta) -> Dict[str, Any]: + """Generate a summary of the data fetching operation""" + summary = { + 'fetch_duration': str(duration), + 'total_datasets': len(self.fetched_data), + 'failed_queries': len(self.failed_queries), + 'categories_fetched': list(data.keys()), + 'data_points_by_category': {}, + 'date_range': f"{self.config.from_date} to {self.config.to_date}", + 'interval': self.config.interval, + 'timestamp': datetime.now().isoformat() + } + + # Count data points by category + for category, category_data in data.items(): + if isinstance(category_data, dict): + total_points = sum(len(df) for df in category_data.values() if isinstance(df, pd.DataFrame)) + summary['data_points_by_category'][category] = total_points + + return summary + + def 
export_data(self, + export_format: str = None, + combine_categories: bool = False, + include_metadata: bool = True) -> Dict[str, str]: + """ + Export fetched data to files + + Args: + export_format: Export format ('csv', 'json', 'parquet') + combine_categories: Whether to combine all data into single files + include_metadata: Whether to include metadata files + + Returns: + Dictionary mapping data names to file paths + """ + export_format = export_format or self.config.export_format + exported_files = {} + + if not self.fetched_data: + logger.warning("No data to export") + return exported_files + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + if combine_categories: + # Combine all DataFrames + all_dfs = [] + for name, df in self.fetched_data.items(): + if isinstance(df, pd.DataFrame) and not df.empty: + df_copy = df.copy() + df_copy['dataset_name'] = name + all_dfs.append(df_copy) + + if all_dfs: + combined_df = pd.concat(all_dfs, ignore_index=True, sort=False) + filename = f"santiment_comprehensive_data_{timestamp}.{export_format}" + filepath = os.path.join(self.config.export_directory, filename) + + self._export_dataframe(combined_df, filepath, export_format) + exported_files['combined_data'] = filepath + else: + # Export individual datasets + for name, df in self.fetched_data.items(): + if isinstance(df, pd.DataFrame) and not df.empty: + filename = f"santiment_{name}_{timestamp}.{export_format}" + filepath = os.path.join(self.config.export_directory, filename) + + self._export_dataframe(df, filepath, export_format) + exported_files[name] = filepath + + # Export metadata and summary + if include_metadata: + metadata = { + 'failed_queries': self.failed_queries, + 'available_metrics': self.available_metrics, + 'config': { + 'from_date': self.config.from_date, + 'to_date': self.config.to_date, + 'interval': self.config.interval, + 'batch_size': self.config.batch_size + }, + 'export_timestamp': datetime.now().isoformat() + } + + metadata_file = os.path.join(self.config.export_directory, f"santiment_metadata_{timestamp}.json") + with open(metadata_file, 'w') as f: + json.dump(metadata, f, indent=2) + exported_files['metadata'] = metadata_file + + logger.info(f"Exported {len(exported_files)} files to {self.config.export_directory}") + return exported_files + + def _export_dataframe(self, df: pd.DataFrame, filepath: str, format_type: str): + """Export a DataFrame to the specified format""" + try: + if format_type == 'csv': + df.to_csv(filepath) + elif format_type == 'json': + df.to_json(filepath, date_format='iso', orient='records') + elif format_type == 'parquet': + df.to_parquet(filepath) + else: + logger.error(f"Unsupported export format: {format_type}") + return + + logger.info(f"Exported DataFrame to {filepath}") + + except Exception as e: + logger.error(f"Failed to export DataFrame to {filepath}: {e}") + + def get_api_usage_stats(self) -> Dict[str, Any]: + """Get API usage statistics""" + try: + stats = { + 'calls_made': san.api_calls_made(), + 'calls_remaining': san.api_calls_remaining(), + 'failed_queries': len(self.failed_queries), + 'successful_datasets': len(self.fetched_data) + } + return stats + except Exception as e: + logger.error(f"Failed to get API usage stats: {e}") + return {} + + def print_summary(self): + """Print a comprehensive summary of the fetching operation""" + print("\n" + "="*60) + print("SANTIMENT DATA FETCHER SUMMARY") + print("="*60) + + # Basic stats + print(f"Total datasets fetched: {len(self.fetched_data)}") + print(f"Failed queries: 
{len(self.failed_queries)}") + + # Configuration info + print(f"\nConfiguration:") + print(f" Date range: {self.config.from_date} to {self.config.to_date}") + print(f" Interval: {self.config.interval}") + print(f" Export directory: {self.config.export_directory}") + + # Categories summary + if self.fetched_data: + print(f"\nData by category:") + category_counts = {} + for key in self.fetched_data.keys(): + if '_' in key: + category = key.split('_')[0] + category_counts[category] = category_counts.get(category, 0) + 1 + + for category, count in sorted(category_counts.items()): + print(f" {category}: {count} datasets") + + # Sample data info + if self.fetched_data: + print(f"\nSample datasets:") + for i, (name, df) in enumerate(list(self.fetched_data.items())[:5]): + if isinstance(df, pd.DataFrame): + print(f" {name}: {len(df)} rows, {len(df.columns)} columns") + if not df.empty: + date_range = f"{df.index.min()} to {df.index.max()}" if hasattr(df.index, 'min') else "N/A" + print(f" Date range: {date_range}") + + # Failed queries summary + if self.failed_queries: + print(f"\nFailed queries summary:") + error_types = {} + for failed in self.failed_queries: + error_msg = str(failed.get('error', 'Unknown error')) + error_type = error_msg.split(':')[0] if ':' in error_msg else error_msg + error_types[error_type] = error_types.get(error_type, 0) + 1 + + for error_type, count in sorted(error_types.items()): + print(f" {error_type}: {count} occurrences") + + # API usage stats + try: + api_stats = self.get_api_usage_stats() + if api_stats: + print(f"\nAPI Usage:") + print(f" Calls made: {api_stats.get('calls_made', 'N/A')}") + print(f" Calls remaining: {api_stats.get('calls_remaining', 'N/A')}") + except: + pass + + print("="*60) + + def analyze_data_quality(self) -> Dict[str, Any]: + """Analyze the quality of fetched data""" + quality_report = { + 'total_datasets': len(self.fetched_data), + 'empty_datasets': 0, + 'datasets_with_nulls': 0, + 'date_coverage': {}, + 'data_completeness': {}, + 'outliers_detected': {} + } + + for name, df in self.fetched_data.items(): + if isinstance(df, pd.DataFrame): + # Check if dataset is empty + if df.empty: + quality_report['empty_datasets'] += 1 + continue + + # Check for null values + if df.isnull().any().any(): + quality_report['datasets_with_nulls'] += 1 + null_percentage = (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100 + quality_report['data_completeness'][name] = f"{100 - null_percentage:.2f}%" + + # Analyze date coverage + if hasattr(df.index, 'min') and hasattr(df.index, 'max'): + try: + date_range = { + 'start': str(df.index.min()), + 'end': str(df.index.max()), + 'days': (df.index.max() - df.index.min()).days if hasattr(df.index.max() - df.index.min(), 'days') else 'N/A' + } + quality_report['date_coverage'][name] = date_range + except: + quality_report['date_coverage'][name] = 'Unable to determine' + + # Simple outlier detection for numeric columns + numeric_cols = df.select_dtypes(include=[np.number]).columns + outlier_info = {} + for col in numeric_cols: + if col not in ['metric', 'slug']: # Skip metadata columns + try: + q1 = df[col].quantile(0.25) + q3 = df[col].quantile(0.75) + iqr = q3 - q1 + lower_bound = q1 - 1.5 * iqr + upper_bound = q3 + 1.5 * iqr + outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)] + if len(outliers) > 0: + outlier_info[col] = len(outliers) + except: + continue + + if outlier_info: + quality_report['outliers_detected'][name] = outlier_info + + return quality_report + + def 
create_data_dashboard(self) -> str:
+        """Create a simple HTML dashboard summarizing the fetched data"""
+        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        total_datasets = len(self.fetched_data)
+        date_range = f"{self.config.from_date} to {self.config.to_date}"
+
+        html_content = f"""<!DOCTYPE html>
+<html>
+<head>
+    <title>Santiment Data Dashboard</title>
+</head>
+<body>
+    <h1>Santiment Data Dashboard</h1>
+    <p>Generated on: {timestamp}</p>
+    <p>Total Datasets: {total_datasets}</p>
+    <p>Date Range: {date_range}</p>
+"""
+
+        # Add category summary
+        if self.fetched_data:
+            category_counts = {}
+            for key in self.fetched_data.keys():
+                if '_' in key:
+                    category = key.split('_')[0]
+                    category_counts[category] = category_counts.get(category, 0) + 1
+
+            html_content += """
+    <div class="section">
+        <h2>Categories Overview</h2>
+"""
+            for category, count in sorted(category_counts.items()):
+                html_content += f'        <div class="category"><b>{category}</b>: {count} datasets</div>\n'
+            html_content += "    </div>\n"
+
+        # Add failed queries section
+        if self.failed_queries:
+            html_content += """
+    <div class="section">
+        <h2>Failed Queries</h2>
+        <table border="1">
+            <tr><th>Metric</th><th>Slug</th><th>Error</th></tr>
+"""
+            for failed in self.failed_queries[:10]:  # Show first 10
+                metric = failed.get('metric', 'N/A')
+                slug = failed.get('slug', failed.get('slugs', 'N/A'))
+                error = str(failed.get('error', 'Unknown'))[:100] + '...' if len(str(failed.get('error', ''))) > 100 else failed.get('error', 'Unknown')
+                html_content += f"            <tr><td>{metric}</td><td>{slug}</td><td>{error}</td></tr>\n"
+            html_content += "        </table>\n    </div>\n"
+
+        html_content += "</body>\n</html>"
+
+        # Save dashboard
+        dashboard_path = os.path.join(
+            self.config.export_directory,
+            f"santiment_dashboard_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
+        )
+        with open(dashboard_path, 'w') as f:
+            f.write(html_content)
+
+        logger.info(f"Dashboard created at {dashboard_path}")
+        return dashboard_path
+ + def get_top_performing_assets(self, metric: str = 'price_usd', days: int = 30) -> pd.DataFrame: + """ + Analyze top performing assets based on a specific metric + + Args: + metric: The metric to analyze performance on + days: Number of days to look back for performance calculation + + Returns: + DataFrame with performance analysis + """ + performance_data = [] + + for name, df in self.fetched_data.items(): + if isinstance(df, pd.DataFrame) and metric in str(name) and not df.empty: + try: + if 'slug' in df.columns: + # Group by slug and calculate performance + for slug in df['slug'].unique(): + slug_data = df[df['slug'] == slug].copy() + if len(slug_data) >= 2: + slug_data = slug_data.sort_index() + + # Calculate performance over the specified period + if len(slug_data) > days: + recent_data = slug_data.tail(days) + else: + recent_data = slug_data + + if 'value' in recent_data.columns and not recent_data['value'].empty: + start_value = recent_data['value'].iloc[0] + end_value = recent_data['value'].iloc[-1] + + if start_value and start_value != 0: + performance = ((end_value - start_value) / start_value) * 100 + + performance_data.append({ + 'slug': slug, + 'metric': metric, + 'start_value': start_value, + 'end_value': end_value, + 'performance_pct': performance, + 'data_points': len(recent_data), + 'period_days': days + }) + except Exception as e: + logger.warning(f"Failed to analyze performance for {name}: {e}") + + if performance_data: + performance_df = pd.DataFrame(performance_data) + return performance_df.sort_values('performance_pct', ascending=False) + else: + return pd.DataFrame() + + def cleanup_export_directory(self) -> bool: + """ + Manually clean up the export directory. + + Returns: + bool: True if cleanup was successful, False otherwise + """ + try: + self._cleanup_existing_files() + return True + except Exception as e: + logger.error(f"Manual cleanup failed: {e}") + return False + + def get_api_key_status(self): + """Get status information about API key usage""" + if not self.api_keys: + return { + "total_keys": 0, + "current_key": "None", + "rate_limit_switches": self.rate_limit_switches, + "current_key_preview": "No API key" + } + + return { + "total_keys": len(self.api_keys), + "current_key": self.current_key_index + 1, + "rate_limit_switches": self.rate_limit_switches, + "current_key_preview": self.api_keys[self.current_key_index][:8] + "..."
+ } + + def print_api_key_status(self): + """Print API key usage status""" + status = self.get_api_key_status() + print(f"\n[API_STATUS] Using {status['total_keys']} API key(s)") + if status['total_keys'] > 0: + print(f"[API_STATUS] Current: Key #{status['current_key']} ({status['current_key_preview']})") + print(f"[API_STATUS] Rate limit switches: {status['rate_limit_switches']}") + if status['rate_limit_switches'] > 0: + print(f"[API_STATUS] Effective rate limit handling active") + else: + print(f"[API_STATUS] No API keys configured - using free tier") + print() + + def save_configuration(self, config_path: str = None) -> str: + """Save current configuration to a JSON file""" + if config_path is None: + config_path = os.path.join(self.config.export_directory, "santiment_config.json") + + config_dict = { + 'from_date': self.config.from_date, + 'to_date': self.config.to_date, + 'interval': self.config.interval, + 'include_incomplete_data': self.config.include_incomplete_data, + 'batch_size': self.config.batch_size, + 'max_workers': self.config.max_workers, + 'rate_limit_delay': self.config.rate_limit_delay, + 'export_format': self.config.export_format, + 'export_directory': self.config.export_directory, + 'saved_at': datetime.now().isoformat() + } + + with open(config_path, 'w') as f: + json.dump(config_dict, f, indent=2) + + logger.info(f"Configuration saved to {config_path}") + return config_path + + @classmethod + def load_configuration(cls, config_path: str) -> 'SantimentDataFetcher': + """Load configuration from a JSON file and create a fetcher instance""" + with open(config_path, 'r') as f: + config_dict = json.load(f) + + # Remove metadata fields + config_dict.pop('saved_at', None) + + config = FetchConfig(**config_dict) + return cls(config=config) + + +# Utility functions for easy usage +def cleanup_santiment_directory(directory_path: str = "data/santiment") -> bool: + """ + Utility function to clean up a Santiment data directory without creating a fetcher instance. 
+ + Args: + directory_path: Path to the directory to clean up + + Returns: + bool: True if cleanup was successful, False otherwise + """ + import glob + import shutil + + try: + if not os.path.exists(directory_path): + logger.info(f"Directory does not exist: {directory_path}") + return True + + # Get all files in the directory + all_files = glob.glob(os.path.join(directory_path, "*")) + + if all_files: + logger.info(f"Cleaning up {len(all_files)} existing files in {directory_path}") + + for file_path in all_files: + try: + if os.path.isfile(file_path): + os.remove(file_path) + logger.debug(f"Removed file: {os.path.basename(file_path)}") + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + logger.debug(f"Removed directory: {os.path.basename(file_path)}") + except Exception as e: + logger.warning(f"Failed to remove {file_path}: {e}") + + logger.info(f"Successfully cleaned up directory: {directory_path}") + else: + logger.info(f"Directory is already clean: {directory_path}") + + return True + + except Exception as e: + logger.error(f"Failed to cleanup directory {directory_path}: {e}") + return False + +def fetch_quick_crypto_overview(assets: List[str] = None, api_key: str = None) -> Dict[str, pd.DataFrame]: + """ + Quick function to fetch essential crypto data for analysis + + Args: + assets: List of asset slugs (defaults to top 10 cryptos) + api_key: Santiment API key + + Returns: + Dictionary with essential data + """ + if assets is None: + assets = ['bitcoin', 'ethereum', 'solana', 'ripple', 'cardano'] + + config = FetchConfig( + from_date="2025-07-01", # Changed to be within free tier allowed range + to_date="2025-07-06", # Use last valid date for free tier + interval="30m", + export_format="parquet" + ) + + fetcher = SantimentDataFetcher(api_key=api_key, config=config) + + # Fetch essential categories + essential_categories = ['financial', 'network_activity', 'exchange'] + + data = fetcher.fetch_comprehensive_data( + slugs=assets, + categories=essential_categories, + include_special_metrics=True, + include_sql_queries=False + ) + + return data + +def create_crypto_report(assets: List[str], output_dir: str = "./crypto_report", api_key: str = None): + """ + Create a comprehensive crypto analysis report + + Args: + assets: List of asset slugs to analyze + output_dir: Directory to save the report + api_key: Santiment API key(s) - can be comma-separated for multiple keys + """ + config = FetchConfig( + from_date="2025-07-01", # Changed to be within free tier allowed range + to_date="2025-07-06", # Use last valid date for free tier + interval="30m", + export_directory=output_dir, + export_format="parquet" # Use Parquet for output + ) + + fetcher = SantimentDataFetcher(api_key=api_key, config=config) + + # Print API key status + fetcher.print_api_key_status() + + # Fetch comprehensive data + logger.info("Fetching comprehensive cryptocurrency data...") + data = fetcher.fetch_comprehensive_data( + slugs=assets, + include_special_metrics=True, + include_sql_queries=True + ) + + # Export data + logger.info("Exporting data to files...") + exported_files = fetcher.export_data(combine_categories=False, include_metadata=True) + + # Create dashboard + logger.info("Creating data dashboard...") + dashboard_path = fetcher.create_data_dashboard() + + # Analyze data quality + logger.info("Analyzing data quality...") + quality_report = fetcher.analyze_data_quality() + + # Save quality report + quality_path = os.path.join(output_dir, "data_quality_report.json") + with open(quality_path, 'w') as f: + 
json.dump(quality_report, f, indent=2, default=str) + + # Print summary + fetcher.print_summary() + + print(f"\nReport generated successfully!") + print(f"Dashboard: {dashboard_path}") + print(f"Data files: {len(exported_files)} files in {output_dir}") + print(f"Quality report: {quality_path}") + + # Print final API key status + print("\n[FINAL_STATUS] Santiment API Key Usage Summary:") + fetcher.print_api_key_status() + +# Example usage +def main(): + # Get API key from environment (already loaded at module top) + santiment_api_key = os.getenv("SANTIMENT_API_KEY") + + # Create fetcher instance + fetcher = SantimentDataFetcher(api_key=santiment_api_key) + + # Print API key status + fetcher.print_api_key_status() + + # DISABLED: Do not cleanup Santiment directory to preserve data + # cleanup_santiment_directory("./data/santiment") + print("[SANTIMENT] Data preservation mode - keeping existing data") + + # Reduced scope for API conservation - only top 2 crypto assets + print("Fetching reduced crypto overview (API conservation mode)...") + # Note: Reduced from 5 to 2 assets to conserve API calls + overview_data = fetch_quick_crypto_overview(['bitcoin', 'ethereum'], api_key=santiment_api_key) + + # Comprehensive analysis - reduced scope + print("\nCreating conservative crypto report...") + # Note: Reduced scope - only Bitcoin and Ethereum to preserve API limits + create_crypto_report( + assets=['bitcoin', 'ethereum'], # Reduced from 5 to 2 assets + output_dir="./data/santiment", + api_key=santiment_api_key + ) + + # Print final API key status + print("\n[FINAL_STATUS] Santiment API Key Usage Summary:") + fetcher.print_api_key_status() + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/fetchers/santiment/test_api_switching.py b/src/fetchers/santiment/test_api_switching.py new file mode 100644 index 0000000000000000000000000000000000000000..0569070fb82f875624aced03d14f3f0d881a9ca2 --- /dev/null +++ b/src/fetchers/santiment/test_api_switching.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +""" +Test script to verify if API key switching is effective +""" + +import os +import sys +from datetime import datetime +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Add the current directory to Python path for imports +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +from main import SantimentDataFetcher + +def test_api_key_switching(): + """ + Test if API key switching actually works by attempting multiple requests + """ + print("🔧 Testing API Key Switching Effectiveness") + print("=" * 60) + + # Get API keys + api_keys = os.getenv('SANTIMENT_API_KEY') + if not api_keys: + print("❌ No SANTIMENT_API_KEY found in environment") + return + + key_list = [key.strip() for key in api_keys.split(',')] + print(f"📊 Testing with {len(key_list)} API keys") + + # Create fetcher + fetcher = SantimentDataFetcher() + + # Track switches + initial_switches = fetcher.rate_limit_switches + + print(f"\n🚀 Starting test at {datetime.now().strftime('%H:%M:%S')}") + print(f"Initial API key switches: {initial_switches}") + + # Attempt to fetch a simple metric multiple times + test_slug = 'bitcoin' + test_metric = 'price_usd' + + success_count = 0 + attempt_count = 5 # Try 5 requests + + for i in range(attempt_count): + print(f"\n--- Attempt {i+1}/{attempt_count} ---") + print(f"Current API key: #{fetcher.current_key_index + 1}") + + try: + result = fetcher.fetch_single_metric(test_metric, test_slug) + + if result is not None and not result.empty: + 
success_count += 1 + print(f"✅ Success! Got {len(result)} data points") + else: + print("⚠️ No data returned") + + except Exception as e: + print(f"❌ Error: {e}") + + print(f"API key switches so far: {fetcher.rate_limit_switches}") + + # Small delay between requests + import time + time.sleep(1) + + # Final report + print(f"\n📈 FINAL RESULTS") + print("=" * 40) + print(f"Successful requests: {success_count}/{attempt_count}") + print(f"Total API key switches: {fetcher.rate_limit_switches}") + print(f"Final API key: #{fetcher.current_key_index + 1}") + + # Interpret results + if fetcher.rate_limit_switches > 0: + print("\n✅ API key switching IS working!") + print("✅ Your keys appear to be from different accounts.") + elif success_count == attempt_count: + print("\n✅ All requests successful without switching!") + print("ℹ️ Either keys are from different accounts OR current key still has quota.") + else: + print("\n❌ No switching occurred and some requests failed.") + print("⚠️ All keys might be from the same exhausted account.") + + return fetcher.rate_limit_switches > 0 + +if __name__ == "__main__": + test_api_key_switching() diff --git a/src/fetchers/stocktwits/ticker_stream.py b/src/fetchers/stocktwits/ticker_stream.py new file mode 100644 index 0000000000000000000000000000000000000000..76bc5ea1b0a5e870548170377759e56366082559 --- /dev/null +++ b/src/fetchers/stocktwits/ticker_stream.py @@ -0,0 +1,56 @@ +# """ +# ticker_stream.py – StockTwits Ticker Streams (Raw Messages) + +# Fetches real-time “cashtag” message streams for any US ticker (e.g., $AAPL). +# No API key required. You can apply your own NLP/sentiment models client-side. + +# Endpoint: +# https://api.stocktwits.com/api/2/streams/symbol/{symbol}.json +# """ + + +# import os +# import requests + + +# class StockTwitsTickerStream: +# BASE_URL = "https://api.stocktwits.com/api/2/streams/symbol/" + +# def fetch_stream(self, symbol: str, access_token: str, **kwargs): +# """ +# Fetch raw message stream for a given ticker symbol (e.g., 'AAPL'). +# Requires OAuth access_token. +# Returns JSON with messages and metadata. +# """ +# url = f"{self.BASE_URL}{symbol}.json" +# params = {"access_token": access_token} +# params.update(kwargs) +# headers = { +# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)" +# } +# resp = requests.get(url, params=params, headers=headers) +# resp.raise_for_status() +# return resp.json() + + +# def main(): +# """ +# Example usage: Fetch and print StockTwits stream for a sample ticker. 
+# """ +# stream_client = StockTwitsTickerStream() +# symbol = "AAPL" # Example ticker +# access_token = os.getenv("STOCKTWITS_ACCESS_TOKEN") +# if not access_token: +# raise RuntimeError("STOCKTWITS_ACCESS_TOKEN environment variable not set.") +# try: +# data = stream_client.fetch_stream(symbol, access_token) +# print(f"Fetched {len(data.get('messages', []))} messages for ${symbol}.") +# # Print first message text as a sample +# if data.get('messages'): +# print("Sample message:", data['messages'][0].get('body', '')) +# except Exception as e: +# print(f"Error fetching stream for ${symbol}: {e}") + + +# if __name__ == "__main__": +# main() diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000000000000000000000000000000000000..f9971c0fb19483766e654718b42c94d01287ef76 --- /dev/null +++ b/src/main.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +""" +Memory-Optimized Main Pipeline for AdvisorAI Data Enhanced +Addresses critical memory issues causing instance failures (512MB limit) +""" + +import sys +import os +import gc +import psutil +from datetime import datetime +from contextlib import contextmanager + +# Add paths +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "."))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "fetchers"))) + +class MemoryMonitor: + """Memory monitoring and optimization utility""" + + def __init__(self, max_memory_mb=450): # Set to 450MB to stay under 512MB limit + self.max_memory_mb = max_memory_mb + self.process = psutil.Process() + + def get_memory_usage(self): + """Get current memory usage in MB""" + return self.process.memory_info().rss / 1024 / 1024 + + def check_and_cleanup(self, operation_name=""): + """Check memory usage and cleanup if needed""" + memory_mb = self.get_memory_usage() + + if memory_mb > self.max_memory_mb * 0.8: # 80% threshold (360MB) + print(f"[MemOpt] High memory usage during {operation_name}: {memory_mb:.1f}MB") + collected = gc.collect() + new_memory_mb = self.get_memory_usage() + print(f"[MemOpt] Memory after GC: {new_memory_mb:.1f}MB (freed {collected} objects)") + + if new_memory_mb > self.max_memory_mb * 0.9: # Still high (405MB) + print(f"[MemOpt] WARNING: Memory still high after cleanup") + + return memory_mb + + @contextmanager + def memory_context(self, operation_name): + """Context manager for memory monitoring""" + start_memory = self.get_memory_usage() + print(f"[MemOpt] Starting {operation_name} - Memory: {start_memory:.1f}MB") + + try: + yield + finally: + end_memory = self.get_memory_usage() + diff = end_memory - start_memory + print(f"[MemOpt] Finished {operation_name} - Memory: {end_memory:.1f}MB (Δ{diff:+.1f}MB)") + + # Force cleanup if memory is getting high + if end_memory > self.max_memory_mb * 0.8: + print(f"[MemOpt] Memory high after {operation_name}, forcing cleanup...") + gc.collect() + final_memory = self.get_memory_usage() + print(f"[MemOpt] Memory after cleanup: {final_memory:.1f}MB") + +def run_fetchers_optimized(memory_monitor): + """Run fetchers with memory optimization""" + try: + with memory_monitor.memory_context("Fetchers"): + # Import fetchers main (only when needed) + from fetchers.main import main as fetchers_main + + print("[Pipeline] Starting data fetchers (memory optimized)...") + result = fetchers_main() + + # Clear imports to free memory + if 'fetchers.main' in sys.modules: + del sys.modules['fetchers.main'] + + # Force cleanup after fetchers 
+ memory_monitor.check_and_cleanup("Fetchers") + + return result + + except Exception as e: + print(f"[Pipeline] Error in fetchers: {e}") + # Still cleanup on error + memory_monitor.check_and_cleanup("Fetchers (error)") + return False + +def run_merge_optimized(memory_monitor): + """Run merge operations with memory optimization""" + try: + with memory_monitor.memory_context("Merge"): + # Import merge main (only when needed) + from merge import main as merge_main + + print("[Pipeline] Starting data merge (memory optimized)...") + result = merge_main.main() + + # Clear imports to free memory + if 'merge.main' in sys.modules: + del sys.modules['merge.main'] + + # Force cleanup after merge + memory_monitor.check_and_cleanup("Merge") + + return result + + except Exception as e: + print(f"[Pipeline] Error in merge: {e}") + # Still cleanup on error + memory_monitor.check_and_cleanup("Merge (error)") + return False + +def main(): + """Memory-optimized main pipeline execution""" + print("AdvisorAI Data Pipeline - Memory Optimized") + print("=" * 50) + print(f"Pipeline started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + # Initialize memory monitor + memory_monitor = MemoryMonitor(max_memory_mb=450) # Stay under 512MB limit + + initial_memory = memory_monitor.get_memory_usage() + print(f"[Pipeline] Initial memory usage: {initial_memory:.1f}MB") + + # Check if we're already too high + if initial_memory > 200: + print(f"[Pipeline] WARNING: High initial memory usage: {initial_memory:.1f}MB") + memory_monitor.check_and_cleanup("Initial") + + try: + # Step 1: Run fetchers with memory optimization + print("\n" + "="*30) + print("STEP 1: DATA FETCHERS") + print("="*30) + + fetchers_success = run_fetchers_optimized(memory_monitor) + + if not fetchers_success: + print("[Pipeline] Fetchers failed, but continuing to merge existing data...") + + # Memory checkpoint + mid_memory = memory_monitor.get_memory_usage() + print(f"\n[Pipeline] Memory after fetchers: {mid_memory:.1f}MB") + + if mid_memory > 400: # Getting close to limit + print("[Pipeline] Memory high after fetchers, forcing cleanup...") + gc.collect() + mid_memory = memory_monitor.get_memory_usage() + print(f"[Pipeline] Memory after cleanup: {mid_memory:.1f}MB") + + # Step 2: Run merge with memory optimization + print("\n" + "="*30) + print("STEP 2: DATA MERGE") + print("="*30) + + merge_success = run_merge_optimized(memory_monitor) + + if not merge_success: + print("[Pipeline] Merge failed") + return False + + # Final memory check + final_memory = memory_monitor.get_memory_usage() + print(f"\n[Pipeline] Final memory usage: {final_memory:.1f}MB") + + if final_memory > 450: # Close to 512MB limit + print("⚠️ WARNING: Memory usage approaching limit - optimization needed") + + print(f"Pipeline ended at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print("[OK] All steps completed successfully!") + return True + + except Exception as e: + import traceback + print(f"[ERROR] Pipeline execution failed: {e}") + print(traceback.format_exc()) + + # Emergency memory cleanup + print("[Pipeline] Emergency memory cleanup...") + memory_monitor.check_and_cleanup("Emergency") + + return False + +if __name__ == "__main__": + success = main() + if not success: + sys.exit(1) diff --git a/src/main_memory_optimized.py b/src/main_memory_optimized.py new file mode 100644 index 0000000000000000000000000000000000000000..f9971c0fb19483766e654718b42c94d01287ef76 --- /dev/null +++ b/src/main_memory_optimized.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +""" 
+Memory-Optimized Main Pipeline for AdvisorAI Data Enhanced +Addresses critical memory issues causing instance failures (512MB limit) +""" + +import sys +import os +import gc +import psutil +from datetime import datetime +from contextlib import contextmanager + +# Add paths +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "."))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "fetchers"))) + +class MemoryMonitor: + """Memory monitoring and optimization utility""" + + def __init__(self, max_memory_mb=450): # Set to 450MB to stay under 512MB limit + self.max_memory_mb = max_memory_mb + self.process = psutil.Process() + + def get_memory_usage(self): + """Get current memory usage in MB""" + return self.process.memory_info().rss / 1024 / 1024 + + def check_and_cleanup(self, operation_name=""): + """Check memory usage and cleanup if needed""" + memory_mb = self.get_memory_usage() + + if memory_mb > self.max_memory_mb * 0.8: # 80% threshold (360MB) + print(f"[MemOpt] High memory usage during {operation_name}: {memory_mb:.1f}MB") + collected = gc.collect() + new_memory_mb = self.get_memory_usage() + print(f"[MemOpt] Memory after GC: {new_memory_mb:.1f}MB (freed {collected} objects)") + + if new_memory_mb > self.max_memory_mb * 0.9: # Still high (405MB) + print(f"[MemOpt] WARNING: Memory still high after cleanup") + + return memory_mb + + @contextmanager + def memory_context(self, operation_name): + """Context manager for memory monitoring""" + start_memory = self.get_memory_usage() + print(f"[MemOpt] Starting {operation_name} - Memory: {start_memory:.1f}MB") + + try: + yield + finally: + end_memory = self.get_memory_usage() + diff = end_memory - start_memory + print(f"[MemOpt] Finished {operation_name} - Memory: {end_memory:.1f}MB (Δ{diff:+.1f}MB)") + + # Force cleanup if memory is getting high + if end_memory > self.max_memory_mb * 0.8: + print(f"[MemOpt] Memory high after {operation_name}, forcing cleanup...") + gc.collect() + final_memory = self.get_memory_usage() + print(f"[MemOpt] Memory after cleanup: {final_memory:.1f}MB") + +def run_fetchers_optimized(memory_monitor): + """Run fetchers with memory optimization""" + try: + with memory_monitor.memory_context("Fetchers"): + # Import fetchers main (only when needed) + from fetchers.main import main as fetchers_main + + print("[Pipeline] Starting data fetchers (memory optimized)...") + result = fetchers_main() + + # Clear imports to free memory + if 'fetchers.main' in sys.modules: + del sys.modules['fetchers.main'] + + # Force cleanup after fetchers + memory_monitor.check_and_cleanup("Fetchers") + + return result + + except Exception as e: + print(f"[Pipeline] Error in fetchers: {e}") + # Still cleanup on error + memory_monitor.check_and_cleanup("Fetchers (error)") + return False + +def run_merge_optimized(memory_monitor): + """Run merge operations with memory optimization""" + try: + with memory_monitor.memory_context("Merge"): + # Import merge main (only when needed) + from merge import main as merge_main + + print("[Pipeline] Starting data merge (memory optimized)...") + result = merge_main.main() + + # Clear imports to free memory + if 'merge.main' in sys.modules: + del sys.modules['merge.main'] + + # Force cleanup after merge + memory_monitor.check_and_cleanup("Merge") + + return result + + except Exception as e: + print(f"[Pipeline] Error in merge: {e}") + # Still cleanup on error + 
memory_monitor.check_and_cleanup("Merge (error)") + return False + +def main(): + """Memory-optimized main pipeline execution""" + print("AdvisorAI Data Pipeline - Memory Optimized") + print("=" * 50) + print(f"Pipeline started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + + # Initialize memory monitor + memory_monitor = MemoryMonitor(max_memory_mb=450) # Stay under 512MB limit + + initial_memory = memory_monitor.get_memory_usage() + print(f"[Pipeline] Initial memory usage: {initial_memory:.1f}MB") + + # Check if we're already too high + if initial_memory > 200: + print(f"[Pipeline] WARNING: High initial memory usage: {initial_memory:.1f}MB") + memory_monitor.check_and_cleanup("Initial") + + try: + # Step 1: Run fetchers with memory optimization + print("\n" + "="*30) + print("STEP 1: DATA FETCHERS") + print("="*30) + + fetchers_success = run_fetchers_optimized(memory_monitor) + + if not fetchers_success: + print("[Pipeline] Fetchers failed, but continuing to merge existing data...") + + # Memory checkpoint + mid_memory = memory_monitor.get_memory_usage() + print(f"\n[Pipeline] Memory after fetchers: {mid_memory:.1f}MB") + + if mid_memory > 400: # Getting close to limit + print("[Pipeline] Memory high after fetchers, forcing cleanup...") + gc.collect() + mid_memory = memory_monitor.get_memory_usage() + print(f"[Pipeline] Memory after cleanup: {mid_memory:.1f}MB") + + # Step 2: Run merge with memory optimization + print("\n" + "="*30) + print("STEP 2: DATA MERGE") + print("="*30) + + merge_success = run_merge_optimized(memory_monitor) + + if not merge_success: + print("[Pipeline] Merge failed") + return False + + # Final memory check + final_memory = memory_monitor.get_memory_usage() + print(f"\n[Pipeline] Final memory usage: {final_memory:.1f}MB") + + if final_memory > 450: # Close to 512MB limit + print("⚠️ WARNING: Memory usage approaching limit - optimization needed") + + print(f"Pipeline ended at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") + print("[OK] All steps completed successfully!") + return True + + except Exception as e: + import traceback + print(f"[ERROR] Pipeline execution failed: {e}") + print(traceback.format_exc()) + + # Emergency memory cleanup + print("[Pipeline] Emergency memory cleanup...") + memory_monitor.check_and_cleanup("Emergency") + + return False + +if __name__ == "__main__": + success = main() + if not success: + sys.exit(1) diff --git a/src/main_original.py b/src/main_original.py new file mode 100644 index 0000000000000000000000000000000000000000..b7ab1497c4fe0e777c402dd28642c6718c75569d --- /dev/null +++ b/src/main_original.py @@ -0,0 +1,21 @@ +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "."))) +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "fetchers"))) + +from fetchers.main import main as fetchers_main +from merge import main as merge_main + +if __name__ == "__main__": + try: + print("Starting fetchers...") + fetchers_main() + print("Starting merge pipeline...") + merge_main.main() + print("[OK] All steps completed.") + except Exception as e: + import traceback + print(f"[ERROR] Pipeline execution failed: {e}") + print(traceback.format_exc()) + sys.exit(1) diff --git a/src/merge/ENHANCED_MERGE_README.md b/src/merge/ENHANCED_MERGE_README.md new file mode 100644 index 0000000000000000000000000000000000000000..51303c632f61c935cc068c45f6807c50facaea6c --- /dev/null +++ 
b/src/merge/ENHANCED_MERGE_README.md @@ -0,0 +1,123 @@ +# Enhanced Merge with Intelligent Null Filling + +## Overview + +The `merge_temp.py` module has been enhanced with sophisticated null filling capabilities that prioritize finding values from the **same symbol + interval_timestamp** combination across different data sources before falling back to other strategies. + +## Key Features + +### 1. Symbol-First Null Filling Strategy + +When merging temp files to existing features, the system now: + +1. **Identifies null values** in the target (merged) dataset +2. **Searches for matching records** in the source (temp) dataset using `(symbol, interval_timestamp)` as the key +3. **Fills null values** only when: + - The same symbol + timestamp exists in the temp data + - The temp data has a non-null value for that column + - The column exists in both datasets + +### 2. Cross-Dataset Null Filling + +During train file creation and merged features generation: + +1. **Combines multiple sources** (archive, features, temp files) +2. **Creates a comprehensive lookup** of all non-null values by `(symbol, timestamp)` +3. **Fills nulls intelligently** using the best available data from any source +4. **Preserves data integrity** by only filling with values from the exact same symbol and time + +### 3. Enhanced Functions + +#### `fill_nulls_from_temp(df_merged, df_temp)` +- Fills null values in `df_merged` using data from `df_temp` +- Only fills when exact `(symbol, interval_timestamp)` match exists +- Returns count of null values filled +- Provides detailed logging of the filling process + +#### `merge_temp_to_merged(temp_name, merged_name)` +- Enhanced to perform null filling before adding new records +- Reports both new records added and null values filled +- Maintains existing functionality while adding intelligent null handling + +#### `merge_all_to_train()` +- Cross-source null filling during train file creation +- Combines archive, features, and temp data optimally +- Eliminates duplicates while preserving the best available data + +#### `create_merged_features()` +- Creates the main `merged_features.parquet` file +- Combines crypto and stock features with cross-dataset null filling +- Provides comprehensive statistics on the merge process + +## Benefits + +### 🎯 **Data Quality Improvements** +- **Preserves Symbol Characteristics**: Uses same-symbol data to fill nulls +- **Temporal Consistency**: Only uses data from the exact same timestamp +- **No Data Pollution**: Never mixes data from different symbols or times + +### 📊 **Better Coverage** +- **Reduced Null Values**: Significantly fewer missing values in final datasets +- **Multi-Source Integration**: Leverages all available data sources +- **Smart Deduplication**: Keeps the best version of each record + +### 🔧 **Robust Processing** +- **Error Handling**: Graceful handling of missing files and edge cases +- **Detailed Logging**: Clear reporting of what was filled and why +- **Validation**: Built-in checks to ensure data integrity + +## Usage Examples + +### Test the Null Filling +```bash +cd src/merge +python merge_temp.py --test-null-filling +``` + +### Run Normal Merge Process +```bash +cd src/merge +python merge_temp.py +``` + +### Manual Testing +```bash +cd src/merge +python test_null_filling_merge.py +``` + +## Integration with Main Pipeline + +The enhanced merge functionality is automatically integrated into the main pipeline: + +1. **After data collection**: Temp files are created with new data +2. 
**During merge_temp.py**: Null filling happens automatically +3. **Before normalization**: Data is as complete as possible +4. **Train file creation**: Uses all available historical data + +## Example Output + +``` +[INFO] Attempting to fill nulls in 4 columns: ['price', 'volume', 'rsi', 'macd'] +[INFO] Successfully filled 7 null values from temp data +[INFO] Column 'price': 0 nulls remaining +[INFO] Column 'volume': 0 nulls remaining +[INFO] Column 'rsi': 0 nulls remaining +[INFO] Column 'macd': 0 nulls remaining +[OK] Added 15 new records from crypto_features.parquet to crypto_features.parquet, filled 7 null values +``` + +## Performance Considerations + +- **Efficient Lookups**: Uses dictionary-based lookups for O(1) access +- **Memory Optimized**: Processes data in chunks when possible +- **Minimal Overhead**: Only processes columns that actually have nulls + +## Future Enhancements + +- **Time-Window Filling**: Fill with nearest timestamp if exact match not found +- **Interpolation**: Smart interpolation for numerical features +- **Symbol Similarity**: Fill using similar symbols when exact match unavailable +- **Quality Scoring**: Rank data sources by quality for better filling decisions + +This enhanced merge system ensures that your machine learning models receive the highest quality, most complete data possible while preserving the integrity and characteristics of each financial instrument. diff --git a/src/merge/alpaca_features.py b/src/merge/alpaca_features.py new file mode 100644 index 0000000000000000000000000000000000000000..858ba941bd7364fd98995df12f768a591b3f33dd --- /dev/null +++ b/src/merge/alpaca_features.py @@ -0,0 +1,142 @@ +""" +Merge Alpaca bars + quotes + trades into a single feature table. + +• data/alpaca/*_bars.parquet ← master timeline (daily) +• data/alpaca/*_quotes.parquet ← L1 quotes (intraday ticks) +• data/alpaca/*_trades.parquet ← raw trades (intraday ticks) + +The script logs shapes / null counts so you can eyeball data quality. 
+""" + +from __future__ import annotations + + +import os +import sys +from glob import glob + +import pandas as pd +import warnings +warnings.filterwarnings("ignore", category=FutureWarning) + +# --------------------------------------------------------------------------- # +# CONFIG +# --------------------------------------------------------------------------- # +# Resolve writable base using central config (fallback to /data) +try: + from src import config as app_config + BASE_DATA_DIR = app_config.DATA_DIR +except Exception: + BASE_DATA_DIR = os.environ.get("DATA_DIR", "/data") + +DATA_DIR = os.path.join(BASE_DATA_DIR, "alpaca") +os.makedirs(DATA_DIR, exist_ok=True) +OUT_FILE = "alpaca_features.parquet" +TOLERANCE = 86_400_000 # 1 day in ms for integer timestamps +MERGE_DIR = "nearest" # ← **important change** + +# --------------------------------------------------------------------------- # +# HELPERS +# --------------------------------------------------------------------------- # +def log(title: str, char: str = "=", width: int = 60) -> None: + print(f"\n{title.center(width, char)}") + +def load_parquets(suffix: str) -> pd.DataFrame: + """Read every *{suffix}.parquet in DATA_DIR and concat.""" + paths = glob(os.path.join(DATA_DIR, f"*{suffix}.parquet")) + if not paths: + return pd.DataFrame() + + def normalize(df: pd.DataFrame) -> pd.DataFrame: + # Normalize symbol: "XRP/USD" -> "XRP" + df["symbol"] = df["symbol"].astype(str).str.replace(r"([A-Z]+)[/_][A-Z]+", r"\1", regex=True) + # Convert timestamp to ms since epoch + df["timestamp"] = pd.to_datetime(df["timestamp"]) + df["timestamp"] = df["timestamp"].astype("int64") // 10**6 + return df + + dfs: list[pd.DataFrame] = [] + for p in paths: + df = pd.read_parquet(p) + df = normalize(df) + dfs.append(df) + + out = pd.concat(dfs, ignore_index=True) + return out + + +# --------------------------------------------------------------------------- # +# MAIN LOGIC +# --------------------------------------------------------------------------- # +def build_features() -> pd.DataFrame: + bars = load_parquets("_bars") + quotes = load_parquets("_quotes") + trades = load_parquets("_trades") + + if bars.empty: + raise RuntimeError(f"No '*_bars.parquet' files found in {DATA_DIR}") + + # Merge symbol-by-symbol so each group is already sorted + features = [] + symbols = sorted(bars["symbol"].unique()) + + for sym in symbols: + bar_df = bars[bars["symbol"] == sym].sort_values("timestamp").reset_index(drop=True) + + # nearest quote merge + if not quotes.empty: + q = quotes[quotes["symbol"] == sym].sort_values("timestamp") + if not q.empty: + bar_df = pd.merge_asof( + bar_df, + q, + on="timestamp", + suffixes=("", "_quote"), + tolerance=TOLERANCE, + direction=MERGE_DIR, # ← nearest! + ) + + # nearest trade merge + if not trades.empty: + t = trades[trades["symbol"] == sym].sort_values("timestamp") + if not t.empty: + bar_df = pd.merge_asof( + bar_df, + t, + on="timestamp", + suffixes=("", "_trade"), + tolerance=TOLERANCE, + direction=MERGE_DIR, # ← nearest! 
+ ) + + features.append(bar_df) + + feat = pd.concat(features, ignore_index=True) + + # --------------------------------------------------------------------- # + # Fill remaining holes within each symbol + # --------------------------------------------------------------------- # + feat = ( + feat + .groupby("symbol", group_keys=False) + .apply(lambda df: df.ffill().bfill()) + .reset_index(drop=True) + ) + + return feat + + +def save(df: pd.DataFrame) -> None: + out_path = os.path.join(DATA_DIR, OUT_FILE) + df.to_parquet(out_path, index=False) + print(f"\n-> wrote merged features to {out_path}") + + +# --------------------------------------------------------------------------- # +def main() -> None: + merged = build_features() + save(merged) + +if __name__ == "__main__": + log("Merging Alpaca Features") + main() diff --git a/src/merge/crypto_data_filler.py b/src/merge/crypto_data_filler.py new file mode 100644 index 0000000000000000000000000000000000000000..b853535c81682620408ef4171ba560fa588ae7db --- /dev/null +++ b/src/merge/crypto_data_filler.py @@ -0,0 +1,865 @@ +import pandas as pd +import numpy as np +from sklearn.impute import KNNImputer +from sklearn.preprocessing import StandardScaler +import warnings +warnings.filterwarnings('ignore') + +class CryptoDataImputerFixed: + """ + Specialized imputation for cryptocurrency data that preserves unique + characteristics of different crypto assets and prevents homogenization. + """ + + def __init__(self, preserve_crypto_diversity=True): + self.preserve_crypto_diversity = preserve_crypto_diversity + self.crypto_profiles = {} + self.scalers = {} + + def _create_crypto_profiles(self, df): + """Create profiles for each cryptocurrency to guide imputation.""" + profiles = {} + + for symbol in df['symbol'].unique(): + symbol_data = df[df['symbol'] == symbol] + + # Calculate crypto-specific statistics + # Defensive mode extraction for 'stable' and 'blockchain_network' + stable_mode = symbol_data['stable'].mode() if 'stable' in symbol_data.columns else pd.Series() + is_stablecoin = stable_mode.iloc[0] if not stable_mode.empty else False + network_mode = symbol_data['blockchain_network'].mode() if 'blockchain_network' in symbol_data.columns else pd.Series() + blockchain_network = network_mode.iloc[0] if not network_mode.empty else None + + profile = { + 'symbol': symbol, + 'price_level': symbol_data['price'].median() if 'price' in symbol_data.columns else None, + 'price_volatility': symbol_data['price'].std() if 'price' in symbol_data.columns else None, + 'volume_level': symbol_data['volume'].median() if 'volume' in symbol_data.columns else None, + 'marketcap_level': symbol_data['marketcap'].median() if 'marketcap' in symbol_data.columns else None, + 'dominance_level': symbol_data['dominance'].median() if 'dominance' in symbol_data.columns else None, + 'rank': symbol_data['rank'].median() if 'rank' in symbol_data.columns else None, + 'is_stablecoin': is_stablecoin, + 'typical_rsi': symbol_data['rsi'].median() if 'rsi' in symbol_data.columns else None, + 'blockchain_network': blockchain_network, + 'has_onchain_data': symbol_data['transaction_count'].notna().any() if 'transaction_count' in symbol_data.columns else False, + 'exchange_coverage': len([col for col in symbol_data.columns if col.startswith('symbols.') and symbol_data[col].notna().any()]), + 'data_availability': len(symbol_data) / len(df) if len(df) > 0 else 0 + } + + profiles[symbol] = profile + + return profiles + + def _impute_with_crypto_context(self, df, column, crypto_profiles): + 
"""Impute values using crypto-specific context to prevent homogenization.""" + + df_result = df.copy() + + for symbol in df['symbol'].unique(): + symbol_mask = df['symbol'] == symbol + symbol_data = df.loc[symbol_mask, column] + + if symbol_data.isnull().sum() == 0: + continue # No missing values for this symbol + + profile = crypto_profiles.get(symbol, {}) + is_stablecoin = profile.get('is_stablecoin', False) + rank = profile.get('rank', 999) + + # Strategy depends on column type and crypto characteristics + if column in ['price', 'open', 'high', 'low', 'close']: + # Price data - special handling for stablecoins + if is_stablecoin: + # Stablecoins should stay around $1 + base_price = 1.0 + symbol_hash = hash(symbol + column) % 1000 / 100000 # Very small variation + adjusted_price = base_price + symbol_hash + else: + # Regular crypto - use interpolation with crypto-specific bounds + interpolated = symbol_data.interpolate(method='linear', limit_direction='both') + + # If still missing, use crypto's typical price level with volatility-based noise + if interpolated.isnull().any() and profile.get('price_level'): + base_price = profile['price_level'] + volatility = profile.get('price_volatility', base_price * 0.05) # Crypto is more volatile + + # Add crypto-specific noise based on rank (higher rank = more volatile) + symbol_hash = hash(symbol) % 1000 / 1000 # 0-1 range + volatility_multiplier = 1 + (rank / 100) # Higher rank = higher volatility + noise_factor = (symbol_hash - 0.5) * 0.2 * volatility_multiplier # More volatile than stocks + adjusted_price = base_price * (1 + noise_factor) + else: + adjusted_price = interpolated + + df_result.loc[symbol_mask, column] = symbol_data.fillna(adjusted_price) + + elif column in ['volume', 'volume_alpaca']: + # Volume data - crypto volume patterns differ significantly + filled = symbol_data.fillna(method='ffill').fillna(method='bfill') + + if filled.isnull().any(): + base_volume = profile.get('volume_level', 1000000) # Default higher for crypto + # Major cryptos have much higher volume + if rank and rank <= 10: + volume_multiplier = 5 + (hash(symbol + column) % 1000 / 200) # 5x-10x + elif rank and rank <= 50: + volume_multiplier = 1 + (hash(symbol + column) % 1000 / 500) # 1x-3x + else: + volume_multiplier = 0.1 + (hash(symbol + column) % 1000 / 1000) # 0.1x-1.1x + + adjusted_volume = base_volume * volume_multiplier + filled = filled.fillna(adjusted_volume) + + df_result.loc[symbol_mask, column] = filled + + elif column in ['marketcap']: + # Market cap - highly dependent on rank + if profile.get('marketcap_level'): + baseline = profile['marketcap_level'] + else: + # Estimate based on rank + if rank and rank <= 10: + baseline = 10_000_000_000 # $10B+ for top 10 + elif rank and rank <= 50: + baseline = 1_000_000_000 # $1B+ for top 50 + elif rank and rank <= 100: + baseline = 100_000_000 # $100M+ for top 100 + else: + baseline = 10_000_000 # $10M+ for others + + # Add symbol-specific variation + symbol_hash = hash(symbol + column) % 1000 / 1000 + baseline *= (0.5 + symbol_hash) # 0.5x to 1.5x variation + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column in ['dominance']: + # Market dominance - only meaningful for major cryptos + if rank and rank <= 5: + # Major cryptos have meaningful dominance + symbol_hash = hash(symbol + column) % 1000 / 1000 + if symbol.upper() == 'BTC': + baseline = 0.4 + (symbol_hash * 0.2) # BTC: 40-60% + elif symbol.upper() == 'ETH': + baseline = 0.15 + (symbol_hash * 0.1) # ETH: 15-25% + else: + 
baseline = 0.01 + (symbol_hash * 0.05) # Others: 1-6% + else: + baseline = 0.001 + (hash(symbol + column) % 1000 / 100000) # Very small + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column in ['rsi', 'stoch_k', 'stoch_d']: + # Oscillator indicators - crypto markets are more extreme + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + symbol_hash = hash(symbol + column) % 1000 / 1000 + if column == 'rsi': + # Crypto RSI tends to be more extreme + if rank and rank <= 10: # Major cryptos more stable + baseline = 20 + (symbol_hash * 60) # 20-80 range + else: # Alt coins more extreme + baseline = 10 + (symbol_hash * 80) # 10-90 range + else: # stochastic + baseline = 10 + (symbol_hash * 80) # 10-90 range + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column in ['macd', 'macd_signal', 'macd_histogram']: + # MACD - crypto MACD values tend to be more volatile + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + price_level = profile.get('price_level', 1) + symbol_hash = hash(symbol + column) % 2000 / 1000 - 1 # -1 to +1 + # Scale MACD relative to price level and volatility + volatility_factor = 2 if rank and rank > 50 else 1 # Alt coins more volatile + baseline = (price_level * 0.01 * volatility_factor) * symbol_hash + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column.startswith('performance.'): + # Performance metrics - crypto performance is more extreme + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + symbol_hash = hash(symbol + column) % 2000 / 1000 - 1 # -1 to +1 + + # Different baselines for different timeframes + if 'year' in column: + baseline = symbol_hash * 5 # ±500% annual performance possible + elif 'month' in column: + baseline = symbol_hash * 2 # ±200% monthly performance possible + elif 'week' in column: + baseline = symbol_hash * 0.5 # ±50% weekly performance possible + elif 'day' in column: + baseline = symbol_hash * 0.2 # ±20% daily performance possible + else: # hour, min + baseline = symbol_hash * 0.05 # ±5% short-term performance + + # Alt coins are more volatile + if rank and rank > 50: + baseline *= 2 + + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column.startswith('tx_') or column.startswith('gas_') or column in [ + 'transaction_volume', 'transaction_count', 'total_fees', 'total_gas_used', + 'avg_gas_price', 'avg_tx_size', 'fees_7d_change', 'gas_used_7d_change', 'gas_price_7d_change' + ] or '_7d_change' in column: + # On-chain features - only meaningful for blockchains with transaction data + network = profile.get('blockchain_network', 'unknown') + + # Special handling for 7d change columns + if '7d_change' in column: + # These are percentage changes, should be reasonable values + symbol_hash = hash(symbol + column) % 2000 / 1000 - 1 # -1 to +1 range + + if 'fees' in column.lower(): + # Fee changes can be more volatile in crypto + baseline = symbol_hash * 0.5 # ±50% change + elif 'gas' in column.lower(): + # Gas usage changes + baseline = symbol_hash * 0.3 # ±30% change + else: + # Other transaction-related changes + baseline = symbol_hash * 0.4 # ±40% change + + # Alt coins more volatile + if rank and rank > 100: + baseline *= 2 + + elif network in ['ethereum', 'bitcoin', 'polygon', 'bsc', 'avalanche']: + # Major networks have meaningful on-chain data + symbol_median = 
symbol_data.median() + + if pd.isna(symbol_median): + # Estimate based on network and rank + symbol_hash = hash(symbol + column) % 1000 / 1000 + + if 'count' in column.lower(): + if network == 'ethereum': + baseline = 1000000 * (1 + symbol_hash) # High transaction count + elif network == 'bitcoin': + baseline = 300000 * (1 + symbol_hash) # Lower transaction count + else: + baseline = 500000 * (1 + symbol_hash) # Medium transaction count + elif 'gas' in column.lower(): + if network == 'ethereum': + baseline = 50 * (1 + symbol_hash) # Higher gas prices + else: + baseline = 5 * (1 + symbol_hash) # Lower gas prices + elif 'fee' in column.lower(): + baseline = 1000000 * (1 + symbol_hash) # Transaction fees in wei/satoshi + else: + # Other on-chain metrics + baseline = symbol_hash * 1000 + else: + baseline = symbol_median + else: + # Networks without meaningful on-chain data OR 7d_change columns + if '7d_change' in column: + # Use the calculated baseline from above + pass # baseline already set + else: + baseline = 0 + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column.startswith('exchangePrices.') or column.startswith('symbols.'): + # Exchange-specific data + exchange = column.split('.')[1] if '.' in column else 'unknown' + + if column.startswith('exchangePrices.'): + # Use main price with small exchange-specific variation + main_price = profile.get('price_level', 100) + if main_price and not is_stablecoin: + # Different exchanges have small price differences + exchange_hash = hash(symbol + exchange) % 200 / 10000 # ±1% variation + baseline = main_price * (1 + exchange_hash) + else: + baseline = main_price or 1 + else: + # Exchange symbols - should be strings, handle separately + continue + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + else: + # Generic numeric imputation with crypto-specific variation + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + overall_median = df[column].median() + if pd.isna(overall_median): + overall_median = 0 + + # Add crypto-specific variation based on rank and volatility + symbol_hash = hash(symbol + column) % 2000 / 1000 - 1 # -1 to +1 + volatility_factor = 2 if rank and rank > 100 else 1 + variation = overall_median * 0.2 * symbol_hash * volatility_factor + baseline = overall_median + variation + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + return df_result[column] + + def _force_fill_stubborn_nulls(self, df): + """Aggressively fill any remaining nulls with appropriate defaults.""" + + # Target ALL the problematic 7d_change columns + stubborn_cols = ['fees_7d_change', 'gas_used_7d_change', 'gas_price_7d_change'] + + for col in stubborn_cols: + if col in df.columns: + null_count_before = df[col].isnull().sum() + if null_count_before > 0: + # Strategy 1: Try group-based fill first + df[col] = df.groupby('symbol')[col].transform(lambda x: x.fillna(x.median())) + + # Strategy 2: Fill remaining with symbol-specific hash-based values + still_null = df[col].isnull() + if still_null.any(): + for symbol in df[still_null]['symbol'].unique(): + symbol_mask = (df['symbol'] == symbol) & df[col].isnull() + if symbol_mask.any(): + # Create deterministic but varied values based on symbol + symbol_hash = hash(symbol + col) % 2000 / 1000 - 1 # -1 to +1 + + if 'fees' in col.lower(): + fill_value = symbol_hash * 0.3 # ±30% fee change + elif 'gas' in col.lower(): + fill_value = symbol_hash * 0.25 # ±25% gas change + else: + fill_value = 
symbol_hash * 0.2 # ±20% generic change + + df.loc[symbol_mask, col] = fill_value + + # Strategy 3: Nuclear option - fill any remaining with 0 + remaining_nulls = df[col].isnull().sum() + if remaining_nulls > 0: + print(f"[WARNING] Nuclear fill: {remaining_nulls} nulls in {col} filled with 0") + df[col] = df[col].fillna(0) + + return df + + def _nuclear_null_elimination(self, df): + """Final pass to eliminate ALL nulls with extreme prejudice.""" + print("[INFO] Performing nuclear null elimination...") + + # Get all numeric columns + numeric_cols = df.select_dtypes(include=[np.number]).columns + + for col in numeric_cols: + null_count = df[col].isnull().sum() + if null_count > 0: + print(f"[NUCLEAR] Eliminating {null_count} nulls in {col}") + + # Try different strategies in order + if '7d_change' in col or 'change' in col.lower(): + # Change columns - use symbol-specific hash + for symbol in df['symbol'].unique(): + symbol_mask = (df['symbol'] == symbol) & df[col].isnull() + if symbol_mask.any(): + symbol_hash = hash(symbol + col) % 2000 / 1000 - 1 # -1 to +1 + if 'fees' in col.lower(): + fill_value = symbol_hash * 0.3 + elif 'gas' in col.lower(): + fill_value = symbol_hash * 0.25 + else: + fill_value = symbol_hash * 0.2 + df.loc[symbol_mask, col] = fill_value + + elif 'timestamp' in col.lower(): + # Timestamp columns + df[col] = df[col].fillna(method='ffill').fillna(method='bfill').fillna(0) + + elif col in ['price', 'open', 'high', 'low', 'close']: + # Price columns - use symbol-specific values + for symbol in df['symbol'].unique(): + symbol_mask = (df['symbol'] == symbol) & df[col].isnull() + if symbol_mask.any(): + symbol_price = df[df['symbol'] == symbol][col].median() + if pd.isna(symbol_price): + symbol_hash = hash(symbol + col) % 10000 / 100 # 0-100 range + symbol_price = 1 + symbol_hash # $1-$101 + df.loc[symbol_mask, col] = symbol_price + + else: + # Generic columns - try median first, then 0 + median_val = df[col].median() + if pd.isna(median_val): + median_val = 0 + df[col] = df[col].fillna(median_val) + + # Final check - if still nulls, force to 0 + remaining_nulls = df[col].isnull().sum() + if remaining_nulls > 0: + print(f"[NUCLEAR] Force filling {remaining_nulls} remaining nulls in {col} with 0") + df[col] = df[col].fillna(0) + + return df + + def _enhanced_sentiment_imputation(self, df): + """Enhanced sentiment imputation that creates realistic, diverse sentiment values.""" + + print(f"[INFO] Starting enhanced sentiment imputation...") + + # Define sentiment columns + core_sentiment_cols = ['sentiment_score', 'neg', 'neu', 'pos'] + + for col in core_sentiment_cols: + if col in df.columns: + null_count_before = df[col].isnull().sum() + if null_count_before > 0: + print(f"[INFO] Processing {col}: {null_count_before} nulls to fill") + + # Process each symbol separately for core sentiment columns + for col in core_sentiment_cols: + if col in df.columns and df[col].isnull().any(): + print(f"Enhanced imputation for {col}...") + + for symbol in df['symbol'].unique(): + symbol_mask = df['symbol'] == symbol + symbol_sentiment = df.loc[symbol_mask, col] + + if symbol_sentiment.isnull().any(): + # Try forward/backward fill first + filled = symbol_sentiment.fillna(method='ffill').fillna(method='bfill') + + # For remaining nulls, use symbol-specific realistic values + if filled.isnull().any(): + symbol_hash = hash(symbol + col) % 10000 / 10000 + symbol_upper = symbol.upper() + + # Define crypto categories + stablecoins = ['USDT', 'USDC', 'BUSD', 'DAI', 'TUSD', 'USDP'] + major_cryptos = 
['BTC', 'ETH', 'BNB', 'ADA', 'XRP', 'SOL', 'DOT', 'AVAX'] + + if col == 'sentiment_score': + # Sentiment score (-1 to +1) + if any(stable in symbol_upper for stable in stablecoins): + fill_value = (symbol_hash - 0.5) * 0.1 # Stable: ±0.05 + elif any(major in symbol_upper for major in major_cryptos): + fill_value = 0.1 + (symbol_hash - 0.5) * 0.4 # Major: 0.1 ± 0.2 + else: + fill_value = (symbol_hash - 0.5) * 0.6 # Alt: ±0.3 + fill_value = np.clip(fill_value, -1.0, 1.0) + + elif col == 'neu': + # Neutral sentiment (dominant) + if any(stable in symbol_upper for stable in stablecoins): + fill_value = 0.85 + symbol_hash * 0.1 # 0.85-0.95 + elif any(major in symbol_upper for major in major_cryptos): + fill_value = 0.65 + symbol_hash * 0.2 # 0.65-0.85 + else: + fill_value = 0.55 + symbol_hash * 0.3 # 0.55-0.85 + fill_value = np.clip(fill_value, 0.0, 1.0) + + elif col == 'pos': + # Positive sentiment + if any(stable in symbol_upper for stable in stablecoins): + fill_value = 0.05 + symbol_hash * 0.05 # 0.05-0.10 + elif any(major in symbol_upper for major in major_cryptos): + fill_value = 0.15 + symbol_hash * 0.15 # 0.15-0.30 + else: + fill_value = 0.10 + symbol_hash * 0.25 # 0.10-0.35 + fill_value = np.clip(fill_value, 0.0, 1.0) + + elif col == 'neg': + # Negative sentiment + if any(stable in symbol_upper for stable in stablecoins): + fill_value = 0.05 + symbol_hash * 0.05 # 0.05-0.10 + elif any(major in symbol_upper for major in major_cryptos): + fill_value = 0.10 + symbol_hash * 0.10 # 0.10-0.20 + else: + fill_value = 0.15 + symbol_hash * 0.15 # 0.15-0.30 + fill_value = np.clip(fill_value, 0.0, 1.0) + + filled = filled.fillna(fill_value) + + df.loc[symbol_mask, col] = filled + + # Normalize sentiment scores so neg + neu + pos = 1.0 + if all(col in df.columns for col in ['neg', 'neu', 'pos']): + print("Normalizing sentiment scores...") + for idx in df.index: + neg_val = df.at[idx, 'neg'] + neu_val = df.at[idx, 'neu'] + pos_val = df.at[idx, 'pos'] + + current_sum = neg_val + neu_val + pos_val + if current_sum > 0: + df.at[idx, 'neg'] = neg_val / current_sum + df.at[idx, 'neu'] = neu_val / current_sum + df.at[idx, 'pos'] = pos_val / current_sum + else: + # Default neutral sentiment + df.at[idx, 'neg'] = 0.1 + df.at[idx, 'neu'] = 0.8 + df.at[idx, 'pos'] = 0.1 + + # Handle other sentiment features + other_sentiment_features = [ + 'social_sentiment_mean', 'social_sentiment_std', 'social_sentiment_count', + 'social_confidence_mean', 'combined_sentiment', 'sentiment_agreement', + 'sentiment_change_1', 'sentiment_sma_7', 'sentiment_momentum' + ] + + for col in other_sentiment_features: + if col in df.columns and df[col].isnull().any(): + if 'sentiment' in col.lower() and 'count' not in col.lower(): + # Sentiment scores - neutral with crypto-specific variation + for symbol in df['symbol'].unique(): + mask = df['symbol'] == symbol + symbol_hash = (hash(symbol + col) % 200 / 1000) - 0.1 # -0.1 to +0.1 + df.loc[mask, col] = df.loc[mask, col].fillna(symbol_hash) + elif 'count' in col.lower(): + df[col] = df[col].fillna(0) + else: + median_val = df[col].median() + if pd.isna(median_val): + median_val = 0 + df[col] = df[col].fillna(median_val) + + # Final validation + print(f"[INFO] Enhanced sentiment imputation completed:") + for col in core_sentiment_cols: + if col in df.columns: + null_count_after = df[col].isnull().sum() + print(f" {col}: {null_count_after} nulls remaining") + + return df + + def fit_transform(self, df): + """Apply crypto-specific imputation with anti-homogenization measures.""" + + 
df_imputed = df.copy() + df_imputed = df_imputed.sort_values(['symbol', 'interval_timestamp']) + + # Create crypto profiles + self.crypto_profiles = self._create_crypto_profiles(df_imputed) + + print(f"Created profiles for {len(self.crypto_profiles)} unique cryptocurrencies") + + # 1. Handle categorical/flag columns + categorical_cols = [ + 'symbol', 'cg_id', 'blockchain_network', 'stable', 'is_crypto', 'is_stock', + 'is_other', 'alpaca_data_available', 'is_trading_hours', 'is_weekend' + ] + + for col in categorical_cols: + if col in df_imputed.columns: + if col in ['is_crypto']: + df_imputed[col] = df_imputed[col].fillna(1) # Default to crypto + elif col in ['is_stock', 'is_other']: + df_imputed[col] = df_imputed[col].fillna(0) # Not stock/other + elif col in ['stable']: + # Determine if stablecoin based on symbol + stablecoin_symbols = ['USDT', 'USDC', 'BUSD', 'DAI', 'TUSD', 'USDP'] + for symbol in stablecoin_symbols: + mask = df_imputed['symbol'].str.contains(symbol, case=False, na=False) + df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(True) + df_imputed[col] = df_imputed[col].fillna(False) + else: + df_imputed[col] = df_imputed.groupby('symbol')[col].fillna(method='ffill').fillna(method='bfill') + + # 2. Exchange symbols (string data) + exchange_symbol_cols = [col for col in df_imputed.columns if col.startswith('symbols.')] + for col in exchange_symbol_cols: + if df_imputed[col].dtype == 'object': + # Forward/backward fill within symbol groups + df_imputed[col] = df_imputed.groupby('symbol')[col].fillna(method='ffill').fillna(method='bfill') + + # 3. Core crypto market data + core_market_cols = [ + 'price', 'marketcap', 'volume', 'dominance', 'rank', + 'open', 'high', 'low', 'close' + ] + + for col in core_market_cols: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + print(f"Imputing {col} with crypto-specific context...") + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 4. Exchange prices + exchange_price_cols = [col for col in df_imputed.columns if col.startswith('exchangePrices.')] + for col in exchange_price_cols: + if df_imputed[col].isnull().any(): + print(f"Imputing {col} with crypto-specific context...") + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 5. Performance metrics + performance_cols = [col for col in df_imputed.columns if col.startswith('performance.') or col.startswith('rankDiffs.')] + for col in performance_cols: + if df_imputed[col].isnull().any(): + print(f"Imputing {col} with crypto-specific context...") + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 6. Technical indicators + tech_indicators = [ + 'rsi', 'macd', 'macd_signal', 'macd_histogram', 'atr', 'bb_position', + 'stoch_k', 'stoch_d', 'cci', 'roc_5', 'roc_10', 'mfi', 'rsi_macd_signal', + 'ema_convergence', 'true_range_pct' + ] + + for col in tech_indicators: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + print(f"Imputing {col} with crypto-specific context...") + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 7. 
Price/volume change features + change_features = [ + 'price_change_1', 'price_change_7', 'price_change_14', 'volume_ratio', + 'volatility_7', 'price_volume_trend', 'volatility_consistency' + ] + + for col in change_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 8. On-chain features (crypto-specific) - PRIORITY HANDLING for problematic columns + onchain_features = [ + 'transaction_volume', 'total_fees', 'total_gas_used', 'avg_gas_price', + 'transaction_count', 'tx_count_7d_change', 'tx_count_sma_7', + 'tx_volume_7d_change', 'tx_volume_sma_7', 'gas_used_7d_change', + 'gas_used_sma_7', 'gas_price_7d_change', 'gas_price_sma_7', + 'fees_7d_change', 'avg_tx_size', 'tx_price_correlation' + ] + + for col in onchain_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + print(f"Imputing {col} with crypto on-chain context...") + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 9. AGGRESSIVE NULL ELIMINATION for stubborn columns + df_imputed = self._force_fill_stubborn_nulls(df_imputed) + + # 10. Sentiment features + sentiment_features = [ + 'social_sentiment_mean', 'social_sentiment_std', 'social_sentiment_count', + 'social_confidence_mean', 'combined_sentiment', 'sentiment_agreement', + 'sentiment_change_1', 'sentiment_sma_7', 'sentiment_momentum', + 'sentiment_score', 'neg', 'neu', 'pos' + ] + + for col in sentiment_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + if 'sentiment' in col.lower() and 'count' not in col.lower(): + # Sentiment scores - neutral with crypto-specific variation + for symbol in df_imputed['symbol'].unique(): + mask = df_imputed['symbol'] == symbol + symbol_hash = (hash(symbol + col) % 200 / 1000) - 0.1 # -0.1 to +0.1 + df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(symbol_hash) + elif 'count' in col.lower(): + df_imputed[col] = df_imputed[col].fillna(0) + else: + median_val = df_imputed[col].median() + df_imputed[col] = df_imputed[col].fillna(median_val) + + # 11. Quality metrics + quality_features = [ + 'data_quality_score', 'core_features_completeness', 'technical_indicators_completeness', + 'onchain_features_completeness', 'price_data_completeness', + 'overall_feature_completeness', 'data_completeness_score' + ] + + for col in quality_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + median_val = np.clip(df_imputed[col].median(), 0, 1) + # Add tiny crypto-specific variation + for symbol in df_imputed['symbol'].unique(): + mask = df_imputed['symbol'] == symbol + symbol_hash = hash(symbol + col) % 100 / 10000 # Very small variation + fill_val = np.clip(median_val + symbol_hash, 0, 1) + df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(fill_val) + + # 12. Temporal features + temporal_features = ['hour', 'day_of_week', 'is_weekend', 'is_trading_hours'] + for col in temporal_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + if col == 'hour': + df_imputed[col] = df_imputed[col].fillna(12) # Default to noon + elif col == 'day_of_week': + df_imputed[col] = df_imputed[col].fillna(3) # Default to Wednesday + elif col == 'is_weekend': + df_imputed[col] = df_imputed[col].fillna(0) # Default to weekday + elif col == 'is_trading_hours': + df_imputed[col] = df_imputed[col].fillna(1) # Crypto trades 24/7 + + # 13. 
Handle any remaining numeric columns + remaining_numeric = df_imputed.select_dtypes(include=[np.number]).columns + remaining_with_nulls = [col for col in remaining_numeric if df_imputed[col].isnull().any()] + + for col in remaining_with_nulls: + if col not in ['id', 'id_alpaca', 'backup_id'] and not col.endswith('_timestamp'): + print(f"Imputing remaining column {col}...") + df_imputed[col] = self._impute_with_crypto_context( + df_imputed, col, self.crypto_profiles + ) + + # 14. NUCLEAR NULL ELIMINATION - Final pass + df_imputed = self._nuclear_null_elimination(df_imputed) + + print("[INFO] Crypto imputation complete with anti-homogenization measures") + return df_imputed + +# Usage function with validation - FIXED VERSION +def impute_crypto_with_validation_fixed(file_path, output_path=None): + """Impute crypto data and validate no homogenization occurred.""" + try: + df = pd.read_parquet(file_path) + except Exception as e: + print(f"[ERROR] Failed to load file: {e}") + return None + + # Sample symbols for validation + symbols_sample = df['symbol'].unique()[:5] + + imputer = CryptoDataImputerFixed() + df_imputed = imputer.fit_transform(df) + + # TRIPLE CHECK: Ensure problematic columns have no nulls + problematic_cols = ['gas_used_7d_change', 'fees_7d_change', 'gas_price_7d_change'] + for col in problematic_cols: + if col in df_imputed.columns: + null_count = df_imputed[col].isnull().sum() + if null_count > 0: + print(f"[EMERGENCY] Still {null_count} nulls in {col} - applying emergency fix") + # Emergency symbol-specific fill + for symbol in df_imputed['symbol'].unique(): + symbol_mask = (df_imputed['symbol'] == symbol) & df_imputed[col].isnull() + if symbol_mask.any(): + symbol_hash = hash(symbol + col) % 2000 / 1000 - 1 # -1 to +1 + if 'fees' in col.lower(): + fill_value = symbol_hash * 0.3 + elif 'gas' in col.lower(): + fill_value = symbol_hash * 0.25 + else: + fill_value = symbol_hash * 0.2 + df_imputed.loc[symbol_mask, col] = fill_value + + # Final nuclear option + df_imputed[col] = df_imputed[col].fillna(0) + print(f"[EMERGENCY] {col} nulls after emergency fix: {df_imputed[col].isnull().sum()}") + + # Combine alpaca data with main data if available + price_cols = ['high', 'low', 'close', 'volume', 'open'] + for col in price_cols: + alpaca_col = f"{col}_alpaca" + if col in df_imputed.columns and alpaca_col in df_imputed.columns: + df_imputed[col] = df_imputed[col].combine_first(df_imputed[alpaca_col]) + + # Drop unwanted columns before saving + drop_cols = [ + '_filename', '_original_format', 'alpaca_data_available', + 'ask_exchange', 'ask_exchange_alpaca', 'bid_exchange', 'bid_exchange_alpaca', + 'conditions', 'conditions_alpaca', 'conditions_trade', 'conditions_trade_alpaca', + 'symbol_quote', 'symbol_quote_alpaca', 'symbol_trade', 'symbol_trade_alpaca', + 'tape', 'tape_alpaca', 'tape_trade', 'tape_trade_alpaca', + 'id', 'id_alpaca', 'is_new_symbol', 'timestamp_dt', + 'estimateCurrency', 'exchange', 'exchange_alpaca', 'exchange_company', + 'finnhubIndustry', 'logo', 'ticker', 'weburl', 'latest_news_timestamp', 'volume_price_momentum', + 'country', 'currency', 'ipo', 'name', 'period', 'phone', 'year', 'month', 'symbols.kraken', + 'datetime', 'headline', 'blockchain_network', 'symbols.cryptocom', 'symbols.bitmart', 'symbols.kucoin', 'symbols.okx', + 'symbols.coinbase','symbols.binance','symbols.mexc','symbols.bybit','symbols.bingx', 'symbols.huobi', 'symbols.bitget', 'symbols.gateio', + 'interval_timestamp_dt', 'interval_timestamp_alpaca', 'interval_timestamp_trade', 
'feature_timestamp', 'alpaca_merge_timestamp', 'sentiment_timestamp', + 'hour', 'day_of_week', 'is_weekend', 'is_trading_hours', 'is_crypto', 'is_stock', 'is_other', 'gas_used_7d_change', 'fees_7d_change', 'gas_price_7d_change' + ] + + # Remove alpaca columns after combining + alpaca_cols = [col for col in df_imputed.columns if col.endswith('_alpaca')] + drop_cols.extend(alpaca_cols) + + for col in drop_cols: + if col in df_imputed.columns: + df_imputed = df_imputed.drop(columns=col) + + # Reorder columns: 'symbol' first, 'interval_timestamp' second, rest follow + cols = list(df_imputed.columns) + if 'symbol' in cols and 'interval_timestamp' in cols: + rest = [c for c in cols if c not in ['symbol', 'interval_timestamp']] + df_imputed = df_imputed[['symbol', 'interval_timestamp'] + rest] + + # FINAL FINAL CHECK for problematic columns (after all drops/reorders) + for col in problematic_cols: + if col in df_imputed.columns: + null_count = df_imputed[col].isnull().sum() + if null_count > 0: + print(f"[FINAL CHECK] Still {null_count} nulls in {col} - final nuclear fill") + df_imputed[col] = df_imputed[col].fillna(0) + + # Validation: Check that different symbols have different values + print("\n[VALIDATION] Checking for homogenization...") + for symbol in symbols_sample: + symbol_data = df_imputed[df_imputed['symbol'] == symbol] + if len(symbol_data) > 0: + price_mean = symbol_data['price'].mean() if 'price' in symbol_data.columns else 0 + volume_mean = symbol_data['volume'].mean() if 'volume' in symbol_data.columns else 0 + print(f" {symbol}: Price={price_mean:.2f}, Volume={volume_mean:.0f}") + + # Save results + if output_path: + # Clean up data types + if 'backup_id' in df_imputed.columns: + df_imputed['backup_id'] = df_imputed['backup_id'].astype(str) + + try: + df_imputed.to_parquet(output_path, compression='snappy') + print(f"[INFO] Crypto data imputed and saved to: {output_path}") + except Exception as e: + print(f"[ERROR] Failed to save file: {e}") + + # Debug: print null count, dtype, and sample after saving + # for col in problematic_cols: + # if col in df_imputed.columns: + # print(f"[DEBUG] Nulls in {col} after save: {df_imputed[col].isnull().sum()}") + # print(f"[DEBUG] Dtype for {col}: {df_imputed[col].dtype}") + # print(f"[DEBUG] Sample values for {col}: {df_imputed[col].head(10).tolist()}") + + return df_imputed + +# Example usage - FIXED VERSION +def main(): + input_file = "data/merged/features/crypto_features.parquet" + output_file = input_file + + df_clean = impute_crypto_with_validation_fixed(input_file, output_file) + if df_clean is not None: + print(f"\n[SUCCESS] Crypto data processing completed!") + print(f"Final shape: {df_clean.shape}") + print(f"Null values remaining: {df_clean.isnull().sum().sum()}") + + # Final verification of problematic columns + problematic_cols = ['gas_used_7d_change', 'fees_7d_change', 'gas_price_7d_change'] + for col in problematic_cols: + if col in df_clean.columns: + nulls = df_clean[col].isnull().sum() + print(f"[FINAL VERIFICATION] {col}: {nulls} nulls") + else: + print("[ERROR] Failed to load or impute crypto data.") + +if __name__ == "__main__": + main() diff --git a/src/merge/extract_symbols.py b/src/merge/extract_symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..bbcbf5189e8c85e07f56e26fd2549c5c770eeb51 --- /dev/null +++ b/src/merge/extract_symbols.py @@ -0,0 +1,330 @@ +#!/usr/bin/env python3 +""" +Extract symbols from symbols.* columns and populate the symbol field for crypto data. 
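+It also normalizes CoinGecko-style ids that ended up in the symbol column
+(e.g. 'bitcoin' -> 'BTC') using a fixed id-to-ticker mapping.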
+ +This script runs after merge steps but before data_filler phases to ensure +the symbol column is properly populated from existing exchange symbol data. + +Example: symbols.gateio:"BTC_USDT" -> symbol:"BTC" +""" + +import pandas as pd +import sys +from pathlib import Path +import re + +def extract_symbol_from_exchange_symbols(df): + """Extract base symbol from exchange symbol columns""" + + if 'symbol' not in df.columns: + df['symbol'] = None + + # Find all symbols.* columns + symbol_columns = [col for col in df.columns if col.startswith('symbols.')] + + if not symbol_columns: + return df + + # Extract symbols from exchange symbol data + symbols_extracted = 0 + symbols_normalized = 0 + + # First pass: extract symbols from exchange data for null symbols + for idx, row in df.iterrows(): + # Skip if symbol is already populated + if pd.notna(row.get('symbol')): + continue + + # Try to extract symbol from any exchange symbol column + extracted_symbol = None + + for col in symbol_columns: + exchange_symbol = row.get(col) + if pd.notna(exchange_symbol) and isinstance(exchange_symbol, str): + # Extract base symbol from various exchange formats + symbol = extract_base_symbol(exchange_symbol) + if symbol: + extracted_symbol = symbol + break + + if extracted_symbol: + df.at[idx, 'symbol'] = extracted_symbol + symbols_extracted += 1 + + # Second pass: normalize cg_id values to proper ticker symbols + cg_id_to_symbol_mapping = { + 'bitcoin': 'BTC', + 'ethereum': 'ETH', + 'solana': 'SOL', + 'cardano': 'ADA', + 'ripple': 'XRP', + 'binancecoin': 'BNB', + 'dogecoin': 'DOGE', + 'polkadot': 'DOT', + 'chainlink': 'LINK', + 'litecoin': 'LTC', + 'uniswap': 'UNI', + 'avalanche-2': 'AVAX', + 'polygon': 'MATIC', + 'stellar': 'XLM', + 'bitcoin-cash': 'BCH', + 'filecoin': 'FIL', + 'tron': 'TRX', + 'ethereum-classic': 'ETC', + 'monero': 'XMR', + 'cosmos': 'ATOM', + 'algorand': 'ALGO', + 'vechain': 'VET', + 'hedera-hashgraph': 'HBAR', + 'internet-computer': 'ICP', + 'theta-token': 'THETA', + 'eos': 'EOS', + 'aave': 'AAVE', + 'maker': 'MKR', + 'curve-dao-token': 'CRV', + 'pancakeswap-token': 'CAKE', + 'the-sandbox': 'SAND', + 'decentraland': 'MANA', + 'axie-infinity': 'AXS', + 'shiba-inu': 'SHIB', + 'terra-luna': 'LUNA', + 'near': 'NEAR', + 'flow': 'FLOW', + 'fantom': 'FTM', + 'harmony': 'ONE', + 'basic-attention-token': 'BAT', + 'enjincoin': 'ENJ', + 'sushi': 'SUSHI', + 'compound': 'COMP', + 'yearn-finance': 'YFI', + 'synthetix': 'SNX', + 'uma': 'UMA', + '0x': 'ZRX', + 'loopring': 'LRC', + 'balancer': 'BAL' + } + + for idx, row in df.iterrows(): + current_symbol = row.get('symbol') + + # If symbol matches a cg_id pattern, normalize it to ticker symbol + if pd.notna(current_symbol) and isinstance(current_symbol, str): + normalized_symbol = cg_id_to_symbol_mapping.get(current_symbol.lower()) + if normalized_symbol and normalized_symbol != current_symbol: + df.at[idx, 'symbol'] = normalized_symbol + symbols_normalized += 1 + + # Final stats for debugging if needed + # print(f"Extracted symbols for {symbols_extracted} rows") + # print(f"Normalized symbols for {symbols_normalized} rows") + + # Show results + null_symbols_remaining = df['symbol'].isnull().sum() + # print(f"Remaining null symbols: {null_symbols_remaining}") + + if null_symbols_remaining > 0: + # print("Rows with remaining null symbols:") + sample_nulls = df[df['symbol'].isnull()][['symbol', 'cg_id'] + symbol_columns[:3]].head(5) + # print(sample_nulls) + + return df + +def extract_base_symbol(exchange_symbol): + """Extract base symbol from exchange symbol 
formats""" + + if not isinstance(exchange_symbol, str): + return None + + exchange_symbol = exchange_symbol.strip().upper() + + # Common patterns for crypto exchange symbols + patterns = [ + r'^([A-Z]{2,10})USDT?$', # BTCUSDT -> BTC + r'^([A-Z]{2,10})_USDT?$', # BTC_USDT -> BTC + r'^([A-Z]{2,10})/USDT?$', # BTC/USDT -> BTC + r'^([A-Z]{2,10})-USDT?$', # BTC-USDT -> BTC + r'^([A-Z]{2,10})USD$', # BTCUSD -> BTC + r'^([A-Z]{2,10})_USD$', # BTC_USD -> BTC + r'^([A-Z]{2,10})/USD$', # BTC/USD -> BTC + r'^([A-Z]{2,10})-USD$', # BTC-USD -> BTC + r'^([A-Z]{2,10})BUSD$', # BTCBUSD -> BTC + r'^([A-Z]{2,10})_BUSD$', # BTC_BUSD -> BTC + r'^([A-Z]{2,10})EUR$', # BTCEUR -> BTC + r'^([A-Z]{2,10})_EUR$', # BTC_EUR -> BTC + r'^([A-Z]{2,10})BTC$', # ETHBTC -> ETH + r'^([A-Z]{2,10})_BTC$', # ETH_BTC -> ETH + ] + + for pattern in patterns: + match = re.match(pattern, exchange_symbol) + if match: + base_symbol = match.group(1) + # Filter out obvious non-crypto symbols and ensure reasonable length + if len(base_symbol) >= 2 and len(base_symbol) <= 10: + # Skip if it looks like a quote currency + if base_symbol not in ['USDT', 'USDC', 'USD', 'EUR', 'BTC', 'ETH', 'BNB', 'BUSD']: + return base_symbol + elif base_symbol in ['BTC', 'ETH', 'BNB']: # These are valid base symbols + return base_symbol + + # If no pattern matches, try simple heuristics + # Remove common suffixes + for suffix in ['USDT', 'USDC', 'USD', 'EUR', 'BUSD']: + if exchange_symbol.endswith(suffix): + base = exchange_symbol[:-len(suffix)] + if len(base) >= 2 and len(base) <= 10: + return base + + # Split on common delimiters and take first part + for delimiter in ['_', '/', '-']: + if delimiter in exchange_symbol: + parts = exchange_symbol.split(delimiter) + if len(parts) >= 2: + base = parts[0] + if len(base) >= 2 and len(base) <= 10: + return base + + return None + +def process_crypto_features(): + """Process crypto features to extract symbols""" + + # Try different possible paths + possible_paths = [ + Path('data/merged/features/crypto_features.parquet'), + Path('../../data/merged/features/crypto_features.parquet'), + Path('../../../data/merged/features/crypto_features.parquet') + ] + + crypto_file = None + for path in possible_paths: + if path.exists(): + crypto_file = path + break + + if crypto_file is None: + print(f"Crypto features file not found in any of these locations:") + for path in possible_paths: + print(f" {path.absolute()}") + return False + + print(f"Loading crypto features from: {crypto_file}") + df = pd.read_parquet(crypto_file) + + print(f"Loaded {len(df)} rows with {len(df.columns)} columns") + + # Check current state + null_symbols_before = df['symbol'].isnull().sum() if 'symbol' in df.columns else len(df) + print(f"Null symbols before: {null_symbols_before} ({null_symbols_before/len(df)*100:.1f}%)") + + # Extract symbols + df_fixed = extract_symbol_from_exchange_symbols(df) + + # Check results - note that extract_symbol_from_exchange_symbols tracks its own changes + null_symbols_after = df_fixed['symbol'].isnull().sum() if 'symbol' in df_fixed.columns else len(df_fixed) + + # Calculate total improvement + total_improvement = null_symbols_before - null_symbols_after + + print("Successfully extracted crypto symbols!") + + # Save if there's been any improvement or if nulls are very low + if total_improvement > 0 or null_symbols_after <= 2: + # Save the fixed file + df_fixed.to_parquet(crypto_file) + return True + else: + return True # Success even if no changes needed + +def process_stocks_features(): + """Process stocks features 
to extract symbols (if needed)""" + + # Try different possible paths + possible_paths = [ + Path('data/merged/features/stocks_features.parquet'), + Path('../../data/merged/features/stocks_features.parquet'), + Path('../../../data/merged/features/stocks_features.parquet') + ] + + stocks_file = None + for path in possible_paths: + if path.exists(): + stocks_file = path + break + + if stocks_file is None: + return False + + df = pd.read_parquet(stocks_file) + + # Check if stocks need symbol extraction too + null_symbols_before = df['symbol'].isnull().sum() if 'symbol' in df.columns else len(df) + print(f"Null symbols before: {null_symbols_before} ({null_symbols_before/len(df)*100:.1f}%)") + + if null_symbols_before == 0: + print("Stocks symbols are already populated, skipping") + return True + + # For stocks, we might have different symbol patterns + # Extract symbols if needed + df_fixed = extract_symbol_from_exchange_symbols(df) + + # Check results + null_symbols_after = df_fixed['symbol'].isnull().sum() if 'symbol' in df_fixed.columns else len(df_fixed) + symbols_fixed = null_symbols_before - null_symbols_after + + print(f"\nResults:") + print(f"- Symbols fixed: {symbols_fixed}") + print(f"- Null symbols after: {null_symbols_after} ({null_symbols_after/len(df_fixed)*100:.1f}%)") + + if symbols_fixed > 0: + # Save the fixed file + print(f"\nSaving fixed stocks features to: {stocks_file}") + df_fixed.to_parquet(stocks_file) + print("File saved successfully!") + + return True + else: + print("No symbols were extracted/fixed for stocks") + return True # Not an error for stocks + +def main(): + """Main function to extract symbols from exchange symbol data""" + + print("=== EXTRACTING SYMBOLS FROM EXCHANGE DATA ===") + print("This script extracts base symbols from symbols.* columns") + print("Example: symbols.gateio:'BTC_USDT' -> symbol:'BTC'") + print() + + # Process crypto features + print("Processing crypto features...") + crypto_success = process_crypto_features() + + print("\n" + "="*50 + "\n") + + # Process stocks features + print("Processing stocks features...") + stocks_success = process_stocks_features() + + print("\n" + "="*50) + + if crypto_success: + print("Successfully extracted crypto symbols!") + else: + print("Failed to extract crypto symbols!") + + if stocks_success: + print("Stocks symbols processing completed!") + else: + print("Failed to process stocks symbols!") + + if crypto_success and stocks_success: + print("\nSymbol extraction completed successfully!") + return True + else: + print("\nSome issues occurred during symbol extraction") + return False + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1) diff --git a/src/merge/final_null_handler.py b/src/merge/final_null_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..185ae1ef0aa83d3929128a8551f6f9ff7d885a39 --- /dev/null +++ b/src/merge/final_null_handler.py @@ -0,0 +1,899 @@ +import pandas as pd +import numpy as np +from pathlib import Path +import json +import warnings +warnings.filterwarnings('ignore') + +class FinalNullValueHandler: + """ + Advanced final null value handler with symbol-first temporal interpolation. + + Strategy Priority: + 1. Same symbol, nearby timestamps (interpolation/extrapolation) + 2. Same symbol, historical mean/median + 3. Similar symbols (same asset class) + 4. 
Global defaults with symbol-specific variation + """ + + def __init__(self): + self.crypto_column_defaults = self._define_crypto_defaults() + self.stock_column_defaults = self._define_stock_defaults() + self.symbol_profiles = {} + self.symbol_stats = {} # Historical statistics per symbol + + def _analyze_symbol_statistics(self, df): + """Analyze historical statistics for each symbol to guide intelligent filling""" + stats = {} + + # Sort by timestamp for proper temporal analysis + if 'interval_timestamp' in df.columns: + df_sorted = df.sort_values(['symbol', 'interval_timestamp']) + else: + df_sorted = df.sort_values('symbol') + + for symbol in df['symbol'].unique(): + symbol_data = df_sorted[df_sorted['symbol'] == symbol].copy() + + symbol_stats = { + 'symbol': symbol, + 'total_records': len(symbol_data), + 'date_range': None, + 'typical_values': {}, + 'volatility': {}, + 'trends': {}, + 'seasonal_patterns': {} + } + + # Calculate date range if timestamp available + if 'interval_timestamp' in symbol_data.columns: + timestamps = pd.to_datetime(symbol_data['interval_timestamp'], unit='ms') + symbol_stats['date_range'] = { + 'start': timestamps.min(), + 'end': timestamps.max(), + 'duration_days': (timestamps.max() - timestamps.min()).days + } + + # Calculate typical values, volatility, and trends for numerical columns + numerical_cols = symbol_data.select_dtypes(include=[np.number]).columns + for col in numerical_cols: + if col in ['interval_timestamp', 'backup_id']: + continue + + col_data = symbol_data[col].dropna() + if len(col_data) > 0: + symbol_stats['typical_values'][col] = { + 'mean': col_data.mean(), + 'median': col_data.median(), + 'std': col_data.std(), + 'min': col_data.min(), + 'max': col_data.max(), + 'q25': col_data.quantile(0.25), + 'q75': col_data.quantile(0.75), + 'recent_mean': col_data.tail(min(10, len(col_data))).mean(), # Last 10 values + 'data_points': len(col_data) + } + + # Calculate volatility + if len(col_data) > 1: + symbol_stats['volatility'][col] = col_data.std() / (col_data.mean() + 1e-8) + + # Calculate trend if we have timestamp data + if 'interval_timestamp' in symbol_data.columns and len(col_data) >= 3: + # Simple linear trend + valid_rows = symbol_data[col].notna() + if valid_rows.sum() >= 3: + x = np.arange(len(symbol_data[valid_rows])) + y = symbol_data.loc[valid_rows, col].values + try: + trend_slope = np.polyfit(x, y, 1)[0] + symbol_stats['trends'][col] = trend_slope + except: + symbol_stats['trends'][col] = 0 + + stats[symbol] = symbol_stats + + return stats + + def _temporal_interpolation_fill(self, df, symbol, column): + """ + Fill nulls using temporal interpolation within the same symbol + + Priority: + 1. Linear interpolation between known values + 2. Forward fill from last known value + 3. Backward fill from next known value + 4. Exponential smoothing for trend continuation + """ + try: + symbol_mask = df['symbol'] == symbol + symbol_data = df.loc[symbol_mask].copy() + + if column not in symbol_data.columns or symbol_data[column].notna().sum() == 0: + return None + + # Sort by timestamp if available and remove duplicates + if 'interval_timestamp' in symbol_data.columns: + symbol_data = symbol_data.sort_values('interval_timestamp') + # Drop duplicate timestamps for this symbol to avoid reindex issues + symbol_data = symbol_data.drop_duplicates(subset=['interval_timestamp'], keep='first') + + # Reset index to avoid any index issues + symbol_data = symbol_data.reset_index(drop=True) + filled_series = symbol_data[column].copy() + + # 1. 
Linear interpolation (works best with timestamp ordering) + if 'interval_timestamp' in symbol_data.columns and len(symbol_data) > 1: + # Try time-based interpolation with safe fallback + try: + original_index = filled_series.index + datetime_index = pd.to_datetime(symbol_data['interval_timestamp'], unit='ms') + + # Ensure unique datetime index + if datetime_index.duplicated().any(): + # Add microseconds to make unique + for i, is_dup in enumerate(datetime_index.duplicated(keep='first')): + if is_dup: + datetime_index.iloc[i] += pd.Timedelta(microseconds=i+1) + + filled_series.index = datetime_index + filled_series = filled_series.interpolate(method='time') + filled_series.index = original_index # Restore original index + except Exception: + # Fallback to linear interpolation if time interpolation fails + filled_series = filled_series.interpolate(method='linear') + else: + filled_series = filled_series.interpolate(method='linear') + + # 2. Forward fill + filled_series = filled_series.ffill() + + # 3. Backward fill + filled_series = filled_series.bfill() + + # 4. If still has nulls, use trend extrapolation + if filled_series.isna().any() and symbol in self.symbol_stats: + symbol_stat = self.symbol_stats[symbol] + if column in symbol_stat.get('typical_values', {}): + typical_val = symbol_stat['typical_values'][column]['recent_mean'] + trend = symbol_stat.get('trends', {}).get(column, 0) + + # Apply trend-based extrapolation for remaining nulls + for idx in filled_series[filled_series.isna()].index: + # Simple trend continuation + filled_series[idx] = typical_val + trend * (idx % 10) # Modest trend application + + return filled_series + + except Exception as e: + # If all else fails, return None to trigger fallback behavior + print(f"Warning: Temporal interpolation failed for {symbol} {column}: {e}") + return None + + def _similar_symbol_fill(self, df, symbol, column, asset_type): + """ + Fill nulls using similar symbols in the same asset class + """ + if asset_type == 'crypto': + # For crypto, use symbols with similar rank or market cap + target_stats = self.symbol_stats.get(symbol, {}) + target_rank = target_stats.get('typical_values', {}).get('rank', {}).get('median', 999) + + similar_symbols = [] + for sym, stats in self.symbol_stats.items(): + if sym == symbol: + continue + + sym_rank = stats.get('typical_values', {}).get('rank', {}).get('median', 999) + if abs(sym_rank - target_rank) <= 50: # Similar rank range + similar_symbols.append(sym) + + else: # stock + # For stocks, use symbols with similar market cap or sector + target_stats = self.symbol_stats.get(symbol, {}) + target_mcap = target_stats.get('typical_values', {}).get('marketCapitalization', {}).get('median', 0) + + similar_symbols = [] + for sym, stats in self.symbol_stats.items(): + if sym == symbol: + continue + + sym_mcap = stats.get('typical_values', {}).get('marketCapitalization', {}).get('median', 0) + if target_mcap > 0 and sym_mcap > 0: + ratio = max(sym_mcap, target_mcap) / min(sym_mcap, target_mcap) + if ratio <= 5: # Within 5x market cap + similar_symbols.append(sym) + + if not similar_symbols: + return None + + # Get values from similar symbols + similar_data = df[df['symbol'].isin(similar_symbols)][column].dropna() + if len(similar_data) > 0: + # Use weighted average based on similarity + return similar_data.median() # Robust central tendency + + return None + + def _intelligent_symbol_fill(self, df, symbol, column): + """ + Intelligent filling strategy prioritizing symbol-specific data + + Returns the best 
estimate for null values in the specified column for the given symbol + """ + # Strategy 1: Temporal interpolation within same symbol + temporal_result = self._temporal_interpolation_fill(df, symbol, column) + if temporal_result is not None and temporal_result.notna().any(): + return temporal_result + + # Strategy 2: Use historical statistics from same symbol + if symbol in self.symbol_stats and column in self.symbol_stats[symbol]['typical_values']: + stats = self.symbol_stats[symbol]['typical_values'][column] + + # Choose appropriate central tendency based on data characteristics + if stats['data_points'] >= 10: + # Use recent mean for frequently updated data + return stats['recent_mean'] + elif stats['data_points'] >= 3: + # Use median for small datasets (more robust) + return stats['median'] + else: + # Use mean for very small datasets + return stats['mean'] + + # Strategy 3: Use similar symbols + asset_type = 'crypto' if symbol in df.columns and any( + col in df.columns for col in ['rank', 'dominance', 'performance.day'] + ) else 'stock' + + similar_fill = self._similar_symbol_fill(df, symbol, column, asset_type) + if similar_fill is not None: + return similar_fill + + # Strategy 4: Global fallback with symbol variation + return None # Will be handled by existing default logic + + def _define_crypto_defaults(self): + """Define intelligent defaults for crypto-specific columns""" + return { + # Crypto market data + 'dominance': 0.001, # Very small dominance for minor cryptos + 'rank': 999, # Low rank for unknown cryptos + 'stable': 0, # Most cryptos are not stablecoins (use 0 instead of False) + 'marketcap': 1000000, # $1M default market cap + 'transaction_count': 100, # Minimal transaction count + 'transaction_volume': 10000, # Minimal transaction volume + 'tx_price_correlation': 0.5, # Neutral correlation + + # Exchange prices (use main price as baseline) + 'exchangePrices.binance': None, # Will be filled with main price + 'exchangePrices.coinbase': None, + 'exchangePrices.kraken': None, + 'exchangePrices.bybit': None, + 'exchangePrices.kucoin': None, + 'exchangePrices.okx': None, + 'exchangePrices.mexc': None, + 'exchangePrices.gateio': None, + 'exchangePrices.bitget': None, + 'exchangePrices.bitmart': None, + 'exchangePrices.bingx': None, + 'exchangePrices.cryptocom': None, + + # Exchange symbols (use main symbol as baseline) + 'symbols.binance': None, # Will be filled with main symbol + 'symbols.coinbase': None, + 'symbols.kraken': None, + 'symbols.bybit': None, + 'symbols.kucoin': None, + 'symbols.okx': None, + 'symbols.mexc': None, + 'symbols.gateio': None, + 'symbols.bitget': None, + 'symbols.bitmart': None, + 'symbols.bingx': None, + 'symbols.cryptocom': None, + + # Performance metrics (neutral/small changes) + 'performance.day': 0.0, + 'performance.hour': 0.0, + 'performance.hour4': 0.0, + 'performance.min1': 0.0, + 'performance.min15': 0.0, + 'performance.min5': 0.0, + 'performance.month': 0.0, + 'performance.month3': 0.0, + 'performance.week': 0.0, + 'performance.year': 0.0, + + # Rank differences (no change) + 'rankDiffs.day': 0, + 'rankDiffs.hour': 0, + 'rankDiffs.hour4': 0, + 'rankDiffs.min1': 0, + 'rankDiffs.min15': 0, + 'rankDiffs.min5': 0, + 'rankDiffs.month': 0, + 'rankDiffs.month3': 0, + 'rankDiffs.week': 0, + 'rankDiffs.year': 0, + + # Technical indicators + 'bb_width': 0.02, # Small bollinger band width + 'cg_id': None, # Will be derived from symbol + } + + def _define_stock_defaults(self): + """Define intelligent defaults for stock-specific columns""" + return { + 
# Stock market data + 'stock_market': 'NASDAQ', # Default market + 'marketCapitalization': 1000000000, # $1B default + 'shareOutstanding': 100000000, # 100M shares default + 'mspr': 0, # Neutral momentum + + # News and sentiment data + 'news_activity_score_x': 0, + 'news_activity_score_y': 0, + 'news_articles_count_x': 0, + 'news_articles_count_y': 0, + 'news_highlights_count_x': 0, + 'news_highlights_count_y': 0, + 'news_match_score_max_x': 0, + 'news_match_score_max_y': 0, + 'news_match_score_mean_x': 0, + 'news_match_score_mean_y': 0, + 'news_mentions_count_x': 0, + 'news_mentions_count_y': 0, + 'news_sentiment_max_x': 0.5, # Neutral sentiment + 'news_sentiment_max_y': 0.5, + 'news_sentiment_mean_x': 0.5, + 'news_sentiment_mean_y': 0.5, + 'news_sentiment_min_x': 0.5, + 'news_sentiment_min_y': 0.5, + 'news_sentiment_range_x': 0, + 'news_sentiment_range_y': 0, + 'news_sentiment_std': 0, + 'news_sentiment_std_x': 0, + 'news_sentiment_std_y': 0, + + # Analyst ratings + 'buy': 5, # Moderate buy recommendations + 'hold': 10, # More hold recommendations + 'sell': 2, # Few sell recommendations + 'strongBuy': 3, + 'strongSell': 1, + + # Technical indicators + 'volume_price_momentum': 0.0, # Neutral momentum + } + + def _create_symbol_profiles(self, df): + """Create profiles for each symbol to guide intelligent filling""" + profiles = {} + + for symbol in df['symbol'].unique(): + symbol_data = df[df['symbol'] == symbol] + + # Determine if it's crypto or stock + is_crypto = 'rank' in symbol_data.columns and symbol_data['rank'].notna().any() + if not is_crypto: + is_crypto = any(col.startswith('performance.') for col in symbol_data.columns) + + # Calculate key statistics + profile = { + 'symbol': symbol, + 'is_crypto': is_crypto, + 'total_records': len(symbol_data), + 'data_density': symbol_data.notna().mean().mean(), + 'has_price_data': 'price' in symbol_data.columns and symbol_data['price'].notna().any(), + 'typical_price': symbol_data.get('price', pd.Series([100])).median(), + 'typical_volume': symbol_data.get('volume', pd.Series([1000000])).median(), + 'typical_marketcap': symbol_data.get('marketcap', symbol_data.get('marketCapitalization', pd.Series([1000000000]))).median() + } + + profiles[symbol] = profile + + return profiles + + def _intelligent_fill_value(self, df, symbol, column, default_value): + """Generate intelligent fill value based on symbol context""" + profile = self.symbol_profiles.get(symbol, {}) + + # Add symbol-specific variation to prevent homogenization + symbol_hash = hash(f"{symbol}_{column}") % 1000 + variation_factor = (symbol_hash / 1000.0 - 0.5) * 0.1 # ±5% variation + + if default_value is None: + return None + elif isinstance(default_value, (int, float)): + if default_value == 0: + return 0 # Keep zeros as zeros + else: + return default_value * (1 + variation_factor) + else: + return default_value + + def _fill_exchange_prices_advanced(self, df): + """Advanced exchange price filling using symbol-first strategy""" + exchange_price_cols = [col for col in df.columns if col.startswith('exchangePrices.')] + + if not exchange_price_cols or 'price' not in df.columns: + return df + + df_result = df.copy() + + for symbol in df['symbol'].unique(): + symbol_mask = df['symbol'] == symbol + symbol_data = df.loc[symbol_mask] + + # First try to get main price from symbol's own data + main_price_series = self._intelligent_symbol_fill(df, symbol, 'price') + if main_price_series is None or (isinstance(main_price_series, pd.Series) and main_price_series.isna().all()): + continue + + if 
isinstance(main_price_series, pd.Series): + main_price = main_price_series.median() + else: + main_price = main_price_series + + if pd.isna(main_price): + continue + + # Fill exchange prices for this symbol + for exchange_col in exchange_price_cols: + if symbol_data[exchange_col].isna().any(): + # First try temporal interpolation for this exchange + exchange_filled = self._intelligent_symbol_fill(df, symbol, exchange_col) + + if exchange_filled is not None: + if isinstance(exchange_filled, pd.Series): + df_result.loc[symbol_mask, exchange_col] = exchange_filled + else: + null_mask = df_result.loc[symbol_mask, exchange_col].isna() + df_result.loc[symbol_mask & null_mask, exchange_col] = exchange_filled + else: + # Fallback: use main price with small exchange-specific variation + exchange_hash = hash(f"{symbol}_{exchange_col}") % 100 + variation = (exchange_hash / 100.0 - 0.5) * 0.01 # ±0.5% + exchange_price = main_price * (1 + variation) + null_mask = df_result.loc[symbol_mask, exchange_col].isna() + df_result.loc[symbol_mask & null_mask, exchange_col] = exchange_price + + return df_result + + def _fill_exchange_symbols(self, df): + """Fill exchange symbols with main symbol + exchange-specific formatting""" + exchange_symbol_cols = [col for col in df.columns if col.startswith('symbols.')] + + if not exchange_symbol_cols or 'symbol' not in df.columns: + return df + + df_result = df.copy() + + # Exchange-specific symbol formatting + exchange_formats = { + 'symbols.binance': lambda s: f"{s.upper()}USDT" if s.lower() != 'bitcoin' else "BTCUSDT", + 'symbols.coinbase': lambda s: f"{s.upper()}-USD", + 'symbols.kraken': lambda s: f"{s.upper()}USD" if len(s) <= 3 else f"{s.upper()}/USD", + 'symbols.bybit': lambda s: f"{s.upper()}USDT", + 'symbols.kucoin': lambda s: f"{s.upper()}-USDT", + 'symbols.okx': lambda s: f"{s.upper()}-USDT", + 'symbols.mexc': lambda s: f"{s.upper()}_USDT", + 'symbols.gateio': lambda s: f"{s.upper()}_USDT", + 'symbols.bitget': lambda s: f"{s.upper()}USDT", + 'symbols.bitmart': lambda s: f"{s.upper()}_USDT", + 'symbols.bingx': lambda s: f"{s.upper()}-USDT", + 'symbols.cryptocom': lambda s: f"{s.upper()}_USDT" + } + + for symbol in df['symbol'].unique(): + symbol_mask = df['symbol'] == symbol + + for exchange_col in exchange_symbol_cols: + if df.loc[symbol_mask, exchange_col].isna().all(): + formatter = exchange_formats.get(exchange_col, lambda s: s.upper()) + try: + exchange_symbol = formatter(symbol) + df_result.loc[symbol_mask, exchange_col] = exchange_symbol + except Exception: + df_result.loc[symbol_mask, exchange_col] = symbol.upper() + + return df_result + + def _fill_cg_id(self, df): + """Fill CoinGecko ID based on symbol""" + if 'cg_id' not in df.columns: + return df + + df_result = df.copy() + + # Common CoinGecko ID mappings + cg_id_mapping = { + 'bitcoin': 'bitcoin', + 'btc': 'bitcoin', + 'ethereum': 'ethereum', + 'eth': 'ethereum', + 'binancecoin': 'binancecoin', + 'bnb': 'binancecoin', + 'cardano': 'cardano', + 'ada': 'cardano', + 'solana': 'solana', + 'sol': 'solana', + 'xrp': 'ripple', + 'ripple': 'ripple', + 'dogecoin': 'dogecoin', + 'doge': 'dogecoin', + 'polkadot': 'polkadot', + 'dot': 'polkadot', + 'avalanche-2': 'avalanche-2', + 'avax': 'avalanche-2', + 'chainlink': 'chainlink', + 'link': 'chainlink', + 'polygon': 'matic-network', + 'matic': 'matic-network', + 'litecoin': 'litecoin', + 'ltc': 'litecoin', + 'uniswap': 'uniswap', + 'uni': 'uniswap' + } + + for symbol in df['symbol'].unique(): + symbol_mask = df['symbol'] == symbol + + if df.loc[symbol_mask, 
'cg_id'].isna().all(): + cg_id = cg_id_mapping.get(symbol.lower(), symbol.lower()) + df_result.loc[symbol_mask, 'cg_id'] = cg_id + + return df_result + + def process_crypto_features(self, df): + """Process crypto features with advanced symbol-first null handling""" + print("Processing crypto features with symbol-first strategy...") + df_result = df.copy() + + # Step 1: Analyze symbol statistics for intelligent filling + print("Analyzing symbol statistics...") + self.symbol_stats = self._analyze_symbol_statistics(df_result) + print(f"Analyzed {len(self.symbol_stats)} symbols") + + # Step 2: Create symbol profiles + self.symbol_profiles = self._create_symbol_profiles(df_result) + + # Step 3: Symbol-first null handling for key columns + priority_columns = [ + 'price', 'volume', 'marketcap', 'dominance', 'rank', + 'performance.day', 'performance.week', 'performance.month', + 'rsi', 'macd', 'transaction_count', 'transaction_volume' + ] + + for column in priority_columns: + if column in df_result.columns and df_result[column].isna().any(): + print(f"Processing {column} with symbol-first strategy...") + + for symbol in df_result['symbol'].unique(): + symbol_mask = df_result['symbol'] == symbol + null_mask = df_result[column].isna() + fill_mask = symbol_mask & null_mask + + if fill_mask.any(): + # Use intelligent symbol-first filling + fill_result = self._intelligent_symbol_fill(df_result, symbol, column) + + if fill_result is not None: + if isinstance(fill_result, pd.Series): + # If we got a series back (from temporal interpolation) + # Make sure the series aligns with the symbol mask + symbol_indices = df_result[symbol_mask].index + if len(fill_result) == len(symbol_indices): + # Map the series values to the correct indices + for i, idx in enumerate(symbol_indices): + if pd.notna(fill_result.iloc[i]): + df_result.loc[idx, column] = fill_result.iloc[i] + else: + # Fallback: use median of the series + fill_value = fill_result.median() + if pd.notna(fill_value): + df_result.loc[fill_mask, column] = fill_value + else: + # If we got a scalar value + df_result.loc[fill_mask, column] = fill_result + + # Step 4: Handle exchange prices with cross-reference to main price + df_result = self._fill_exchange_prices_advanced(df_result) + + # Step 5: Handle exchange symbols with proper formatting + df_result = self._fill_exchange_symbols(df_result) + + # Step 6: Handle CoinGecko IDs + df_result = self._fill_cg_id(df_result) + + # Step 7: Fill remaining columns with intelligent defaults + for column in df_result.columns: + if df_result[column].isna().any(): + default_value = self.crypto_column_defaults.get(column) + + if default_value is not None: + for symbol in df_result['symbol'].unique(): + symbol_mask = df_result['symbol'] == symbol + null_mask = df_result[column].isna() + fill_mask = symbol_mask & null_mask + + if fill_mask.any(): + try: + fill_value = self._intelligent_fill_value( + df_result, symbol, column, default_value + ) + df_result.loc[fill_mask, column] = fill_value + except Exception as e: + print(f"Warning: Failed to fill {column} for {symbol}: {e}") + # Skip this column for this symbol + continue + + return df_result + + def process_stock_features(self, df): + """Process stock features with advanced symbol-first null handling""" + print("Processing stock features with symbol-first strategy...") + df_result = df.copy() + + # Step 1: Analyze symbol statistics for intelligent filling + print("Analyzing symbol statistics...") + self.symbol_stats = self._analyze_symbol_statistics(df_result) + 
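+ # NOTE: the fill order below mirrors the crypto path: temporal interpolation
+ # within a symbol, then that symbol's own history, then similar symbols,
+ # then hedged defaults.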
print(f"Analyzed {len(self.symbol_stats)} symbols") + + # Step 2: Create symbol profiles + self.symbol_profiles = self._create_symbol_profiles(df_result) + + # Step 3: Symbol-first null handling for key columns + priority_columns = [ + 'close', 'open', 'high', 'low', 'volume', 'prev_close', + 'marketCapitalization', 'shareOutstanding', + 'rsi', 'macd', 'atr', 'bb_position', + 'news_sentiment_mean_x', 'news_sentiment_mean_y', + 'buy', 'sell', 'hold', 'strongBuy', 'strongSell' + ] + + for column in priority_columns: + if column in df_result.columns and df_result[column].isna().any(): + print(f"Processing {column} with symbol-first strategy...") + + for symbol in df_result['symbol'].unique(): + symbol_mask = df_result['symbol'] == symbol + null_mask = df_result[column].isna() + fill_mask = symbol_mask & null_mask + + if fill_mask.any(): + # Use intelligent symbol-first filling + fill_result = self._intelligent_symbol_fill(df_result, symbol, column) + + if fill_result is not None: + if isinstance(fill_result, pd.Series): + # If we got a series back (from temporal interpolation) + # Make sure the series aligns with the symbol mask + symbol_indices = df_result[symbol_mask].index + if len(fill_result) == len(symbol_indices): + # Map the series values to the correct indices + for i, idx in enumerate(symbol_indices): + if pd.notna(fill_result.iloc[i]): + df_result.loc[idx, column] = fill_result.iloc[i] + else: + # Fallback: use median of the series + fill_value = fill_result.median() + if pd.notna(fill_value): + df_result.loc[fill_mask, column] = fill_value + else: + # If we got a scalar value + df_result.loc[fill_mask, column] = fill_result + + # Step 4: Fill remaining columns with intelligent defaults + for column in df_result.columns: + if df_result[column].isna().any(): + default_value = self.stock_column_defaults.get(column) + + if default_value is not None: + for symbol in df_result['symbol'].unique(): + symbol_mask = df_result['symbol'] == symbol + null_mask = df_result[column].isna() + fill_mask = symbol_mask & null_mask + + if fill_mask.any(): + try: + fill_value = self._intelligent_fill_value( + df_result, symbol, column, default_value + ) + df_result.loc[fill_mask, column] = fill_value + except Exception as e: + print(f"Warning: Failed to fill {column} for {symbol}: {e}") + # Skip this column for this symbol + continue + + return df_result + + def generate_report(self, df_before, df_after, feature_type): + """Generate a comprehensive report of null value handling with symbol-first strategy details""" + before_nulls = df_before.isnull().sum() + after_nulls = df_after.isnull().sum() + + null_reduction = before_nulls - after_nulls + columns_fixed = null_reduction[null_reduction > 0] + + # Analyze symbol coverage + symbol_analysis = {} + if 'symbol' in df_before.columns: + for symbol in df_before['symbol'].unique(): + symbol_before = int(df_before[df_before['symbol'] == symbol].isnull().sum().sum()) + symbol_after = int(df_after[df_after['symbol'] == symbol].isnull().sum().sum()) + symbol_analysis[symbol] = { + 'nulls_before': symbol_before, + 'nulls_after': symbol_after, + 'nulls_filled': symbol_before - symbol_after, + 'records': int(len(df_before[df_before['symbol'] == symbol])) + } + + # Analyze temporal coverage if timestamp available + temporal_analysis = {} + if 'interval_timestamp' in df_before.columns: + df_before_ts = df_before.copy() + df_after_ts = df_after.copy() + df_before_ts['date'] = pd.to_datetime(df_before_ts['interval_timestamp'], unit='ms').dt.date + df_after_ts['date'] 
= pd.to_datetime(df_after_ts['interval_timestamp'], unit='ms').dt.date + + for date in df_before_ts['date'].unique(): + date_before = int(df_before_ts[df_before_ts['date'] == date].isnull().sum().sum()) + date_after = int(df_after_ts[df_after_ts['date'] == date].isnull().sum().sum()) + temporal_analysis[str(date)] = { + 'nulls_before': date_before, + 'nulls_after': date_after, + 'nulls_filled': date_before - date_after + } + + report = { + 'feature_type': feature_type, + 'timestamp': pd.Timestamp.now().isoformat(), + 'strategy': 'symbol-first-temporal-interpolation', + 'total_rows': int(len(df_after)), + 'total_columns': int(len(df_after.columns)), + 'unique_symbols': int(len(df_after['symbol'].unique())) if 'symbol' in df_after.columns else 0, + 'columns_with_nulls_before': int((before_nulls > 0).sum()), + 'columns_with_nulls_after': int((after_nulls > 0).sum()), + 'total_nulls_before': int(before_nulls.sum()), + 'total_nulls_after': int(after_nulls.sum()), + 'total_nulls_filled': int(null_reduction.sum()), + 'columns_fixed': int(len(columns_fixed)), + 'null_reduction_rate': float((null_reduction.sum() / before_nulls.sum()) if before_nulls.sum() > 0 else 0), + 'remaining_null_columns': {str(k): int(v) for k, v in after_nulls[after_nulls > 0].to_dict().items()}, + 'fixed_columns_detail': {str(k): int(v) for k, v in null_reduction[null_reduction > 0].to_dict().items()}, + 'symbol_analysis': symbol_analysis, + 'temporal_analysis': temporal_analysis, + 'strategy_details': { + 'symbol_stats_analyzed': len(self.symbol_stats), + 'temporal_interpolation_used': True, + 'similar_symbol_fallback': True, + 'intelligent_defaults': True + } + } + + return report + + +def process_crypto_features_file(input_path, output_path=None): + """Process crypto features file""" + if output_path is None: + output_path = input_path + + print(f"Loading crypto features from {input_path}...") + df = pd.read_parquet(input_path) + + print(f"Loaded {len(df)} rows with {len(df.columns)} columns") + print(f"Null values before processing: {df.isnull().sum().sum()}") + + handler = FinalNullValueHandler() + df_processed = handler.process_crypto_features(df) + + print(f"Null values after processing: {df_processed.isnull().sum().sum()}") + + # Generate report + report = handler.generate_report(df, df_processed, 'crypto') + + # Save processed data + df_processed.to_parquet(output_path, index=False) + print(f"Saved processed crypto features to {output_path}") + + # Save report + report_path = str(output_path).replace('.parquet', '_null_handling_report.json') + with open(report_path, 'w') as f: + json.dump(report, f, indent=2) + print(f"Saved processing report to {report_path}") + + return df_processed, report + + +def process_stock_features_file(input_path, output_path=None): + """Process stock features file""" + if output_path is None: + output_path = input_path + + print(f"Loading stock features from {input_path}...") + df = pd.read_parquet(input_path) + + print(f"Loaded {len(df)} rows with {len(df.columns)} columns") + print(f"Null values before processing: {df.isnull().sum().sum()}") + + handler = FinalNullValueHandler() + df_processed = handler.process_stock_features(df) + + print(f"Null values after processing: {df_processed.isnull().sum().sum()}") + + # Generate report + report = handler.generate_report(df, df_processed, 'stock') + + # Save processed data + df_processed.to_parquet(output_path, index=False) + print(f"Saved processed stock features to {output_path}") + + # Save report + report_path = 
str(output_path).replace('.parquet', '_null_handling_report.json') + with open(report_path, 'w') as f: + json.dump(report, f, indent=2) + print(f"Saved processing report to {report_path}") + + return df_processed, report + + +def main(): + """Main function to process both crypto and stock features""" + crypto_path = Path("data/merged/features/crypto_features.parquet") + stocks_path = Path("data/merged/features/stocks_features.parquet") + + processed_files = [] + + # Process crypto features + if crypto_path.exists(): + try: + df_crypto, report_crypto = process_crypto_features_file(crypto_path) + processed_files.append(('crypto', crypto_path, report_crypto)) + print(f"✓ Crypto features processed: {report_crypto['total_nulls_filled']} nulls filled") + except Exception as e: + print(f"✗ Error processing crypto features: {e}") + else: + print(f"Warning: {crypto_path} not found") + + # Process stock features + if stocks_path.exists(): + try: + df_stocks, report_stocks = process_stock_features_file(stocks_path) + processed_files.append(('stocks', stocks_path, report_stocks)) + print(f"✓ Stock features processed: {report_stocks['total_nulls_filled']} nulls filled") + except Exception as e: + print(f"✗ Error processing stock features: {e}") + else: + print(f"Warning: {stocks_path} not found") + + # Summary report + if processed_files: + print("\n" + "="*60) + print("FINAL NULL VALUE HANDLING SUMMARY") + print("="*60) + + total_nulls_filled = 0 + for file_type, file_path, report in processed_files: + total_nulls_filled += report['total_nulls_filled'] + print(f"\n{file_type.upper()} FEATURES:") + print(f" File: {file_path}") + print(f" Rows: {report['total_rows']:,}") + print(f" Columns: {report['total_columns']}") + print(f" Nulls filled: {report['total_nulls_filled']:,}") + print(f" Columns fixed: {report['columns_fixed']}") + print(f" Remaining null columns: {len(report['remaining_null_columns'])}") + + if report['remaining_null_columns']: + print(f" Still have nulls: {list(report['remaining_null_columns'].keys())}") + + print(f"\nTOTAL NULLS FILLED ACROSS ALL FILES: {total_nulls_filled:,}") + print("="*60) + else: + print("No files were processed successfully.") + + +if __name__ == "__main__": + main() diff --git a/src/merge/final_verification.py b/src/merge/final_verification.py new file mode 100644 index 0000000000000000000000000000000000000000..a06c3581d10f58db4d32249786812407a2375cf3 --- /dev/null +++ b/src/merge/final_verification.py @@ -0,0 +1,41 @@ +#!/usr/bin/env python3 + +import pandas as pd + +def main(): + print("=== FINAL CRYPTO SYMBOL VERIFICATION ===") + + # Load crypto features + df = pd.read_parquet('data/merged/features/crypto_features.parquet') + + print(f"Total rows: {len(df)}") + print(f"Total columns: {len(df.columns)}") + + # Symbol analysis + null_symbols = df['symbol'].isnull().sum() + total_rows = len(df) + null_percentage = (null_symbols / total_rows) * 100 + + print(f"Null symbols: {null_symbols} ({null_percentage:.1f}%)") + print(f"Unique symbols: {df['symbol'].nunique()}") + + print("\nTop 10 symbols by count:") + print(df['symbol'].value_counts().head(10)) + + print("\nSample of successfully extracted symbols:") + sample = df[df['symbol'].notna()][['symbol', 'cg_id']].head(10) + for _, row in sample.iterrows(): + print(f" {row['symbol']} -> {row['cg_id']}") + + if null_symbols > 0: + print(f"\nRows with remaining null symbols:") + null_rows = df[df['symbol'].isnull()][['symbol', 'cg_id', 'symbols.binance', 'symbols.bybit']] + print(null_rows.to_string(index=False)) 
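+ # Rows listed here had no usable symbols.* value (or none that matched the
+ # extraction patterns), so the symbol could not be recovered upstream.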
+ + print("\n=== SUCCESS METRICS ===") + print(f"✅ Symbol extraction success rate: {((total_rows - null_symbols) / total_rows) * 100:.1f}%") + print(f"✅ Total symbols populated: {total_rows - null_symbols}") + print(f"✅ Pipeline integration: Complete") + +if __name__ == "__main__": + main() diff --git a/src/merge/finhub/company_info.py b/src/merge/finhub/company_info.py new file mode 100644 index 0000000000000000000000000000000000000000..c189f61887ec2fc631d762da4ea4acb269b5e2d7 --- /dev/null +++ b/src/merge/finhub/company_info.py @@ -0,0 +1,75 @@ +import os +from pathlib import Path +import pandas as pd +import glob + +# Resolve DATA_DIR similar to other modules +try: + from src.config import DATA_DIR as CFG_DATA_DIR # when run as module +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR # when run as script + except Exception: + CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: + """Map a repo-style path like 'data/...' to /...; keep absolute paths as-is.""" + p = Path(path_like) + if p.is_absolute(): + return p + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return Path(CFG_DATA_DIR) / rel + +def load_company_profiles(profiles_dir): + """ + Load all company profile parquet files from the directory into a DataFrame. + Returns a DataFrame indexed by symbol. + """ + profile_files = glob.glob(os.path.join(profiles_dir, '*_company_profile.parquet')) + profiles = [] + for file in profile_files: + df = pd.read_parquet(file) + # Extract symbol from filename + symbol = os.path.basename(file).split('_')[0] + df['symbol'] = symbol + profiles.append(df) + if profiles: + profiles_df = pd.concat(profiles, ignore_index=True) + profiles_df.set_index('symbol', inplace=True) + return profiles_df + else: + return pd.DataFrame() + +def merge_company_info_to_features(features_path, profiles_dir, output_path): + """ + Merge company profile info into stocks features DataFrame by symbol. 
+ """ + # Resolve all paths under DATA_DIR + features_path = _resolve_under_data(features_path) + profiles_dir = _resolve_under_data(profiles_dir) + output_path = _resolve_under_data(output_path) + # Load features + features_df = pd.read_parquet(features_path) + # Load company profiles + profiles_df = load_company_profiles(profiles_dir) + # Merge on symbol + merged_df = features_df.join(profiles_df, on='symbol', rsuffix='_company') + # Save result + merged_df.to_parquet(output_path, compression='snappy') + return merged_df + +# Example usage +def main(): + features_path = "data/merged/features/stocks_features.parquet" + profiles_dir = "data/finnhub/company_info" + output_path = features_path + merge_company_info_to_features(features_path, profiles_dir, output_path) + print(f"[INFO] Merged company info into features and saved to: {output_path}") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/finhub/quote.py b/src/merge/finhub/quote.py new file mode 100644 index 0000000000000000000000000000000000000000..b478ec1f2e424108ed4464d305e5480b461228cc --- /dev/null +++ b/src/merge/finhub/quote.py @@ -0,0 +1,115 @@ +import os +from pathlib import Path +import pandas as pd +import glob +import json + +# Resolve DATA_DIR similar to other modules +try: + from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR + except Exception: + CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: + p = Path(path_like) + if p.is_absolute(): + return p + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return Path(CFG_DATA_DIR) / rel + +def add_latest_quotes_to_features(features_path, quotes_dir, output_path): + # Resolve paths under DATA_DIR + features_path = _resolve_under_data(features_path) + quotes_dir = _resolve_under_data(quotes_dir) + output_path = _resolve_under_data(output_path) + + # Load features + features_df = pd.read_parquet(features_path) + + # Load all quote JSONs + quote_rows = [] + for file in glob.glob(os.path.join(str(quotes_dir), '*_current_quote.parquet')): + try: + df = pd.read_parquet(file) + # If DataFrame has a 'data' column, expand it + if 'data' in df.columns: + import numpy as np + data_list = df['data'].tolist() + if data_list and isinstance(data_list[0], np.ndarray): + flat_list = [dict(item) for item in data_list[0]] + df = pd.DataFrame.from_records(flat_list) + elif data_list and isinstance(data_list[0], dict): + df = pd.DataFrame.from_records(data_list) + elif data_list and isinstance(data_list[0], list): + expected_cols = ["c", "d", "dp", "h", "l", "o", "pc", "t"] + df = pd.DataFrame(data_list, columns=expected_cols[:len(data_list[0])]) + else: + df = pd.DataFrame() + # If DataFrame has only one row, convert to dict + if not df.empty: + record = df.iloc[0].to_dict() + record['symbol'] = os.path.basename(file).split('_')[0] + quote_rows.append(record) + except Exception as e: + print(f"[WARN] Skipping {file}: {e}") + + if not quote_rows: + print("[WARN] No valid quote data found to merge. 
Output not updated.") + return + + quotes_df = pd.DataFrame(quote_rows).set_index('symbol') + + def merge_quote_into_row(row): + symbol = row['symbol'] + if symbol not in quotes_df.index: + return row + + quote = quotes_df.loc[symbol] + + field_map = { + 'o': 'open', + 'h': 'high', + 'l': 'low', + 'c': 'close', + 'd': 'change', + 'dp': 'price_change_1', + } + + for q_key, f_key in field_map.items(): + val = quote.get(q_key) + if pd.notnull(val): + if f_key in features_df.columns: + row[f_key] = val + else: + row[f'{f_key}_quote'] = val # if feature doesn’t exist, add it + + # Add extra fields + if pd.notnull(quote.get('pc')): + row['prev_close'] = quote['pc'] + + if pd.notnull(quote.get('t')): + row['timestamp'] = quote['t'] * 1000 + row['datetime'] = pd.to_datetime(quote['t'], unit='s') + + return row + + features_df = features_df.apply(merge_quote_into_row, axis=1) + features_df.to_parquet(output_path, index=False, compression='snappy') + print(f"[INFO] Added latest quote data for all available symbols and saved to: {output_path}") + +def main(): + features_path = "data/merged/features/stocks_features.parquet" + quotes_dir = "data/finnhub/stock_data" + output_path = features_path + add_latest_quotes_to_features(features_path, quotes_dir, output_path) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/finhub/ratings.py b/src/merge/finhub/ratings.py new file mode 100644 index 0000000000000000000000000000000000000000..6b6580299870dbcec5c18214e72e042cb53bc0a9 --- /dev/null +++ b/src/merge/finhub/ratings.py @@ -0,0 +1,66 @@ +import os +from pathlib import Path +import pandas as pd +import glob + +# Resolve DATA_DIR similar to other modules +try: + from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR + except Exception: + CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: + p = Path(path_like) + if p.is_absolute(): + return p + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return Path(CFG_DATA_DIR) / rel + +def add_latest_ratings_to_features(features_path, ratings_dir, output_path): + # Resolve paths under DATA_DIR + features_path = _resolve_under_data(features_path) + ratings_dir = _resolve_under_data(ratings_dir) + output_path = _resolve_under_data(output_path) + + # Load features + features_df = pd.read_parquet(features_path) + + # Find all ratings files + ratings_files = glob.glob(os.path.join(str(ratings_dir), '*_recommendation_trends.parquet')) + latest_rows = [] + for file in ratings_files: + # Read as Parquet file + df = pd.read_parquet(file) + # Get latest row by period (assuming period is YYYY-MM-DD) + if 'period' in df.columns: + df['period'] = pd.to_datetime(df['period']) + latest = df.sort_values('period', ascending=False).iloc[[0]] + latest_rows.append(latest) + if latest_rows: + all_latest_ratings = pd.concat(latest_rows, ignore_index=True) + else: + all_latest_ratings = pd.DataFrame() + # Merge only if ratings data is available and has 'symbol' column + if not all_latest_ratings.empty and 'symbol' in all_latest_ratings.columns: + merged_df = features_df.merge(all_latest_ratings, on='symbol', how='left', suffixes=('', '_ratings')) + merged_df.to_parquet(output_path, compression='snappy') + print(f"[INFO] Added latest ratings data for all available symbols and saved to: {output_path}") + else: + print("[WARN] No valid ratings data found to merge. 
Output not updated.") + +def main(): + features_path = "data/merged/features/stocks_features.parquet" + ratings_dir = "data/finnhub/ratings" + output_path = features_path + add_latest_ratings_to_features(features_path, ratings_dir, output_path) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/finhub/sentiment.py b/src/merge/finhub/sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..5680ac5406325fb06e73ecc06cdf7ee25d6f2db8 --- /dev/null +++ b/src/merge/finhub/sentiment.py @@ -0,0 +1,86 @@ +import os +from pathlib import Path +import pandas as pd + +try: + from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR + except Exception: + CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: + p = Path(path_like) + if p.is_absolute(): + return p + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return Path(CFG_DATA_DIR) / rel + + +def add_sentiment_to_features(features_path, output_path, sentiment_data): + # Resolve paths under DATA_DIR + features_path = _resolve_under_data(features_path) + output_path = _resolve_under_data(output_path) + + # Load features + features_df = pd.read_parquet(features_path) + + # Load newest sentiment data for all symbols from ownership directory under DATA_DIR + ownership_dir = Path(CFG_DATA_DIR) / 'finnhub' / 'ownership' + import glob + sentiment_files = glob.glob(os.path.join(str(ownership_dir), '*_insider_sentiment.parquet')) + newest_rows = [] + for file in sentiment_files: + df = pd.read_parquet(file) + # If file has a 'data' column, expand it + if 'data' in df.columns: + data_list = df['data'].tolist() + # If first item is a numpy array, flatten to list of dicts + import numpy as np + if data_list and isinstance(data_list[0], np.ndarray): + # Flatten array to list + flat_list = [dict(item) for item in data_list[0]] + df = pd.DataFrame.from_records(flat_list) + elif data_list and isinstance(data_list[0], dict): + df = pd.DataFrame.from_records(data_list) + elif data_list and isinstance(data_list[0], list): + expected_cols = ["change", "month", "mspr", "symbol", "year"] + df = pd.DataFrame(data_list, columns=expected_cols[:len(data_list[0])]) + else: + df = pd.DataFrame() + # Extract symbol from filename if not present + if 'symbol' not in df.columns: + symbol = os.path.basename(file).split('_')[0] + df['symbol'] = symbol + # Only process if both 'year' and 'month' columns exist + if 'year' in df.columns and 'month' in df.columns: + newest = df.sort_values(['year', 'month'], ascending=[False, False]).iloc[[0]] + newest_rows.append(newest) + else: + print(f"[WARN] Skipping {file}: missing 'year' or 'month' column after expansion.") + if newest_rows: + all_newest_sentiment = pd.concat(newest_rows, ignore_index=True) + else: + all_newest_sentiment = pd.DataFrame() + # Merge only if sentiment data is available and has 'symbol' column + if not all_newest_sentiment.empty and 'symbol' in all_newest_sentiment.columns: + merged_df = features_df.merge(all_newest_sentiment, on='symbol', how='left', suffixes=('', '_sentiment')) + # Save result + merged_df.to_parquet(output_path, compression='snappy') + print(f"[INFO] Added newest sentiment data for all available symbols and saved to: {output_path}") + else: + print("[WARN] No valid sentiment data found to merge. 
Output not updated.") + +def main(): + features_path = "data/merged/features/stocks_features.parquet" + output_path = features_path + add_sentiment_to_features(features_path, output_path, None) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/full_report.py b/src/merge/full_report.py new file mode 100644 index 0000000000000000000000000000000000000000..d6ebb8e122453719d5dec9f5a44fea8fa70c593b --- /dev/null +++ b/src/merge/full_report.py @@ -0,0 +1,385 @@ +""" +Unified report generator for merged features - generates all 3 reports with automatic column discovery. +Supports merged, crypto, and stocks feature files with dynamic schema detection. + +Usage: + # Generate all 3 reports + python unified_report_generator.py --generate-all + + # Generate specific reports + python unified_report_generator.py --merged-input data/merged/features/merged_features.parquet + python unified_report_generator.py --crypto-input data/merged/features/crypto_features.parquet + python unified_report_generator.py --stocks-input data/merged/features/stocks_features.parquet + + # Custom paths + python unified_report_generator.py \ + --merged-input path/to/merged.parquet \ + --crypto-input path/to/crypto.parquet \ + --stocks-input path/to/stocks.parquet \ + --output-dir reports/ \ + --baseline-schema schemas/baseline.json +""" + +import argparse +import pandas as pd +import json +import os +from datetime import datetime +from typing import Dict, List, Set, Optional +from pathlib import Path + +def categorize_column_by_name(col_name: str) -> str: + """Automatically categorize columns based on naming patterns.""" + col_lower = col_name.lower() + + # Exchange-related + if col_name.startswith(('symbols.', 'exchangePrices.')): + return "Exchange Data" + + # Performance metrics + if col_name.startswith(('performance.', 'rankDiffs.')): + return "Performance Metrics" + + # Technical indicators + if col_lower in ['rsi', 'macd', 'macd_signal', 'macd_histogram', 'atr', 'bb_width', + 'bb_position', 'stoch_k', 'stoch_d', 'cci', 'mfi'] or col_name.startswith('roc_'): + return "Technical Indicators" + + # Price-related + if any(word in col_lower for word in ['price', 'open', 'volume', 'marketcap', 'volatility']): + return "Price & Volume" + + # On-chain/blockchain + if any(word in col_lower for word in ['transaction', 'gas', 'fees', 'tx_', 'blockchain']): + return "On-chain Features" + + # Sentiment + if any(word in col_lower for word in ['sentiment', 'social', 'confidence']): + return "Sentiment Features" + + # Temporal + if any(word in col_lower for word in ['timestamp', 'hour', 'day', 'weekend', 'trading_hours']): + return "Temporal Features" + + # Completeness metrics + if 'completeness' in col_lower or 'data_quality' in col_lower: + return "Data Quality Metrics" + + # Market/Exchange info + if col_lower in ['dominance', 'rank', 'stable', 'cg_id']: + return "Market Metrics" + + # Flags + if col_name.startswith('is_') or col_lower in ['stable']: + return "Asset Flags" + + # Metadata + if col_name.startswith('_') or col_lower in ['backup_id', 'stock_market', 'blockchain_network']: + return "Metadata" + + # Links + if col_name.startswith('links.'): + return "External Links" + + # Interaction features + if any(word in col_lower for word in ['correlation', 'convergence', 'alignment', 'trend']): + return "Interaction Features" + + # Default for unknown + return "Other Features" + +def load_baseline_schema(baseline_path: str) -> Set[str]: + """Load baseline schema if it exists.""" + if 
os.path.exists(baseline_path): + try: + with open(baseline_path, 'r') as f: + baseline = json.load(f) + return set(baseline.get('columns', [])) + except (json.JSONDecodeError, KeyError): + print(f"Warning: Could not load baseline schema from {baseline_path}") + return set() + +def save_baseline_schema(columns: List[str], baseline_path: str): + """Save current columns as baseline schema.""" + os.makedirs(os.path.dirname(baseline_path), exist_ok=True) + schema = { + "generated_at": datetime.utcnow().isoformat() + "Z", + "total_columns": len(columns), + "columns": sorted(columns) + } + with open(baseline_path, 'w') as f: + json.dump(schema, f, indent=2) + +def detect_asset_type(df: pd.DataFrame, all_columns: List[str]) -> str: + """Detect asset type based on column patterns.""" + if any(col.startswith('symbols.') for col in all_columns): + return "crypto" + elif "stock_market" in all_columns: + return "stocks" + elif "is_crypto" in all_columns and "is_stock" in all_columns: + return "mixed" + else: + return "unknown" + +def get_asset_specific_stats(df: pd.DataFrame, asset_type: str, all_columns: List[str]) -> Dict: + """Get asset-specific statistics.""" + stats = {"asset_type": asset_type} + + if asset_type == "crypto": + # Crypto-specific stats + if "stable" in df.columns: + stats["stable_coins_count"] = int(df["stable"].sum()) + + if "cg_id" in df.columns or "symbol" in df.columns: + symbol_col = "symbol" if "symbol" in df.columns else "cg_id" + stats["unique_crypto_assets"] = df[symbol_col].nunique() + + # Exchange coverage + exchange_columns = [col for col in all_columns if col.startswith(("symbols.", "exchangePrices."))] + if exchange_columns: + exchange_coverage = {} + for col in exchange_columns[:10]: # Limit to avoid huge reports + coverage = (df[col].notna().sum() / len(df)) * 100 + exchange_coverage[col] = round(coverage, 2) + stats["exchange_coverage"] = exchange_coverage + + elif asset_type == "stocks": + # Stock-specific stats + if "symbol" in df.columns: + stats["unique_stock_symbols"] = df["symbol"].nunique() + + if "stock_market" in df.columns: + stats["stock_market_distribution"] = df["stock_market"].value_counts().to_dict() + + if "is_trading_hours" in df.columns: + trading_hours_pct = (df["is_trading_hours"].sum() / len(df)) * 100 + stats["trading_hours_coverage_pct"] = round(trading_hours_pct, 2) + + elif asset_type == "mixed": + # Mixed dataset stats + if "is_crypto" in df.columns: + stats["crypto_records"] = int(df["is_crypto"].sum()) + if "is_stock" in df.columns: + stats["stock_records"] = int(df["is_stock"].sum()) + if "symbol" in df.columns: + stats["total_unique_symbols"] = df["symbol"].nunique() + + return stats + +def generate_report(input_path: str, output_path: str, baseline_schema_path: Optional[str] = None, report_type: str = "auto") -> bool: + """Generate a feature report for any dataset type.""" + + # Check if input file exists + if not os.path.exists(input_path): + print(f"Warning: Input file not found: {input_path}") + return False + + try: + # Load the dataset + df = pd.read_parquet(input_path) + all_columns = list(df.columns) + + print(f"Processing {input_path}...") + print(f" - Shape: {df.shape}") + print(f" - Columns: {len(all_columns)}") + + # Load baseline schema for comparison + baseline_columns = set() + if baseline_schema_path: + baseline_columns = load_baseline_schema(baseline_schema_path) + + # Identify new columns + current_columns = set(all_columns) + new_columns = current_columns - baseline_columns if baseline_columns else set() + + # 
Auto-categorize all columns + categories = {} + new_features_by_category = {} + + for col in all_columns: + category = categorize_column_by_name(col) + + if category not in categories: + categories[category] = {"count": 0, "features": []} + new_features_by_category[category] = [] + + categories[category]["features"].append(col) + categories[category]["count"] += 1 + + # Track if it's a new feature + if col in new_columns: + new_features_by_category[category].append(col) + + # Clean up empty new feature lists + new_features_by_category = {k: v for k, v in new_features_by_category.items() if v} + + # Basic dataset stats + ts_col = df["interval_timestamp"] if "interval_timestamp" in df.columns else df.iloc[:, 0] + if pd.api.types.is_datetime64_any_dtype(ts_col): + start_ts = int(ts_col.min().timestamp() * 1000) + end_ts = int(ts_col.max().timestamp() * 1000) + else: + start_ts = int(ts_col.min()) + end_ts = int(ts_col.max()) + + memory_mb = df.memory_usage(deep=True).sum() / 1024**2 + + # Data quality + missing = df.isna().sum().to_dict() + total_cells = df.size + non_missing = int(df.notna().sum().sum()) + completeness_pct = (non_missing / total_cells) * 100 + avg_dq_score = df.get("data_quality_score", pd.Series(dtype=float)).mean() + + # Detect asset type and get specific stats + asset_type = detect_asset_type(df, all_columns) + asset_stats = get_asset_specific_stats(df, asset_type, all_columns) + + # Build the report + report = { + "generated_at_utc": datetime.utcnow().isoformat() + "Z", + "report_type": report_type, + "schema_version": "unified_v1.0", + "source_file": os.path.basename(input_path), + "dataset_info": { + "shape": list(df.shape), + "memory_usage_mb": round(memory_mb, 2), + "time_range": {"start": start_ts, "end": end_ts}, + "total_columns": len(all_columns), + "total_categories": len(categories), + "new_columns_count": len(new_columns), + **asset_stats + }, + "feature_categories": categories, + "data_quality": { + "overall_completeness_pct": round(completeness_pct, 2), + "missing_values_by_column": missing, + "average_data_quality_score": None if pd.isna(avg_dq_score) else round(avg_dq_score, 4) + } + } + + # Add new features section if any exist + if new_columns: + report["new_features"] = { + "total_new_features": len(new_columns), + "new_features_by_category": new_features_by_category, + "all_new_features": sorted(list(new_columns)) + } + + # Add baseline comparison if available + if baseline_columns: + removed_columns = baseline_columns - current_columns + if removed_columns: + report["removed_features"] = sorted(list(removed_columns)) + + # Ensure output directory exists + os.makedirs(os.path.dirname(output_path), exist_ok=True) + + # Write report + with open(output_path, "w") as f: + json.dump(report, f, indent=2) + + print(f" Report generated: {output_path}") + print(f" - {len(categories)} categories") + if new_columns: + print(f" - {len(new_columns)} new features detected") + + return True + + except Exception as e: + print(f" Error processing {input_path}: {str(e)}") + return False + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + + # Input files + parser.add_argument("--merged-input", default="data/merged/features/merged_features.parquet", help="Path to merged_features.parquet") + parser.add_argument("--crypto-input", default="data/merged/features/crypto_features.parquet", help="Path to crypto_features.parquet") + parser.add_argument("--stocks-input", default="data/merged/features/stocks_features.parquet", help="Path to 
stocks_features.parquet") + + # Output settings + parser.add_argument("--output-dir", default="data/merged/features/", help="Output directory for reports") + parser.add_argument("--baseline-schema", default="schemas/baseline.json", help="Path to baseline schema JSON") + + # Convenience flags + parser.add_argument("--generate-all", action="store_true", help="Generate all reports using default paths") + + args = parser.parse_args() + + # Default paths for --generate-all + if args.generate_all: + default_paths = { + "merged": "data/merged/features/merged_features.parquet", + "crypto": "data/merged/features/crypto_features.parquet", + "stocks": "data/merged/features/stocks_features.parquet" + } + + print("Generating all feature reports...") + success_count = 0 + + for report_type, input_path in default_paths.items(): + output_dir = args.output_dir if args.output_dir else "data/merged/features/" + output_path = os.path.join(output_dir, f"{report_type}_report.json") + baseline_path = args.baseline_schema if args.baseline_schema else f"schemas/{report_type}_baseline.json" + + if generate_report(input_path, output_path, baseline_path, report_type): + success_count += 1 + + print(f"\nGenerated {success_count}/3 reports successfully!") + + # Update baseline schema with merged features if it exists + if args.baseline_schema and os.path.exists(default_paths["merged"]): + df = pd.read_parquet(default_paths["merged"]) + save_baseline_schema(list(df.columns), args.baseline_schema) + print(f"Updated baseline schema: {args.baseline_schema}") + + return + + # Individual file processing + reports_generated = 0 + + if args.merged_input: + output_dir = args.output_dir if args.output_dir else "data/merged/features/" + output_path = os.path.join(output_dir, "merged_report.json") + if generate_report(args.merged_input, output_path, args.baseline_schema, "merged"): + reports_generated += 1 + + if args.crypto_input: + output_dir = args.output_dir if args.output_dir else "data/merged/features/" + output_path = os.path.join(output_dir, "crypto_report.json") + if generate_report(args.crypto_input, output_path, args.baseline_schema, "crypto"): + reports_generated += 1 + # Print crypto count and data quality + try: + with open(output_path, "r") as f: + report = json.load(f) + count = report.get("dataset_info", {}).get("shape", [None])[0] + dq = report.get("data_quality", {}).get("overall_completeness_pct", None) + print(f"[CRYPTO] Count: {count}, Data Quality: {dq}%") + except Exception as e: + print(f"[CRYPTO] Error reading report for stats: {e}") + + if args.stocks_input: + output_dir = args.output_dir if args.output_dir else "data/merged/features/" + output_path = os.path.join(output_dir, "stocks_report.json") + if generate_report(args.stocks_input, output_path, args.baseline_schema, "stocks"): + reports_generated += 1 + # Print stocks count and data quality + try: + with open(output_path, "r") as f: + report = json.load(f) + count = report.get("dataset_info", {}).get("shape", [None])[0] + dq = report.get("data_quality", {}).get("overall_completeness_pct", None) + print(f"[STOCKS] Count: {count}, Data Quality: {dq}%") + except Exception as e: + print(f"[STOCKS] Error reading report for stats: {e}") + + if reports_generated == 0: + print("No input files specified. 
Use --generate-all or specify input files.") + parser.print_help() + else: + print(f"\nGenerated {reports_generated} report(s) successfully!") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/main.py b/src/merge/main.py new file mode 100644 index 0000000000000000000000000000000000000000..4dc86aafdf08041b3caf4e8440c2e17cd9ac2720 --- /dev/null +++ b/src/merge/main.py @@ -0,0 +1,259 @@ +import subprocess +from pathlib import Path +import sys +import pandas as pd +from datetime import datetime, timedelta +from dotenv import load_dotenv +import gc +import psutil +import os + +# Memory optimization for merge operations +class MergeMemoryOptimizer: + """Memory optimizer for merge operations""" + + def __init__(self, max_memory_mb=350): + self.max_memory_mb = max_memory_mb + self.process = psutil.Process() + + def get_memory_usage(self): + return self.process.memory_info().rss / 1024 / 1024 + + def cleanup_after_script(self, script_name): + collected = gc.collect() + memory_after = self.get_memory_usage() + print(f"[MemOpt] After {script_name}: {memory_after:.1f}MB (freed {collected} objects)") + + if memory_after > self.max_memory_mb: + print(f"[MemOpt] WARNING: High memory after {script_name}") + # Additional cleanup attempt + gc.collect() + + return memory_after + +# Global memory optimizer instance +memory_optimizer = MergeMemoryOptimizer() + +DAYS_OLD = 7 +MERGED_PATH = Path("data/merged/features/merged_features.parquet") +ARCHIVE_DIR = Path("data/merged/archive") +ARCHIVE_DIR.mkdir(parents=True, exist_ok=True) + +def run_script(script, args=None): + cmd = [sys.executable, str(Path(__file__).parent / script)] + if args: + cmd += args + print(f"Running: {' '.join(cmd)}") + + # Check memory before running + memory_before = memory_optimizer.get_memory_usage() + print(f"[MemOpt] Before {script}: {memory_before:.1f}MB") + + result = subprocess.run(cmd, check=True) + + # Cleanup after running + memory_optimizer.cleanup_after_script(script) + + return result + +def archive_old_records(): + feature_files = [ + Path("data/merged/features/crypto_features.parquet"), + Path("data/merged/features/stocks_features.parquet") + ] + now = datetime.utcnow() + cutoff = int((now - timedelta(days=DAYS_OLD)).timestamp() * 1000) + + for feature_path in feature_files: + if not feature_path.exists(): + print(f"[WARN] {feature_path} does not exist.") + continue + + df = pd.read_parquet(feature_path) + old = df.loc[df['interval_timestamp'] < cutoff].copy() + keep = df.loc[df['interval_timestamp'] >= cutoff].copy() + + if old.empty: + print(f"[INFO] No records to archive in {feature_path}.") + continue + + # Group by day (UTC) and write each group to a separate parquet file under archive/{day}/ + old['archive_date'] = pd.to_datetime(old['interval_timestamp'], unit='ms').dt.strftime('%Y%m%d') + for day, group in old.groupby('archive_date'): + day_dir = ARCHIVE_DIR / day + day_dir.mkdir(parents=True, exist_ok=True) + out_path = day_dir / f"{feature_path.stem}_archived_{day}.parquet" + if out_path.exists(): + existing = pd.read_parquet(out_path) + group = pd.concat([existing, group.drop(columns=['archive_date'])], ignore_index=True) + else: + group = group.drop(columns=['archive_date']) + + group.to_parquet(out_path, index=False) + print(f"[ARCHIVE] {len(group)} records -> {out_path}") + + # Save the remaining (unarchived) records back to the feature file + keep.to_parquet(feature_path, index=False) + print(f"[INFO] Archived {len(old)} records from {feature_path}. 
{len(keep)} remain.") + +def store_in_cloud(): + # Import StorageHandler from cloud_utils, ensuring src is in sys.path + import os + import sys + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))) + from data_cloud.cloud_utils import StorageHandler + + # Filebase credentials from env + load_dotenv() + endpoint_url = os.getenv("FILEBASE_ENDPOINT") + access_key = os.getenv("FILEBASE_ACCESS_KEY") + secret_key = os.getenv("FILEBASE_SECRET_KEY") + bucket_name = os.getenv("FILEBASE_BUCKET") + if not all([endpoint_url, access_key, secret_key, bucket_name]): + print("[ERROR] Filebase credentials not set in environment.") + return + + storage = StorageHandler(endpoint_url, access_key, secret_key, bucket_name) + + merged_dir = os.path.join("data", "merged") + archive_dir = os.path.join(merged_dir, "archive") + # Upload all files in merged except archive + for root, dirs, files in os.walk(merged_dir): + # Skip archive subdir for now + if os.path.abspath(root) == os.path.abspath(archive_dir): + continue + for fname in files: + local_path = os.path.join(root, fname) + rel_path = os.path.relpath(local_path, "data") + key = rel_path.replace(os.sep, "/") + with open(local_path, "rb") as f: + data = f.read() + storage.upload(key, data) + + # Only upload archive files newer than DAYS_OLD days + import time + cutoff = time.time() - DAYS_OLD * 86400 + if os.path.exists(archive_dir): + for fname in os.listdir(archive_dir): + local_path = os.path.join(archive_dir, fname) + if not os.path.isfile(local_path): + continue + mtime = os.path.getmtime(local_path) + if mtime >= cutoff: + rel_path = os.path.relpath(local_path, "data") + key = rel_path.replace(os.sep, "/") + with open(local_path, "rb") as f: + data = f.read() + storage.upload(key, data) + +# Save stocks and crypto features to data/merged/raw +def save_raw_features(): + import shutil + raw_dir = Path('data/merged/raw') + raw_dir.mkdir(parents=True, exist_ok=True) + src_stocks = Path('data/merged/features/stocks_features.parquet') + src_crypto = Path('data/merged/features/crypto_features.parquet') + dst_stocks = raw_dir / 'stocks_features.parquet' + dst_crypto = raw_dir / 'crypto_features.parquet' + if src_stocks.exists(): + shutil.copy2(src_stocks, dst_stocks) + print(f"[RAW] Saved stocks features to {dst_stocks}") + else: + print(f"[RAW] Source stocks features not found: {src_stocks}") + if src_crypto.exists(): + shutil.copy2(src_crypto, dst_crypto) + print(f"[RAW] Saved crypto features to {dst_crypto}") + else: + print(f"[RAW] Source crypto features not found: {src_crypto}") + +def main(): + print("[MergeOpt] Starting memory-optimized merge pipeline...") + initial_memory = memory_optimizer.get_memory_usage() + print(f"[MergeOpt] Initial memory: {initial_memory:.1f}MB") + + # Run all merge steps with memory monitoring + run_script('merge_0.py') + run_script('merge_1.py', [ + '--latest', 'data/advisorai-data/features/latest_features.parquet', + '--finnhub', 'data/advisorai-data/features/latest_features.parquet', + '--out', 'data/merged/features/merged_features.parquet' + ]) + run_script('merge_2.py') + run_script('merge_3.py') + run_script('merge_4.py') + run_script('separator.py') + run_script('merge_5.py') + run_script('merge_6.py') + run_script('merge_7.py') + + save_raw_features() + + # Extract symbols from exchange symbol data before data fillers + try: + run_script('extract_symbols.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Symbol extraction failed: {e}") + + # Remove rows with 
null symbols after symbol extraction + try: + run_script('remove_null_symbols.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Null symbol removal failed: {e}") + + # # Run normalization scripts with error handling + # run_script('stocks_data_filler.py') + + # try: + # run_script('crypto_data_filler.py') + # except subprocess.CalledProcessError as e: + # print(f"[WARNING] Crypto data filler failed: {e}") + + # Merge temp files into merged - with error handling + try: + run_script('merge_temp.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Merge temp failed: {e}") + + try: + run_script('merge_sant.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Santiment merge failed: {e}") + + try: + run_script('merge_santiment_with_crypto.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Santiment-crypto merge failed: {e}") + + # # Final comprehensive null value handling - clean up any remaining nulls + # try: + # run_script('run_final_null_handling.py') + # except subprocess.CalledProcessError as e: + # print(f"[WARNING] Final null handling failed: {e}") + + # # Normalize features + # run_script('normalize.py') + # # Normalize train files for both crypto and stocks + # run_script('norm/crypto.py', ['--train']) + # run_script('norm/stocks.py', ['--train']) + + # Archive old records + archive_old_records() + + # Generate and store full report + run_script('full_report.py') + + # Store all merged data in cloud + store_in_cloud() + + # Final memory check + final_memory = memory_optimizer.get_memory_usage() + print(f"[MergeOpt] Final memory usage: {final_memory:.1f}MB") + + if final_memory > 400: + print("[MergeOpt] WARNING: High final memory usage") + memory_optimizer.cleanup_after_script("final cleanup") + + print("[OK] All merge steps, null handling, normalization, and reporting completed successfully.") + +if __name__ == "__main__": + main() diff --git a/src/merge/main_memory_optimized.py b/src/merge/main_memory_optimized.py new file mode 100644 index 0000000000000000000000000000000000000000..7be4ffe5a3b57e82a414a3a53a5ec77220bfcb32 --- /dev/null +++ b/src/merge/main_memory_optimized.py @@ -0,0 +1,85 @@ +""" +Memory-Optimized Merge Wrapper +Wraps the main merge function with memory monitoring and cleanup +""" + +import gc +import os +import sys +import psutil +from pathlib import Path + +class MergeMemoryOptimizer: + """Memory optimizer specifically for merge operations""" + + def __init__(self, max_memory_mb=350): # Conservative limit for merge operations + self.max_memory_mb = max_memory_mb + self.process = psutil.Process() + + def get_memory_usage(self): + """Get current memory usage in MB""" + return self.process.memory_info().rss / 1024 / 1024 + + def cleanup_after_script(self, script_name): + """Cleanup after running a merge script""" + # Force garbage collection + collected = gc.collect() + + # Clear any cached modules + modules_to_clear = [m for m in sys.modules.keys() if 'merge' in m or 'pandas' in m] + for module in modules_to_clear: + if module in sys.modules and module != __name__: + try: + del sys.modules[module] + except: + pass + + memory_after = self.get_memory_usage() + print(f"[MemOpt] After {script_name}: {memory_after:.1f}MB (freed {collected} objects)") + + if memory_after > self.max_memory_mb: + print(f"[MemOpt] WARNING: High memory after {script_name}: {memory_after:.1f}MB") + + return memory_after + +# Import the original main function +def main(): + """Memory-optimized wrapper for merge main""" + 
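The wrapper leans on psutil's resident-set-size reading plus an explicit garbage-collection pass; here is a minimal standalone sketch of that measurement pattern (helper names are illustrative, and the 350 MB threshold mirrors the default used above).

import gc
import psutil

def rss_mb() -> float:
    # Resident set size of the current process, in megabytes.
    return psutil.Process().memory_info().rss / 1024 / 1024

def checkpoint(label: str, limit_mb: float = 350.0) -> float:
    freed = gc.collect()  # force a full collection pass before measuring
    usage = rss_mb()
    print(f"[MemOpt] after {label}: {usage:.1f}MB (freed {freed} objects)")
    if usage > limit_mb:
        print(f"[MemOpt] WARNING: above the {limit_mb:.0f}MB budget")
    return usage

checkpoint("demo step")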
optimizer = MergeMemoryOptimizer() + + initial_memory = optimizer.get_memory_usage() + print(f"[MergeOpt] Starting merge operations - Memory: {initial_memory:.1f}MB") + + try: + # Import and run the original main function + from merge.main_original import main as original_main + + # Monitor memory during execution + result = original_main() + + # Final cleanup + final_memory = optimizer.cleanup_after_script("all merge operations") + print(f"[MergeOpt] Final merge memory: {final_memory:.1f}MB") + + return result + + except ImportError: + # Fallback to current main if original doesn't exist + print("[MergeOpt] No original main found, running current implementation...") + + # Import the current implementation + import merge.main as current_main + result = current_main.main() + + # Cleanup + optimizer.cleanup_after_script("current merge implementation") + + return result + + except Exception as e: + print(f"[MergeOpt] Error in merge operations: {e}") + optimizer.cleanup_after_script("error cleanup") + raise + +if __name__ == "__main__": + main() diff --git a/src/merge/main_original.py b/src/merge/main_original.py new file mode 100644 index 0000000000000000000000000000000000000000..228786940689423a2661de9205459a3b485f99a0 --- /dev/null +++ b/src/merge/main_original.py @@ -0,0 +1,209 @@ +import subprocess +from pathlib import Path +import sys +import pandas as pd +from datetime import datetime, timedelta +from dotenv import load_dotenv + +DAYS_OLD = 7 +MERGED_PATH = Path("data/merged/features/merged_features.parquet") +ARCHIVE_DIR = Path("data/merged/archive") +ARCHIVE_DIR.mkdir(parents=True, exist_ok=True) + +def run_script(script, args=None): + cmd = [sys.executable, str(Path(__file__).parent / script)] + if args: + cmd += args + print(f"Running: {' '.join(cmd)}") + result = subprocess.run(cmd, check=True) + return result + +def archive_old_records(): + feature_files = [ + Path("data/merged/features/crypto_features.parquet"), + Path("data/merged/features/stocks_features.parquet") + ] + now = datetime.utcnow() + cutoff = int((now - timedelta(days=DAYS_OLD)).timestamp() * 1000) + + for feature_path in feature_files: + if not feature_path.exists(): + print(f"[WARN] {feature_path} does not exist.") + continue + + df = pd.read_parquet(feature_path) + old = df.loc[df['interval_timestamp'] < cutoff].copy() + keep = df.loc[df['interval_timestamp'] >= cutoff].copy() + + if old.empty: + print(f"[INFO] No records to archive in {feature_path}.") + continue + + # Group by day (UTC) and write each group to a separate parquet file under archive/{day}/ + old['archive_date'] = pd.to_datetime(old['interval_timestamp'], unit='ms').dt.strftime('%Y%m%d') + for day, group in old.groupby('archive_date'): + day_dir = ARCHIVE_DIR / day + day_dir.mkdir(parents=True, exist_ok=True) + out_path = day_dir / f"{feature_path.stem}_archived_{day}.parquet" + if out_path.exists(): + existing = pd.read_parquet(out_path) + group = pd.concat([existing, group.drop(columns=['archive_date'])], ignore_index=True) + else: + group = group.drop(columns=['archive_date']) + + group.to_parquet(out_path, index=False) + print(f"[ARCHIVE] {len(group)} records -> {out_path}") + + # Save the remaining (unarchived) records back to the feature file + keep.to_parquet(feature_path, index=False) + print(f"[INFO] Archived {len(old)} records from {feature_path}. 
{len(keep)} remain.") + +def store_in_cloud(): + # Import StorageHandler from cloud_utils, ensuring src is in sys.path + import os + import sys + sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'src'))) + from data_cloud.cloud_utils import StorageHandler + + # Filebase credentials from env + load_dotenv() + endpoint_url = os.getenv("FILEBASE_ENDPOINT") + access_key = os.getenv("FILEBASE_ACCESS_KEY") + secret_key = os.getenv("FILEBASE_SECRET_KEY") + bucket_name = os.getenv("FILEBASE_BUCKET") + if not all([endpoint_url, access_key, secret_key, bucket_name]): + print("[ERROR] Filebase credentials not set in environment.") + return + + storage = StorageHandler(endpoint_url, access_key, secret_key, bucket_name) + + merged_dir = os.path.join("data", "merged") + archive_dir = os.path.join(merged_dir, "archive") + # Upload all files in merged except archive + for root, dirs, files in os.walk(merged_dir): + # Skip archive subdir for now + if os.path.abspath(root) == os.path.abspath(archive_dir): + continue + for fname in files: + local_path = os.path.join(root, fname) + rel_path = os.path.relpath(local_path, "data") + key = rel_path.replace(os.sep, "/") + with open(local_path, "rb") as f: + data = f.read() + storage.upload(key, data) + + # Only upload archive files newer than DAYS_OLD days + import time + cutoff = time.time() - DAYS_OLD * 86400 + if os.path.exists(archive_dir): + for fname in os.listdir(archive_dir): + local_path = os.path.join(archive_dir, fname) + if not os.path.isfile(local_path): + continue + mtime = os.path.getmtime(local_path) + if mtime >= cutoff: + rel_path = os.path.relpath(local_path, "data") + key = rel_path.replace(os.sep, "/") + with open(local_path, "rb") as f: + data = f.read() + storage.upload(key, data) + +# Save stocks and crypto features to data/merged/raw +def save_raw_features(): + import shutil + raw_dir = Path('data/merged/raw') + raw_dir.mkdir(parents=True, exist_ok=True) + src_stocks = Path('data/merged/features/stocks_features.parquet') + src_crypto = Path('data/merged/features/crypto_features.parquet') + dst_stocks = raw_dir / 'stocks_features.parquet' + dst_crypto = raw_dir / 'crypto_features.parquet' + if src_stocks.exists(): + shutil.copy2(src_stocks, dst_stocks) + print(f"[RAW] Saved stocks features to {dst_stocks}") + else: + print(f"[RAW] Source stocks features not found: {src_stocks}") + if src_crypto.exists(): + shutil.copy2(src_crypto, dst_crypto) + print(f"[RAW] Saved crypto features to {dst_crypto}") + else: + print(f"[RAW] Source crypto features not found: {src_crypto}") + +def main(): + # Run all merge steps + run_script('merge_0.py') + run_script('merge_1.py', [ + '--latest', 'data/advisorai-data/features/latest_features.parquet', + '--finnhub', 'data/advisorai-data/features/latest_features.parquet', + '--out', 'data/merged/features/merged_features.parquet' + ]) + run_script('merge_2.py') + run_script('merge_3.py') + run_script('merge_4.py') + run_script('separator.py') + run_script('merge_5.py') + run_script('merge_6.py') + run_script('merge_7.py') + + save_raw_features() + + # Extract symbols from exchange symbol data before data fillers + try: + run_script('extract_symbols.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Symbol extraction failed: {e}") + + # Remove rows with null symbols after symbol extraction + try: + run_script('remove_null_symbols.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Null symbol removal failed: {e}") + + # # Run normalization 
scripts with error handling + # run_script('stocks_data_filler.py') + + # try: + # run_script('crypto_data_filler.py') + # except subprocess.CalledProcessError as e: + # print(f"[WARNING] Crypto data filler failed: {e}") + + # Merge temp files into merged - with error handling + try: + run_script('merge_temp.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Merge temp failed: {e}") + + try: + run_script('merge_sant.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Santiment merge failed: {e}") + + try: + run_script('merge_santiment_with_crypto.py') + except subprocess.CalledProcessError as e: + print(f"[WARNING] Santiment-crypto merge failed: {e}") + + # # Final comprehensive null value handling - clean up any remaining nulls + # try: + # run_script('run_final_null_handling.py') + # except subprocess.CalledProcessError as e: + # print(f"[WARNING] Final null handling failed: {e}") + + # # Normalize features + # run_script('normalize.py') + # # Normalize train files for both crypto and stocks + # run_script('norm/crypto.py', ['--train']) + # run_script('norm/stocks.py', ['--train']) + + # Archive old records + archive_old_records() + + # Generate and store full report + run_script('full_report.py') + + # Store all merged data in cloud + store_in_cloud() + + print("[OK] All merge steps, null handling, normalization, and reporting completed successfully.") + +if __name__ == "__main__": + main() diff --git a/src/merge/manual_null_handler.py b/src/merge/manual_null_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..f49eb16199312ca6481981529157100ddb1786a8 --- /dev/null +++ b/src/merge/manual_null_handler.py @@ -0,0 +1,135 @@ +#!/usr/bin/env python3 +""" +Manual Null Handler - Standalone script for manual execution +Use this when you need to handle null values without running the full pipeline +""" + +import argparse +import sys +from pathlib import Path +import pandas as pd +from final_null_handler import process_crypto_features_file, process_stock_features_file, process_merged_features_file +from run_final_null_handling import process_merged_features_file + +def main(): + parser = argparse.ArgumentParser(description='Handle null values in feature files') + parser.add_argument('--crypto', action='store_true', help='Process crypto features only') + parser.add_argument('--stocks', action='store_true', help='Process stock features only') + parser.add_argument('--merged', action='store_true', help='Process merged features only') + parser.add_argument('--all', action='store_true', help='Process all feature files') + parser.add_argument('--input', type=str, help='Input file path (overrides default paths)') + parser.add_argument('--output', type=str, help='Output file path (defaults to input path)') + parser.add_argument('--dry-run', action='store_true', help='Show what would be done without making changes') + + args = parser.parse_args() + + # Default paths + default_paths = { + 'crypto': Path("data/merged/features/crypto_features.parquet"), + 'stocks': Path("data/merged/features/stocks_features.parquet"), + 'merged': Path("data/merged/features/merged_features.parquet") + } + + if not any([args.crypto, args.stocks, args.merged, args.all, args.input]): + print("Error: Must specify --crypto, --stocks, --merged, --all, or --input") + parser.print_help() + return 1 + + files_to_process = [] + + if args.input: + # Custom input file + input_path = Path(args.input) + if not input_path.exists(): + print(f"Error: Input file {input_path} does 
not exist") + return 1 + + # Detect file type based on name or content + if 'crypto' in input_path.name.lower(): + file_type = 'crypto' + elif 'stock' in input_path.name.lower(): + file_type = 'stocks' + elif 'merged' in input_path.name.lower(): + file_type = 'merged' + else: + # Try to detect from content + try: + df_sample = pd.read_parquet(input_path, nrows=10) + if 'rank' in df_sample.columns or 'dominance' in df_sample.columns: + file_type = 'crypto' + elif 'strongBuy' in df_sample.columns or 'news_activity_score_x' in df_sample.columns: + file_type = 'stocks' + else: + file_type = 'merged' + except Exception: + file_type = 'merged' # Default + + output_path = Path(args.output) if args.output else input_path + files_to_process.append((input_path, output_path, file_type)) + + else: + # Use default paths based on flags + if args.all: + for file_type, path in default_paths.items(): + if path.exists(): + files_to_process.append((path, path, file_type)) + else: + if args.crypto and default_paths['crypto'].exists(): + files_to_process.append((default_paths['crypto'], default_paths['crypto'], 'crypto')) + if args.stocks and default_paths['stocks'].exists(): + files_to_process.append((default_paths['stocks'], default_paths['stocks'], 'stocks')) + if args.merged and default_paths['merged'].exists(): + files_to_process.append((default_paths['merged'], default_paths['merged'], 'merged')) + + if not files_to_process: + print("Error: No files found to process") + return 1 + + print("="*60) + print("MANUAL NULL VALUE HANDLER") + print("="*60) + + if args.dry_run: + print("DRY RUN MODE - No changes will be made") + print() + + for input_path, output_path, file_type in files_to_process: + print(f"\nProcessing: {input_path}") + print(f"Type: {file_type}") + print(f"Output: {output_path}") + + if args.dry_run: + try: + df = pd.read_parquet(input_path) + null_count = df.isnull().sum().sum() + print(f"Would process {len(df)} rows with {null_count} null values") + except Exception as e: + print(f"Error reading file: {e}") + continue + + try: + if file_type == 'crypto': + df_processed, report = process_crypto_features_file(input_path, output_path) + elif file_type == 'stocks': + df_processed, report = process_stock_features_file(input_path, output_path) + elif file_type == 'merged': + df_processed, report = process_merged_features_file(input_path) + + print(f"✅ Successfully processed {file_type} features:") + print(f" - Rows: {len(df_processed):,}") + print(f" - Nulls filled: {report['total_nulls_filled']:,}") + print(f" - Columns fixed: {report['columns_fixed']}") + + except Exception as e: + print(f"❌ Error processing {input_path}: {e}") + return 1 + + print("\n" + "="*60) + print("MANUAL NULL HANDLING COMPLETED") + print("="*60) + + return 0 + +if __name__ == "__main__": + exit_code = main() + sys.exit(exit_code) diff --git a/src/merge/merge_0.py b/src/merge/merge_0.py new file mode 100644 index 0000000000000000000000000000000000000000..1137edd8c0bac2fb4fb56e0d66a0f7e378c16e48 --- /dev/null +++ b/src/merge/merge_0.py @@ -0,0 +1,20 @@ +import os +import shutil +from pathlib import Path + +def step0_move_old_merged(): + """ + Move the old merged features file to data/merged/temp for later remerge and deletion. 
+ """ + merged_dir = Path("data/merged/features") + temp_dir = Path("data/merged/temp") + temp_dir.mkdir(parents=True, exist_ok=True) + # Move all files from merged_dir to temp_dir + for f in merged_dir.glob("*"): + if f.is_file(): + dest = temp_dir / f.name + print(f"[INFO] Moving {f} -> {dest}") + shutil.move(str(f), str(dest)) + +if __name__ == "__main__": + step0_move_old_merged() diff --git a/src/merge/merge_1.py b/src/merge/merge_1.py new file mode 100644 index 0000000000000000000000000000000000000000..6b74c46b44687e8573fdc13e91c4becc64d554c5 --- /dev/null +++ b/src/merge/merge_1.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +""" +Simple merge step 1: Copy latest features to merged features. +This creates the initial merged_features.parquet file for the pipeline. +""" + +import shutil +from pathlib import Path + +def main(): + """Copy latest features to merged features directory.""" + + # Source file + source_path = Path("data/advisorai-data/features/latest_features.parquet") + + # Destination file + dest_path = Path("data/merged/features/merged_features.parquet") + + # Create destination directory if it doesn't exist + dest_path.parent.mkdir(parents=True, exist_ok=True) + + # Check if source file exists + if not source_path.exists(): + raise FileNotFoundError(f"Source file not found: {source_path}") + + # Copy the file + shutil.copy2(source_path, dest_path) + + # Read and report basic info + import pandas as pd + df = pd.read_parquet(dest_path) + + print(f"OK wrote {dest_path} -> {len(df)} rows x {len(df.columns)} cols") + +if __name__ == "__main__": + main() diff --git a/src/merge/merge_2.py b/src/merge/merge_2.py new file mode 100644 index 0000000000000000000000000000000000000000..9df753f2ce82fed18df588a64008314e839735b8 --- /dev/null +++ b/src/merge/merge_2.py @@ -0,0 +1,233 @@ +""" +Merge your features JSON with coin-metadata JSON, or merge a crypto-bubbles +Parquet into your merged-features Parquet. + +Usage: + # JSON mode (default): + python merge_2.py json \ + --features data/merged/features/merged_features.json \ + --coininfo data/coininfo/coin_metadata.json \ + --out merged_with_coininfo.ndjson + + # Parquet mode: + python merge_2.py parquet \ + --base data/merged/features/merged_features.parquet \ + --bubbles data/crypto-bubbles/crypto_bubbles_2025-07-15.parquet \ + --out data/merged/features/merged_features.parquet +""" +import json +import pandas as pd +from datetime import datetime +from pathlib import Path +import argparse + +def merge_parquet_features(base_fp: Path, bubbles_fp: Path, out_fp: Path): + """ + Merge crypto bubbles Parquet into merged features Parquet. + For overlapping columns, non-null bubbles values overwrite base. + New columns from bubbles are added. 
+ """ + + import time + base = pd.read_parquet(base_fp) + bubbles = pd.read_parquet(bubbles_fp) + + # Fill missing interval_timestamp with current UTC ms, ensure int (ms) robustly + now_ms = int(time.time() * 1000) + def to_millis(val): + if pd.isna(val): + return pd.NA + if isinstance(val, (pd.Timestamp, datetime)): + return val.value // 1_000_000 + try: + return int(float(val)) + except (ValueError, TypeError): + try: + return int(pd.to_datetime(val).value // 1_000_000) + except Exception: + return pd.NA + + for df in (base, bubbles): + if 'interval_timestamp' in df.columns: + df['interval_timestamp'] = df['interval_timestamp'].fillna(now_ms) + df['interval_timestamp'] = df['interval_timestamp'].map(to_millis).astype('Int64') + + # Rename 'slug' in bubbles to 'symbol' for join, if needed + bubbles_renamed = bubbles.rename(columns={"slug": "symbol"}) if "slug" in bubbles.columns else bubbles + # Remove duplicate columns, keep first occurrence + bubbles_renamed = bubbles_renamed.loc[:, ~bubbles_renamed.columns.duplicated()] + + # Use 'symbol' and 'interval_timestamp' as join keys + keys = [k for k in ["symbol", "interval_timestamp"] if k in base.columns and k in bubbles_renamed.columns] + if not all(k in base.columns for k in keys) or not all(k in bubbles_renamed.columns for k in keys): + raise ValueError("No common key columns found for merge (need 'symbol' and 'interval_timestamp').") + + # Normalize symbol column in both DataFrames for robust merging + def normalize_symbol_col(df): + df['symbol'] = df['symbol'].astype(str).str.lower() + # Map 'ripple' <-> 'xrp' both ways for robust merging + df['symbol'] = df['symbol'].replace({'ripple': 'xrp', 'xrp/ripple': 'xrp'}) + # Also add a step to map 'xrp' to 'ripple' for output if needed + df['symbol'] = df['symbol'].replace({'xrp': 'ripple'}) + return df + bubbles_renamed = normalize_symbol_col(bubbles_renamed) + base = normalize_symbol_col(base) + + # Pick top 50 by rank if present, else first 50 unique + if 'rank' in bubbles_renamed.columns: + sorted_bubbles = bubbles_renamed.sort_values('rank') + else: + sorted_bubbles = bubbles_renamed + top_50 = sorted_bubbles.drop_duplicates(subset='symbol').head(50) + + # Always include these must-have assets + must_have = {'xrp', 'ripple', 'solana','eth','btc','bitcoin','ethereum', 'sol', 'ada', 'cardano'} + extra = bubbles_renamed[bubbles_renamed['symbol'].isin(must_have)] + + # Combine and dedupe on available keys + dedup_cols = ['symbol'] + if 'interval_timestamp' in pd.concat([top_50, extra]).columns: + dedup_cols.append('interval_timestamp') + bubbles_renamed = pd.concat([top_50, extra]).drop_duplicates(subset=dedup_cols) + + base = base.set_index(keys) + bubbles_renamed = bubbles_renamed.set_index(keys) + + # Union of columns, with bubbles first so its columns take precedence + all_cols = list(dict.fromkeys(bubbles_renamed.columns.tolist() + base.columns.tolist())) + base = base.reindex(columns=all_cols) + bubbles_renamed = bubbles_renamed.reindex(columns=all_cols) + + merged = bubbles_renamed.combine_first(base).reset_index() + # Ensure 'symbol' column matches the index value for every row + if 'symbol' in merged.columns: + merged['symbol'] = merged['symbol'].astype(str) + # Always output 'ripple' instead of 'xrp' + merged['symbol'] = merged['symbol'].replace({'xrp': 'ripple'}) + + # Ensure interval_timestamp is never null in the output and is int (ms), robustly + if 'interval_timestamp' in merged.columns: + merged['interval_timestamp'] = merged['interval_timestamp'].fillna(now_ms) + 
merged['interval_timestamp'] = merged['interval_timestamp'].map(to_millis).astype('Int64') + + # Set is_crypto=1 where is_crypto is null or symbol is 'solana' + if 'is_crypto' in merged.columns: + merged['is_crypto'] = merged['is_crypto'].fillna(1) + if 'symbol' in merged.columns: + merged.loc[merged['symbol'].str.lower() == 'solana', 'is_crypto'] = 1 + + # Drop unwanted columns + for col in ['id', 'name', 'image']: + if col in merged.columns: + merged = merged.drop(columns=col) + + merged.to_parquet(out_fp, index=False) + print(f"OK Merged top 50 from {bubbles_fp} into {base_fp} -> {out_fp} " + f"({merged.shape[0]} rows x {merged.shape[1]} cols)") + + +def load_json_records(path: Path): + """ + Load a JSON file that is either: + - A single JSON object, + - A list of objects, + - Or NDJSON (one JSON object per line). + Returns: List[dict] + """ + text = path.read_text(encoding="utf8") + try: + data = json.loads(text) + except json.JSONDecodeError: + data = [json.loads(line) for line in text.splitlines() if line.strip()] + if isinstance(data, dict): + data = [data] + return data + + +def main_json_merge(features_fp: Path, coininfo_fp: Path, out_fp: Path): + # 1) load features + feats = load_json_records(features_fp) + df_feats = pd.json_normalize(feats) + + # 2) load coin metadata + coins = load_json_records(coininfo_fp) + df_coins = pd.json_normalize(coins) + + # 3) prepare a normalized join key + df_feats["join_key"] = df_feats["symbol"] + df_coins["join_key"] = df_coins["slug"].str.lower() + + # 4) merge + df_merged = df_feats.merge( + df_coins, + on="join_key", + how="left", + suffixes=("", "_meta") + ) + + # 5) clean up + df_merged = df_merged.drop(columns=["join_key"]) + if "symbol_meta" in df_merged.columns: + df_merged = df_merged.drop(columns=["symbol_meta"]) + + # 6) write out as NDJSON + out_fp.parent.mkdir(parents=True, exist_ok=True) + with open(out_fp, "w", encoding="utf8") as f: + for rec in df_merged.to_dict(orient="records"): + f.write(json.dumps(rec) + "\n") + + print(f"✅ Wrote {len(df_merged)} merged records to {out_fp}") + + +def cli(): + p = argparse.ArgumentParser(__doc__) + sub = p.add_subparsers(dest="mode", required=False) + + # JSON merge mode (default) + js = sub.add_parser("json", help="Merge features JSON with coininfo JSON") + js.add_argument("--features", type=Path, + default=Path("data/merged/features/merged_features.json"), + help="Path to merged_features JSON/NDJSON") + js.add_argument("--coininfo", type=Path, + default=Path("data/coininfo/coin_metadata.json"), + help="Path to coin-metadata JSON/NDJSON") + js.add_argument("--out", type=Path, + default=Path("merged_with_coininfo.ndjson"), + help="Where to write the merged NDJSON") + + # Parquet merge mode + pq = sub.add_parser("parquet", help="Merge crypto bubbles Parquet into merged features Parquet") + pq.add_argument("--base", type=Path, + default=Path("data/merged/features/merged_features.parquet"), + help="Path to base merged-features Parquet") + pq.add_argument("--bubbles", type=Path, + default=None, + help="Path to crypto bubbles Parquet (if not set, will use latest in data/crypto-bubbles/)") + pq.add_argument("--out", type=Path, + default=Path("data/merged/features/merged_features.parquet"), + help="Where to write the merged Parquet") + + args = p.parse_args() + # If no subcommand is given, default to 'parquet' and reparse + if args.mode is None: + import sys + sys.argv.insert(1, "parquet") + args = p.parse_args() + + # If bubbles is not provided, find the latest crypto_bubbles_*.parquet + if 
args.mode == "parquet": + if args.bubbles is None or not args.bubbles.exists(): + import glob + import os + bubble_files = glob.glob(os.path.join("data", "crypto-bubbles", "crypto_bubbles_*.parquet")) + if not bubble_files: + raise FileNotFoundError("No crypto_bubbles_*.parquet files found in data/crypto-bubbles/") + latest_bubble = max(bubble_files, key=os.path.getmtime) + print(f"[INFO] Using latest bubbles file: {latest_bubble}") + args.bubbles = Path(latest_bubble) + merge_parquet_features(args.base, args.bubbles, args.out) + else: + main_json_merge(args.features, args.coininfo, args.out) + +if __name__ == "__main__": + cli() diff --git a/src/merge/merge_3.py b/src/merge/merge_3.py new file mode 100644 index 0000000000000000000000000000000000000000..1046e94fdd2289ff155bfdaf65436dcd3f9e27d3 --- /dev/null +++ b/src/merge/merge_3.py @@ -0,0 +1,372 @@ +import sys +import os +import numpy as np +import pandas as pd +from datetime import datetime + +# Ensure src/merge is in the path for import +sys.path.append(os.path.dirname(__file__)) + +from alpaca_features import build_features, save + +def create_symbol_mapping(): + """ + Create mapping between crypto full names and ticker symbols. + """ + # Common crypto symbol mappings + crypto_mapping = { + # Major cryptocurrencies + 'bitcoin': 'BTC', + 'ethereum': 'ETH', + 'binancecoin': 'BNB', + 'ripple': 'XRP', + 'cardano': 'ADA', + 'solana': 'SOL', + 'dogecoin': 'DOGE', + 'polkadot': 'DOT', + 'matic-network': 'MATIC', + 'polygon': 'MATIC', + 'avalanche-2': 'AVAX', + 'avalanche': 'AVAX', + 'chainlink': 'LINK', + 'litecoin': 'LTC', + 'bitcoin-cash': 'BCH', + 'stellar': 'XLM', + 'vechain': 'VET', + 'ethereum-classic': 'ETC', + 'filecoin': 'FIL', + 'tron': 'TRX', + 'monero': 'XMR', + 'eos': 'EOS', + 'aave': 'AAVE', + 'maker': 'MKR', + 'compound': 'COMP', + 'uniswap': 'UNI', + 'yearn-finance': 'YFI', + 'sushi': 'SUSHI', + 'curve-dao-token': 'CRV', + 'pancakeswap-token': 'CAKE', + 'terra-luna': 'LUNA', + 'fantom': 'FTM', + 'harmony': 'ONE', + 'near': 'NEAR', + 'algorand': 'ALGO', + 'cosmos': 'ATOM', + 'internet-computer': 'ICP', + 'helium': 'HNT', + 'theta-token': 'THETA', + 'chiliz': 'CHZ', + 'decentraland': 'MANA', + 'the-sandbox': 'SAND', + 'axie-infinity': 'AXS', + 'shiba-inu': 'SHIB', + 'apecoin': 'APE', + 'gala': 'GALA', + 'enjincoin': 'ENJ', + 'flow': 'FLOW', + 'basic-attention-token': 'BAT', + 'omg': 'OMG', + 'loopring': 'LRC', + 'immutable-x': 'IMX', + 'render-token': 'RNDR', + 'quant-network': 'QNT', + 'injective-protocol': 'INJ', + 'sei-network': 'SEI', + 'arbitrum': 'ARB', + 'optimism': 'OP', + 'blur': 'BLUR', + 'pepe': 'PEPE', + 'bonk': 'BONK', + 'wormhole': 'W', + 'jupiter-exchange-solana': 'JUP', + 'worldcoin-wld': 'WLD', + 'pyth-network': 'PYTH', + 'jito': 'JTO', + 'tensor': 'TNSR', + 'meme': 'MEME', + 'cat-in-a-dogs-world': 'MEW', + 'book-of-meme': 'BOME', + 'dogwifhat': 'WIF', + 'popcat': 'POPCAT', + 'goatseus-maximus': 'GOAT', + 'peanut-the-squirrel': 'PNUT', + 'act-i-the-ai-prophecy': 'ACT', + 'fartcoin': 'FARTCOIN', + 'ai16z': 'AI16Z', + 'virtual-protocol': 'VIRTUAL', + 'zerebro': 'ZEREBRO', + 'griffain': 'GRIFFAIN', + 'aixbt-by-virtuals': 'AIXBT', + 'marc-and-ethan-are-based': 'BASED', + 'pudgy-penguins': 'PENGU', + 'hyperliquid': 'HYPE', + 'move-movement': 'MOVE', + 'usual': 'USUAL', + 'reserve-rights': 'RSR', + 'ondo-finance': 'ONDO', + 'ethena': 'ENA', + 'eigenlayer': 'EIGEN', + 'grass': 'GRASS', + 'io': 'IO', + 'notcoin': 'NOT', + 'turbo': 'TURBO', + 'jasmy': 'JASMY', + 'neo': 'NEO', + 'iota': 'IOTA', + 'dash': 'DASH', + 
'zcash': 'ZEC', + 'waves': 'WAVES', + } + + # Create reverse mapping (ticker -> full name) + reverse_mapping = {v.lower(): k for k, v in crypto_mapping.items()} + + # Also add the forward mapping (full name -> ticker) + forward_mapping = {k: v.lower() for k, v in crypto_mapping.items()} + + return crypto_mapping, reverse_mapping, forward_mapping + +def normalize_symbols(df, symbol_col, is_alpaca=False): + """ + Normalize symbols to handle crypto name/ticker differences and stock symbols. + """ + df = df.copy() + crypto_mapping, reverse_mapping, forward_mapping = create_symbol_mapping() + + # Convert to lowercase for consistency + df[symbol_col] = df[symbol_col].str.lower() + + if is_alpaca: + # Alpaca uses tickers (BTC, ETH, etc. for crypto, NVDA, AAPL, etc. for stocks) + # For crypto: Map tickers to full names to match merged data + # For stocks: Keep the ticker symbol as-is (in lowercase) + + def map_alpaca_symbol(symbol): + symbol_lower = symbol.lower() + + # Check if it's a crypto ticker that needs mapping + if symbol_lower in reverse_mapping: + return reverse_mapping[symbol_lower] + else: + # It's likely a stock symbol, keep as-is (lowercase) + return symbol_lower + + df[symbol_col] = df[symbol_col].apply(map_alpaca_symbol) + else: + # Merged data uses full names for crypto (bitcoin, ethereum, etc.) + # and should use lowercase tickers for stocks (nvda, aapl, etc.) + # Keep as is, but ensure lowercase + pass + + return df + +def merge_alpaca_features(): + """ + Merge Alpaca features with existing merged features. + Handles timestamp alignment, column conflicts, and symbol mapping. + """ + + # Step 1: Create Alpaca features + alpaca_df = build_features() + save(alpaca_df) + + # Step 2: Load merged features + try: + from src import config as app_config + base_dir = app_config.DATA_DIR + except Exception: + base_dir = os.environ.get("DATA_DIR", "/data") + merged_path = os.path.join(base_dir, "merged", "features", "merged_features.parquet") + + merged_df = pd.read_parquet(merged_path) + + # Normalize symbols + alpaca_df_normalized = normalize_symbols(alpaca_df, "symbol", is_alpaca=True) + merged_df_normalized = normalize_symbols(merged_df, "symbol", is_alpaca=False) + + # Find overlapping symbols + alpaca_normalized = set(alpaca_df_normalized["symbol"].unique()) + merged_normalized = set(merged_df_normalized["symbol"].unique()) + overlapping_symbols = alpaca_normalized.intersection(merged_normalized) + missing_in_merged = alpaca_normalized - merged_normalized + + + # Step 6: Handle symbols that exist only in Alpaca data + if missing_in_merged: + + new_symbol_rows = [] + for missing_symbol in missing_in_merged: + # Get actual data for this symbol from Alpaca + symbol_data = alpaca_df_normalized[alpaca_df_normalized["symbol"] == missing_symbol] + if len(symbol_data) == 0: + continue + + + # Create rows based on Alpaca timestamps, not merged timestamps + for _, alpaca_row in symbol_data.iterrows(): + new_row = { + "symbol": missing_symbol, + "interval_timestamp": alpaca_row["timestamp"], # Use Alpaca timestamp + "is_stock": True if missing_symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] else False, + "is_crypto": False if missing_symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] else True, + "stock_market": "NASDAQ" if missing_symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] else None, + "feature_timestamp": pd.Timestamp.now().value // 1000000, # Convert to milliseconds + } + + # Copy all Alpaca feature columns 
into the new row + for col in alpaca_row.index: + if col not in new_row: + new_row[col] = alpaca_row[col] + + # Add all other columns from merged_df with NaN values (except the ones we set above) + for col in merged_df_normalized.columns: + if col not in new_row: + new_row[col] = np.nan + + new_symbol_rows.append(new_row) + + if new_symbol_rows: + new_symbols_df = pd.DataFrame(new_symbol_rows) + merged_df_normalized = pd.concat([merged_df_normalized, new_symbols_df], ignore_index=True) + + # Step 7: Check for overlapping columns and handle them + join_keys = ["symbol", "timestamp", "interval_timestamp"] + alpaca_cols = set(alpaca_df_normalized.columns) - set(join_keys) + merged_cols = set(merged_df_normalized.columns) - set(join_keys) + overlapping_cols = alpaca_cols.intersection(merged_cols) + + # Convert timestamps to datetime for processing (use pd.concat to avoid fragmentation) + timestamp_columns = {} + + if "timestamp" in alpaca_df_normalized.columns: + timestamp_columns["timestamp_dt"] = pd.to_datetime(alpaca_df_normalized["timestamp"], unit="ms") + + if "interval_timestamp" in merged_df_normalized.columns: + timestamp_columns["interval_timestamp_dt"] = pd.to_datetime(merged_df_normalized["interval_timestamp"], unit="ms") + + # Add timestamp columns efficiently using pd.concat + if timestamp_columns: + for col_name, col_data in timestamp_columns.items(): + if col_name == "timestamp_dt" and "timestamp" in alpaca_df_normalized.columns: + alpaca_df_normalized = pd.concat([alpaca_df_normalized, col_data.to_frame(col_name)], axis=1) + elif col_name == "interval_timestamp_dt" and "interval_timestamp" in merged_df_normalized.columns: + merged_df_normalized = pd.concat([merged_df_normalized, col_data.to_frame(col_name)], axis=1) + + # Perform an OUTER merge to capture all data from both sources + final_merge = pd.merge( + merged_df_normalized, + alpaca_df_normalized, + left_on=["symbol", "interval_timestamp"], + right_on=["symbol", "timestamp"], + how="outer", # Changed from "left" to "outer" + suffixes=("", "_alpaca") + ) + + # For rows that came only from Alpaca (new symbols), copy the timestamp to interval_timestamp + alpaca_only_mask = final_merge["interval_timestamp"].isna() & final_merge["timestamp"].notna() + if alpaca_only_mask.any(): + final_merge.loc[alpaca_only_mask, "interval_timestamp"] = final_merge.loc[alpaca_only_mask, "timestamp"] + + # Set basic metadata for these new rows + final_merge.loc[alpaca_only_mask, "feature_timestamp"] = pd.Timestamp.now().value // 1000000 + + # Set stock/crypto flags based on symbol + for symbol in final_merge.loc[alpaca_only_mask, "symbol"].unique(): + symbol_mask = alpaca_only_mask & (final_merge["symbol"] == symbol) + is_stock = symbol.upper() in ["NVDA", "AAPL", "GOOGL", "MSFT", "TSLA", "AMZN", "META"] + final_merge.loc[symbol_mask, "is_stock"] = is_stock + final_merge.loc[symbol_mask, "is_crypto"] = not is_stock + if is_stock: + final_merge.loc[symbol_mask, "stock_market"] = "NASDAQ" + + # Copy _alpaca columns into base columns for Alpaca-only rows + feature_cols = [ + "open", "high", "low", "close", "volume", "trade_count", "vwap", + "symbol_quote", "bid_price", "bid_size", "bid_exchange", "ask_price", "ask_size", "ask_exchange", + "conditions", "tape", "symbol_trade", "exchange", "price", "size", "id", "conditions_trade", "tape_trade" + ] + for col in feature_cols: + alpaca_col = f"{col}_alpaca" + if alpaca_col in final_merge.columns and col in final_merge.columns: + final_merge.loc[alpaca_only_mask, col] = 
final_merge.loc[alpaca_only_mask, alpaca_col] + + # Step 11: Calculate merge statistics + total_merged_rows = len(merged_df_normalized) + total_alpaca_rows = len(alpaca_df_normalized) + total_final_rows = len(final_merge) + + # Count matches from original merged data + original_matched_rows = final_merge[ + final_merge["timestamp"].notna() & + final_merge["interval_timestamp"].notna() & + (final_merge["interval_timestamp"] != final_merge["timestamp"]) + ].shape[0] + + # Count new rows from Alpaca-only symbols + alpaca_only_rows = final_merge[ + final_merge["timestamp"].notna() & + (final_merge["interval_timestamp"] == final_merge["timestamp"]) + ].shape[0] + + # Total rows with Alpaca data + total_alpaca_matched = final_merge[final_merge["timestamp"].notna()].shape[0] + + original_match_rate = original_matched_rows / total_merged_rows if total_merged_rows > 0 else 0 + overall_match_rate = total_alpaca_matched / total_final_rows if total_final_rows > 0 else 0 + + + # Step 12: Debug successful matches and new symbols + if total_alpaca_matched > 0: + successful_matches = final_merge[final_merge["timestamp"].notna()] + sample_cols = ["symbol", "interval_timestamp", "timestamp", "open", "high", "low", "close", "volume"] + available_cols = [col for col in sample_cols if col in successful_matches.columns] + + # Step 13: Add merge metadata + final_merge["alpaca_merge_timestamp"] = pd.Timestamp.now().value // 1000000 # Convert to milliseconds + final_merge["alpaca_data_available"] = final_merge["timestamp"].notna() + final_merge["alpaca_match_rate"] = overall_match_rate + final_merge["is_new_symbol"] = final_merge["interval_timestamp"] == final_merge["timestamp"] + + # Step 14: Handle duplicate columns before saving + duplicate_cols = final_merge.columns[final_merge.columns.duplicated()].tolist() + if duplicate_cols: + final_merge = final_merge.loc[:, ~final_merge.columns.duplicated()] + + # Save the merged features + out_path = os.path.join(base_dir, "merged", "features", "merged_features.parquet") + + final_merge.to_parquet(out_path, index=False) + + # Generate detailed summary report + print(f"Total final rows: {len(final_merge)}") + print(f"Rows with Alpaca data: {total_alpaca_matched}") + print(f"New symbols added: {alpaca_only_rows}") + print(f"Overall match rate: {overall_match_rate:.2%}") + print(f"Total columns: {len(final_merge.columns)}") + + # Show symbols with and without Alpaca data + symbol_summary = final_merge.groupby("symbol").agg({ + "alpaca_data_available": ["count", "sum"], + "is_new_symbol": "sum" + }).round(2) + + symbol_summary.columns = ["total_rows", "alpaca_matches", "new_symbol_rows"] + symbol_summary["match_rate"] = symbol_summary["alpaca_matches"] / symbol_summary["total_rows"] + symbol_summary["is_new_symbol"] = symbol_summary["new_symbol_rows"] > 0 + + # Show which symbols have complete data + complete_symbols = symbol_summary[symbol_summary["match_rate"] > 0.5] + if len(complete_symbols) > 0: + print(complete_symbols[["total_rows", "alpaca_matches", "match_rate"]]) + + # Show sample of final merged data + sample_cols = ["symbol", "interval_timestamp", "alpaca_data_available", "is_new_symbol", "open", "high", "low", "close", "volume"] + + return final_merge + +if __name__ == "__main__": + try: + merged_df = merge_alpaca_features() + except Exception as e: + import traceback + traceback.print_exc() \ No newline at end of file diff --git a/src/merge/merge_4.py b/src/merge/merge_4.py new file mode 100644 index 
0000000000000000000000000000000000000000..6b032be4d033801ac1e363aeab7fa3db2a1a4a6f --- /dev/null +++ b/src/merge/merge_4.py @@ -0,0 +1,237 @@ +import json +import pandas as pd +from datetime import datetime +import numpy as np +import os + +def parse_news_data(file_path): + """Parse the news data file containing multiple JSON objects per line""" + news_data = [] + + with open(file_path, 'r') as f: + content = f.read() + + # Split by newlines and parse each JSON object + lines = content.strip().split('\n') + + for line in lines: + if line.strip(): + try: + news_item = json.loads(line) + news_data.append(news_item) + except json.JSONDecodeError as e: + print(f"Error parsing line: {line[:100]}...") + print(f"Error: {e}") + continue + + return news_data + +def extract_sentiment_features(news_data): + """Extract sentiment features from news data for each symbol""" + sentiment_features = {} + + for article in news_data: + # Get article-level info + published_at = article.get('published_at') + title = article.get('title', '') + description = article.get('description', '') + + # Process entities (stocks mentioned in the article) + entities = article.get('entities', []) + + for entity in entities: + if entity.get('type') == 'equity': + symbol = entity.get('symbol', '').lower() # Convert to lowercase + + if symbol: + if symbol not in sentiment_features: + sentiment_features[symbol] = { + 'news_sentiment_scores': [], + 'news_match_scores': [], + 'news_mentions_count': 0, + 'news_articles_count': 0, + 'latest_news_timestamp': None, + 'news_highlights_count': 0 + } + + # Add sentiment and match scores + sentiment_score = entity.get('sentiment_score') + match_score = entity.get('match_score') + + if sentiment_score is not None: + sentiment_features[symbol]['news_sentiment_scores'].append(sentiment_score) + + if match_score is not None: + sentiment_features[symbol]['news_match_scores'].append(match_score) + + # Count highlights + highlights = entity.get('highlights', []) + sentiment_features[symbol]['news_highlights_count'] += len(highlights) + + # Update latest timestamp + if published_at: + if (sentiment_features[symbol]['latest_news_timestamp'] is None or + published_at > sentiment_features[symbol]['latest_news_timestamp']): + sentiment_features[symbol]['latest_news_timestamp'] = published_at + + sentiment_features[symbol]['news_mentions_count'] += 1 + + # Count unique articles per symbol + mentioned_symbols = set(entity.get('symbol', '').lower() for entity in entities + if entity.get('type') == 'equity' and entity.get('symbol')) + + for symbol in mentioned_symbols: + if symbol in sentiment_features: + sentiment_features[symbol]['news_articles_count'] += 1 + + return sentiment_features + +def aggregate_sentiment_features(sentiment_data): + """Aggregate sentiment features into final metrics""" + aggregated = {} + + for symbol, data in sentiment_data.items(): + # Calculate aggregated metrics + sentiment_scores = data['news_sentiment_scores'] + match_scores = data['news_match_scores'] + + features = { + 'news_sentiment_mean': np.mean(sentiment_scores) if sentiment_scores else None, + 'news_sentiment_std': np.std(sentiment_scores) if len(sentiment_scores) > 1 else None, + 'news_sentiment_min': np.min(sentiment_scores) if sentiment_scores else None, + 'news_sentiment_max': np.max(sentiment_scores) if sentiment_scores else None, + 'news_match_score_mean': np.mean(match_scores) if match_scores else None, + 'news_match_score_max': np.max(match_scores) if match_scores else None, + 'news_mentions_count': 
data['news_mentions_count'], + 'news_articles_count': data['news_articles_count'], + 'news_highlights_count': data['news_highlights_count'], + 'latest_news_timestamp': data['latest_news_timestamp'], + 'news_sentiment_range': (np.max(sentiment_scores) - np.min(sentiment_scores)) if len(sentiment_scores) > 0 else None, + 'news_activity_score': data['news_mentions_count'] * np.mean(match_scores) if match_scores else 0 + } + + aggregated[symbol] = features + + return aggregated + +def merge_with_existing_features(news_features, existing_features_file): + """Merge news features with existing market data features""" + + # Load existing features + if existing_features_file.endswith('.parquet'): + df_existing = pd.read_parquet(existing_features_file) + else: + df_existing = pd.read_csv(existing_features_file) + + print(f"Loaded existing features: {df_existing.shape}") + print(f"News features available for {len(news_features)} symbols") + + # Add news features as new columns + news_columns = [ + 'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', + 'news_sentiment_max', 'news_match_score_mean', 'news_match_score_max', + 'news_mentions_count', 'news_articles_count', 'news_highlights_count', + 'latest_news_timestamp', 'news_sentiment_range', 'news_activity_score' + ] + + # Initialize all news columns with NaN + for col in news_columns: + df_existing[col] = np.nan + + # Fill in news features where available + symbols_matched = 0 + for idx, row in df_existing.iterrows(): + symbol = row['symbol'] + if symbol in news_features: + for col in news_columns: + # The keys in news_features already have the correct names + df_existing.loc[idx, col] = news_features[symbol].get(col, None) + symbols_matched += 1 + + print(f"Matched news features for {symbols_matched} symbols out of {len(df_existing)} total records") + + return df_existing + +def main(): + # Configuration + # Use Marketaux parquet file for news data + news_file = os.path.join('data', 'marketaux', 'news', 'news_latest.parquet') + existing_features_file = os.path.join('data', 'merged', 'features', 'merged_features.parquet') + output_file = os.path.join('data', 'merged', 'features', 'merged_features.parquet') + + # Check if news file exists + if not os.path.exists(news_file): + print(f"WARNING: News file not found: {news_file}") + print("This usually happens when MarketAux API keys are exhausted.") + print("Skipping news sentiment merge and keeping existing features unchanged.") + + # Just copy existing features if they exist + if os.path.exists(existing_features_file): + import shutil + shutil.copy2(existing_features_file, output_file) + print(f"Copied existing features to output: {output_file}") + else: + print(f"WARNING: No existing features file found at {existing_features_file}") + return + + print("Step 1: Loading news data from parquet...") + try: + news_df = pd.read_parquet(news_file) + news_data = news_df.to_dict(orient='records') + print(f"Loaded {len(news_data)} news articles from {news_file}") + except Exception as e: + print(f"ERROR: Failed to load news data: {e}") + print("Skipping news sentiment merge.") + + # Copy existing features as fallback + if os.path.exists(existing_features_file): + import shutil + shutil.copy2(existing_features_file, output_file) + print(f"Copied existing features to output: {output_file}") + return + + print("Step 2: Extracting sentiment features...") + sentiment_data = extract_sentiment_features(news_data) + print(f"Extracted sentiment data for {len(sentiment_data)} symbols") + + print("Step 3: 
Aggregating sentiment metrics...") + news_features = aggregate_sentiment_features(sentiment_data) + + # Display sample of extracted features + print("\nSample of extracted news features:") + for symbol, features in list(news_features.items())[:3]: + print(f"\n{symbol.upper()}:") + for key, value in features.items(): + if value is not None: + if isinstance(value, float): + print(f" {key}: {value:.4f}") + else: + print(f" {key}: {value}") + + print(f"\nStep 4: Merging with existing features...") + try: + merged_df = merge_with_existing_features(news_features, existing_features_file) + + # Remove 'links.pulsex' column if present + if 'links.pulsex' in merged_df.columns: + merged_df = merged_df.drop(columns=['links.pulsex']) + + print(f"Step 5: Saving merged features...") + merged_df.to_parquet(output_file, index=False) + print(f"Saved merged features to {output_file}") + print(f"Final dataset shape: {merged_df.shape}") + + # Show summary of news feature coverage + news_cols = [col for col in merged_df.columns if col.startswith('news_')] + print(f"\nNews feature coverage:") + for col in news_cols: + non_null_count = merged_df[col].notna().sum() + coverage = non_null_count / len(merged_df) * 100 + print(f" {col}: {non_null_count}/{len(merged_df)} ({coverage:.1f}%)") + + except Exception as e: + print(f"Error during merging: {e}") + print("Make sure your merged_features.parquet file exists and is accessible") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/merge_5.py b/src/merge/merge_5.py new file mode 100644 index 0000000000000000000000000000000000000000..0af85ea47c09326d3a14c4b7d6d9bd8a514d2412 --- /dev/null +++ b/src/merge/merge_5.py @@ -0,0 +1,376 @@ +import pandas as pd +import numpy as np +from datetime import datetime, timedelta +import json +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +def normalize_stock_data(df_stocks): + """ + Normalize stock data to ensure consistent format for merging. + """ + logger.info("=== NORMALIZING STOCK DATA ===") + df_stocks = df_stocks.copy() + + # Normalize symbol to uppercase and strip whitespace + df_stocks['symbol'] = df_stocks['symbol'].astype(str).str.upper().str.strip() + + # Ensure interval_timestamp is int64 (Unix timestamp in milliseconds) + if 'interval_timestamp' in df_stocks.columns: + # If it's already numeric, ensure it's int64 + df_stocks['interval_timestamp'] = pd.to_numeric(df_stocks['interval_timestamp'], errors='coerce').astype('int64') + logger.info(f"Stock timestamp range: {df_stocks['interval_timestamp'].min()} to {df_stocks['interval_timestamp'].max()}") + logger.info(f"Stock timestamp sample: {df_stocks['interval_timestamp'].head().tolist()}") + + logger.info(f"Stock symbols sample: {df_stocks['symbol'].unique()[:10].tolist()}") + logger.info(f"Stock data shape: {df_stocks.shape}") + + return df_stocks + +def normalize_news_data(df_news): + """ + Normalize news data to ensure consistent format for merging. 
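+    Each article's entities list is exploded into one record per equity entity;
+    published_at is floored to a 30-minute interval and stored as epoch
+    milliseconds in interval_timestamp so the records line up with the stock
+    feature grid.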
+ """ + logger.info("=== NORMALIZING NEWS DATA ===") + df_news = df_news.copy() + + # Extract entities and create individual records + news_records = [] + + for idx, row in df_news.iterrows(): + entities = row.get('entities', []) + + # Only proceed if entities is a non-empty list or ndarray + if not isinstance(entities, (list, np.ndarray)) or len(entities) == 0: + continue + + # Convert published_at to timestamp + try: + if isinstance(row['published_at'], str): + published_dt = pd.to_datetime(row['published_at']) + else: + published_dt = row['published_at'] + except: + logger.warning(f"Could not parse published_at for row {idx}") + continue + + # Process each entity + for entity in entities: + if not isinstance(entity, dict): + continue + + # Only process equity type entities with symbols + if entity.get('type') == 'equity' and 'symbol' in entity: + symbol = str(entity['symbol']).upper().strip() + + # Create 30-minute intervals (matching your stock data) + interval_dt = published_dt.floor('30min') + # Convert to Unix timestamp in milliseconds + interval_timestamp = int(interval_dt.timestamp() * 1000) + + news_records.append({ + 'symbol': symbol, + 'interval_timestamp': interval_timestamp, + 'published_at': published_dt, + 'sentiment_score': entity.get('sentiment_score', 0), + 'match_score': entity.get('match_score', 0), + 'highlights_count': len(entity.get('highlights', [])), + 'news_uuid': row.get('uuid', ''), + 'news_title': row.get('title', ''), + 'news_source': row.get('source', ''), + 'relevance_score': row.get('relevance_score', 0) + }) + + if not news_records: + logger.warning("No valid news records found") + return pd.DataFrame() + + df_news_normalized = pd.DataFrame(news_records) + logger.info(f"Normalized news data shape: {df_news_normalized.shape}") + # Print columns that are completely null and those that aren't + null_columns = [col for col in df_news_normalized.columns if df_news_normalized[col].isnull().all()] + not_null_columns = [col for col in df_news_normalized.columns if not df_news_normalized[col].isnull().all()] + print(f"Completely null columns: {null_columns}") + print(f"Non-null columns: {not_null_columns}") + logger.info(f"News symbols sample: {df_news_normalized['symbol'].unique()[:10].tolist()}") + logger.info(f"News timestamp range: {df_news_normalized['interval_timestamp'].min()} to {df_news_normalized['interval_timestamp'].max()}") + logger.info(f"News timestamp sample: {df_news_normalized['interval_timestamp'].head().tolist()}") + + return df_news_normalized + +def find_nearest_timestamp_matches(df_stocks, df_news, time_tolerance_minutes=30): + """ + Find the nearest timestamp matches within a tolerance window. + This handles cases where timestamps don't align exactly. 
+ """ + logger.info(f"=== FINDING NEAREST TIMESTAMP MATCHES (tolerance: {time_tolerance_minutes} min) ===") + + if df_news.empty: + return df_stocks.assign(**{col: 0 for col in [ + 'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', 'news_sentiment_max', + 'news_match_score_mean', 'news_match_score_max', 'news_highlights_count', + 'news_articles_count', 'latest_news_timestamp', 'news_sentiment_range', + 'news_activity_score', 'news_mentions_count' + ]}) + + # Convert tolerance to milliseconds + tolerance_ms = time_tolerance_minutes * 60 * 1000 + + # Get unique combinations for efficient processing + stock_keys = df_stocks[['symbol', 'interval_timestamp']].drop_duplicates() + + matched_records = [] + + for _, stock_row in stock_keys.iterrows(): + symbol = stock_row['symbol'] + stock_timestamp = stock_row['interval_timestamp'] + + # Find news for this symbol + symbol_news = df_news[df_news['symbol'] == symbol].copy() + + if symbol_news.empty: + continue + + # Calculate time differences + symbol_news['time_diff'] = abs(symbol_news['interval_timestamp'] - stock_timestamp) + + # Filter within tolerance + nearby_news = symbol_news[symbol_news['time_diff'] <= tolerance_ms] + + if nearby_news.empty: + continue + + # Aggregate the nearby news + agg_data = { + 'symbol': symbol, + 'interval_timestamp': stock_timestamp, + 'news_sentiment_mean': nearby_news['sentiment_score'].mean(), + 'news_sentiment_std': nearby_news['sentiment_score'].std(), + 'news_sentiment_min': nearby_news['sentiment_score'].min(), + 'news_sentiment_max': nearby_news['sentiment_score'].max(), + 'news_match_score_mean': nearby_news['match_score'].mean(), + 'news_match_score_max': nearby_news['match_score'].max(), + 'news_highlights_count': nearby_news['highlights_count'].sum(), + 'news_articles_count': len(nearby_news), + 'latest_news_timestamp': nearby_news['published_at'].max(), + 'news_mentions_count': len(nearby_news) + } + + # Calculate additional features + agg_data['news_sentiment_range'] = agg_data['news_sentiment_max'] - agg_data['news_sentiment_min'] + agg_data['news_activity_score'] = agg_data['news_match_score_mean'] + agg_data['news_match_score_max'] + + # Fill NaN values + for key, value in agg_data.items(): + if pd.isna(value) and key not in ['symbol', 'interval_timestamp', 'latest_news_timestamp']: + agg_data[key] = 0 + + matched_records.append(agg_data) + + if matched_records: + df_matched_news = pd.DataFrame(matched_records) + logger.info(f"Found {len(df_matched_news)} symbol-timestamp matches") + + # Merge with stock data + df_result = df_stocks.merge( + df_matched_news, + on=['symbol', 'interval_timestamp'], + how='left' + ) + else: + logger.warning("No timestamp matches found within tolerance") + df_result = df_stocks.copy() + + # Fill remaining NaN values for stocks without news + news_columns = [ + 'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', 'news_sentiment_max', + 'news_match_score_mean', 'news_match_score_max', 'news_highlights_count', + 'news_articles_count', 'news_sentiment_range', 'news_activity_score', 'news_mentions_count' + ] + + for col in news_columns: + if col in df_result.columns: + df_result[col] = df_result[col].fillna(0) + + # Report results + if 'news_articles_count' in df_result.columns: + stocks_with_news = len(df_result[df_result['news_articles_count'] > 0]) + total_news_articles = df_result['news_articles_count'].sum() + logger.info(f"Successfully matched news for {stocks_with_news} stock records out of {len(df_result)}") + logger.info(f"Total news 
articles matched: {total_news_articles}") + + return df_result + +def diagnose_data_alignment(df_stocks, df_news): + """ + Diagnose alignment issues between stock and news data. + """ + logger.info("=== DATA ALIGNMENT DIAGNOSIS ===") + + # Check symbol overlap + stock_symbols = set(df_stocks['symbol'].unique()) if 'symbol' in df_stocks.columns else set() + news_symbols = set(df_news['symbol'].unique()) if len(df_news) > 0 and 'symbol' in df_news.columns else set() + + common_symbols = stock_symbols.intersection(news_symbols) + + logger.info(f"Stock symbols: {len(stock_symbols)} unique") + logger.info(f"News symbols: {len(news_symbols)} unique") + logger.info(f"Common symbols: {len(common_symbols)}") + logger.info(f"Common symbols sample: {list(common_symbols)[:10]}") + + # Check timestamp ranges + if 'interval_timestamp' in df_stocks.columns: + stock_ts_min = df_stocks['interval_timestamp'].min() + stock_ts_max = df_stocks['interval_timestamp'].max() + stock_ts_range = pd.to_datetime([stock_ts_min, stock_ts_max], unit='ms') + logger.info(f"Stock timestamp range: {stock_ts_range[0]} to {stock_ts_range[1]}") + + if len(df_news) > 0 and 'interval_timestamp' in df_news.columns: + news_ts_min = df_news['interval_timestamp'].min() + news_ts_max = df_news['interval_timestamp'].max() + news_ts_range = pd.to_datetime([news_ts_min, news_ts_max], unit='ms') + logger.info(f"News timestamp range: {news_ts_range[0]} to {news_ts_range[1]}") + + # Check for timestamp overlap + if 'interval_timestamp' in df_stocks.columns: + overlap_start = max(stock_ts_min, news_ts_min) + overlap_end = min(stock_ts_max, news_ts_max) + if overlap_start <= overlap_end: + overlap_range = pd.to_datetime([overlap_start, overlap_end], unit='ms') + logger.info(f"Timestamp overlap: {overlap_range[0]} to {overlap_range[1]}") + else: + logger.warning("No timestamp overlap between stock and news data") + +def parse_json_news_file(news_file_path): + """ + Parse news file that contains JSON records (one per line or structured). + """ + logger.info(f"Parsing news file: {news_file_path}") + + try: + # Try reading as parquet first + df_news = pd.read_parquet(news_file_path) + logger.info(f"Successfully read parquet file with shape: {df_news.shape}") + + # Check if the data contains JSON strings that need parsing + if len(df_news.columns) == 1 and df_news.iloc[0, 0] and isinstance(df_news.iloc[0, 0], str): + logger.info("Detected JSON strings in single column, parsing...") + json_records = [] + for idx, row in df_news.iterrows(): + try: + json_data = json.loads(row.iloc[0]) + json_records.append(json_data) + except json.JSONDecodeError as e: + logger.warning(f"Failed to parse JSON at row {idx}: {e}") + continue + + if json_records: + df_news = pd.DataFrame(json_records) + logger.info(f"Parsed {len(json_records)} JSON records") + + return df_news + + except Exception as e: + logger.error(f"Error reading news file: {e}") + return pd.DataFrame() + +def main(stocks_file_path, news_file_path, output_file_path, time_tolerance_minutes=30): + """ + Main function to normalize and merge stock and news data. 
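+    Illustrative call (arguments mirror the __main__ block at the end of this
+    module; adjust the paths and tolerance to your layout):
+
+        df = main(
+            stocks_file_path="data/merged/features/stocks_features.parquet",
+            news_file_path="data/marketaux/news/news_latest.parquet",
+            output_file_path="data/merged/features/stocks_features.parquet",
+            time_tolerance_minutes=60*24,
+        )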
+ """ + try: + logger.info("=== STARTING DATA NORMALIZATION AND MERGE ===") + + # Step 1: Load stock data + logger.info("Step 1: Loading stock data...") + df_stocks = pd.read_parquet(stocks_file_path) + logger.info(f"Loaded stock data with shape: {df_stocks.shape}") + + # Step 2: Load and parse news data + logger.info("Step 2: Loading news data...") + df_news_raw = parse_json_news_file(news_file_path) + + if df_news_raw.empty: + logger.warning("No news data found, creating stock data with empty news columns") + df_stocks = normalize_stock_data(df_stocks) + # Add empty news columns + for col in ['news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', + 'news_sentiment_max', 'news_match_score_mean', 'news_match_score_max', + 'news_highlights_count', 'news_articles_count', 'latest_news_timestamp', + 'news_sentiment_range', 'news_activity_score', 'news_mentions_count']: + df_stocks[col] = 0 if col != 'latest_news_timestamp' else None + df_stocks.to_parquet(output_file_path, index=False) + logger.info("Saved stock data with empty news columns") + return df_stocks + + # Step 3: Normalize both datasets + logger.info("Step 3: Normalizing stock data...") + df_stocks_norm = normalize_stock_data(df_stocks) + + logger.info("Step 4: Normalizing news data...") + df_news_norm = normalize_news_data(df_news_raw) + + # Step 5: Diagnose alignment + logger.info("Step 5: Diagnosing data alignment...") + diagnose_data_alignment(df_stocks_norm, df_news_norm) + + # Step 6: Find nearest timestamp matches and merge + logger.info("Step 6: Finding nearest timestamp matches and merging...") + df_merged = find_nearest_timestamp_matches( + df_stocks_norm, + df_news_norm, + time_tolerance_minutes=time_tolerance_minutes + ) + + # Step 7: Save results + logger.info("Step 7: Saving merged data...") + df_merged.to_parquet(output_file_path, index=False) + logger.info(f"Saved merged data to {output_file_path}") + + # Final report + logger.info("=== MERGE COMPLETED ===") + logger.info(f"Final dataset shape: {df_merged.shape}") + + news_cols = [col for col in df_merged.columns if col.startswith('news_')] + logger.info(f"News columns added: {len(news_cols)}") + + if 'news_articles_count' in df_merged.columns: + total_articles = df_merged['news_articles_count'].sum() + records_with_news = len(df_merged[df_merged['news_articles_count'] > 0]) + logger.info(f"Total news articles merged: {total_articles}") + logger.info(f"Stock records with news: {records_with_news} / {len(df_merged)}") + + return df_merged + + except Exception as e: + logger.error(f"Error in main process: {e}") + import traceback + logger.error(traceback.format_exc()) + raise + +# Example usage +if __name__ == "__main__": + import os + + # Update these paths to match your actual file locations + base_dir = "data/" # Update this + stocks_file = os.path.join(base_dir, "merged/features/stocks_features.parquet") + news_file = os.path.join(base_dir, "marketaux/news/news_latest.parquet") + output_file = os.path.join(base_dir, "merged/features/stocks_features.parquet") + + # Check if stocks_features.parquet exists before running + if not os.path.exists(stocks_file): + logger.error(f"Input file missing: {stocks_file}") + print(f"ERROR: Input file missing: {stocks_file}") + exit(1) + + # Run the merge with 30-minute tolerance (adjust as needed) + df_result = main( + stocks_file_path=stocks_file, + news_file_path=news_file, + output_file_path=output_file, + time_tolerance_minutes=60*24 # Adjust this based on your needs + ) \ No newline at end of file diff --git 
a/src/merge/merge_6.py b/src/merge/merge_6.py new file mode 100644 index 0000000000000000000000000000000000000000..15967cad40a4088c5d4217d2b0cecc1661102aca --- /dev/null +++ b/src/merge/merge_6.py @@ -0,0 +1,612 @@ +import os +import pandas as pd +import numpy as np +from pathlib import Path +# import #logging +from datetime import datetime + +# Resolve DATA_DIR from config (container-safe) with fallback +try: + from src.config import DATA_DIR as CFG_DATA_DIR # when run as module +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR # when run as script from src/ + except Exception: + CFG_DATA_DIR = "/data" + +class FixedTimestampHandler: + def __init__(self, base_path: str | os.PathLike | None = None): + # Prefer explicit argument, then DATA_DIR env, then config fallback + resolved_base = base_path or os.getenv("DATA_DIR") or CFG_DATA_DIR + self.base_path = Path(resolved_base) + self.finviz_path = self.base_path / "finviz" / "sentiment" + self.crypto_features_path = self.base_path / "merged" / "features" / "crypto_features.parquet" + self.stocks_features_path = self.base_path / "merged" / "features" / "stocks_features.parquet" + self.output_path = self.base_path / "merged" / "features" + self.output_path.mkdir(parents=True, exist_ok=True) + # Configure #logging + #logging.basicConfig(level=#logging.INFO, + # format='%(asctime)s - %(levelname)s - %(message)s') + # Define tickers and mappings + self.stock_tickers = ["AAPL", "TSLA", "GOOGL", "NVDA", "MSFT", "COIN"] + self.crypto_ticker_mapping = { + "BTC": "bitcoin", + "ETH": "ethereum", + "SOL": "solana", + "XRP": "ripple", + "ADA": "cardano" + } + # Reverse mapping: crypto name to ticker (all lowercase keys) + self.crypto_name_to_ticker = {v.lower(): k for k, v in self.crypto_ticker_mapping.items()} + + def crypto_name_to_symbol(self, name): + """Transform crypto name (e.g., 'bitcoin', 'Bitcoin', 'BITCOIN') to ticker symbol (e.g., 'BTC')""" + if not isinstance(name, str): + return None + name_lower = name.strip().lower() + # Try exact match + if name_lower in self.crypto_name_to_ticker: + return self.crypto_name_to_ticker[name_lower] + # Try to match ignoring spaces and underscores + for key in self.crypto_name_to_ticker: + if name_lower.replace(' ', '').replace('_', '') == key.replace(' ', '').replace('_', ''): + return self.crypto_name_to_ticker[key] + return None + + def is_timestamp_column(self, df, col_name): + """Determine if a column is likely a timestamp column""" + if pd.api.types.is_datetime64_any_dtype(df[col_name]): + return True + if pd.api.types.is_numeric_dtype(df[col_name]): + sample_vals = df[col_name].dropna() + if len(sample_vals) == 0: + return False + sample_val = sample_vals.iloc[0] + current_time = pd.Timestamp.now().timestamp() + units = [ + ('s', 1), + ('ms', 1000), + ('us', 1000000), + ('ns', 1000000000) + ] + for unit, divisor in units: + try: + if unit == 's': + ts_value = sample_val + else: + ts_value = sample_val / divisor + if abs(ts_value - current_time) < (10 * 365 * 24 * 3600): + return True + except: + continue + if df[col_name].dtype == 'object': + sample_val = df[col_name].dropna().iloc[0] if not df[col_name].empty else None + if sample_val and isinstance(sample_val, str): + try: + pd.to_datetime(sample_val) + return True + except (ValueError, TypeError): + pass + return False + + def get_timestamp_columns(self, df): + """Identify all timestamp columns in a dataframe""" + timestamp_cols = [] + potential_names = ['time', 'date', 'interval', 'timestamp', 'dt'] + for col in df.columns: + 
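+            # Name-based prefilter: only columns whose names hint at a time
+            # field are handed to is_timestamp_column(), which then checks the
+            # dtype, plausible epoch ranges (s/ms/us/ns), or string parsing.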
if any(keyword in col.lower() for keyword in potential_names): + if self.is_timestamp_column(df, col): + timestamp_cols.append(col) + return timestamp_cols + + def convert_timestamp_column(self, df, col_name, unit='auto'): + """Convert a timestamp column to datetime format with improved validation""" + if pd.api.types.is_datetime64_any_dtype(df[col_name]): + if df[col_name].dt.tz is not None: + df[col_name] = df[col_name].dt.tz_localize(None) + return df[col_name] + if pd.api.types.is_numeric_dtype(df[col_name]): + sample_vals = df[col_name].dropna() + if len(sample_vals) == 0: + print(f"[ERROR] No valid values in timestamp column {col_name}") + return None + + # Convert nullable Int64 to regular numeric if needed + if hasattr(sample_vals, 'dtype') and str(sample_vals.dtype).startswith('Int'): + sample_vals = sample_vals.astype('int64') + + if unit == 'auto': + current_time = pd.Timestamp.now().timestamp() + best_unit = None + best_distance = float('inf') + for test_unit in ['s', 'ms', 'us', 'ns']: + try: + # Additional safety check + if len(sample_vals) == 0: + continue + first_val = sample_vals.iloc[0] + if pd.isna(first_val): + continue + if test_unit == 's': + test_ts = pd.to_datetime(first_val, unit='s') + else: + divisor = {'ms': 1000, 'us': 1000000, 'ns': 1000000000}[test_unit] + test_ts = pd.to_datetime(first_val / divisor, unit='s') + distance = abs((pd.Timestamp.now() - test_ts).total_seconds()) + if distance < best_distance: + best_distance = distance + best_unit = test_unit + except Exception as e: + #logging.debug(f"Failed to test unit {test_unit} for column {col_name}: {e}") + continue + if best_unit is None: + #logging.error(f"Could not determine unit for column {col_name}") + return None + unit = best_unit + #logging.info(f"Auto-detected unit for {col_name}: {unit}") + try: + # Convert nullable Int64 to regular numeric if needed for the whole column + values_to_convert = df[col_name] + if hasattr(values_to_convert, 'dtype') and str(values_to_convert.dtype).startswith('Int'): + values_to_convert = values_to_convert.astype('int64') + + if unit == 's': + converted = pd.to_datetime(values_to_convert, unit='s') + else: + divisor = {'ms': 1000, 'us': 1000000, 'ns': 1000000000}[unit] + converted = pd.to_datetime(values_to_convert / divisor, unit='s') + if converted.dt.tz is not None: + converted = converted.dt.tz_localize(None) + if converted.min().year < 2000: + #logging.warning(f"Converted timestamps for {col_name} seem too old. 
Checking alternative units.") + for alt_unit in ['s', 'ms', 'us', 'ns']: + if alt_unit == unit: + continue + try: + if alt_unit == 's': + alt_converted = pd.to_datetime(df[col_name], unit='s') + else: + alt_divisor = {'ms': 1000, 'us': 1000000, 'ns': 1000000000}[alt_unit] + alt_converted = pd.to_datetime(df[col_name] / alt_divisor, unit='s') + if alt_converted.min().year > 2000: + #logging.info(f"Alternative unit {alt_unit} gives better results for {col_name}") + converted = alt_converted + break + except Exception as e: + #logging.debug(f"Failed to try alternative unit {alt_unit} for column {col_name}: {e}") + continue + #logging.info(f"Successfully converted {col_name} using unit '{unit}'") + #logging.info(f"Date range: {converted.min()} to {converted.max()}") + return converted + except Exception as e: + #logging.error(f"Failed to convert {col_name} using unit '{unit}': {e}") + return None + elif df[col_name].dtype == 'object': + try: + converted = pd.to_datetime(df[col_name]) + if converted.dt.tz is not None: + converted = converted.dt.tz_localize(None) + #logging.info(f"Successfully converted string timestamps in {col_name}") + return converted + except Exception as e: + #logging.error(f"Failed to convert string timestamps in {col_name}: {e}") + return None + else: + #logging.error(f"Unknown timestamp format in column {col_name}") + return None + + def select_best_timestamp_column(self, df, timestamp_columns): + """Select the best timestamp column from a list of potential columns""" + best_col = None + best_score = -1 + for col in timestamp_columns: + try: + if col not in df.columns: + print(f"[WARN] Column {col} not found in dataframe") + continue + if df[col].isnull().all(): + print(f"[WARN] Column {col} contains only null values") + continue + converted = self.convert_timestamp_column(df, col) + if converted is None: + print(f"[WARN] Could not convert column {col} to timestamp") + continue + non_null_count = converted.notna().sum() + recent_count = converted[converted > pd.Timestamp('2020-01-01')].count() + score = non_null_count + recent_count * 2 + print(f"[DEBUG] Column {col}: score={score}, non_null={non_null_count}, recent={recent_count}") + if score > best_score: + best_score = score + best_col = col + except Exception as e: + print(f"[WARN] Error evaluating timestamp column {col}: {e}") + continue + print(f"[INFO] Best timestamp column: {best_col} (score: {best_score})") + return best_col + + def load_sentiment_data(self, symbol): + """Load sentiment data with proper timestamp handling""" + sentiment_file = self.finviz_path / f"{symbol.upper()}_sentiment.parquet" + if not sentiment_file.exists(): + print(f"[WARN] Sentiment file not found: {sentiment_file}") + return None + try: + df = pd.read_parquet(sentiment_file) + print(f"[INFO] Loaded sentiment data for {symbol}: {len(df)} rows") + timestamp_cols = self.get_timestamp_columns(df) + if not timestamp_cols: + print(f"[ERROR] No timestamp columns found in {symbol} sentiment data") + return None + timestamp_col = timestamp_cols[0] + converted = self.convert_timestamp_column(df, timestamp_col) + if converted is None: + print(f"[ERROR] Could not convert timestamp column {timestamp_col} in {symbol}") + return None + df['sentiment_timestamp'] = converted + df['symbol'] = symbol.upper() + return df + except Exception as e: + print(f"[ERROR] Error loading sentiment data for {symbol}: {e}") + return None + + def load_features_data(self, data_type='stocks'): + """Load features data with improved timestamp handling""" + file_path = 
self.stocks_features_path if data_type == 'stocks' else self.crypto_features_path + if not file_path.exists(): + print(f"[ERROR] Features file not found: {file_path}") + return None + try: + df = pd.read_parquet(file_path) + print(f"[INFO] Loaded {data_type} features: {len(df)} rows") + potential_timestamp_cols = [col for col in df.columns if any(keyword in col.lower() for keyword in ['time', 'date', 'interval', 'timestamp', 'dt'])] + print(f"[INFO] Potential timestamp columns: {potential_timestamp_cols}") + + # Safer timestamp detection + timestamp_cols = [] + for col in potential_timestamp_cols: + try: + is_ts = self.is_timestamp_column(df, col) + if is_ts: + timestamp_cols.append(col) + print(f"[DEBUG] {col} confirmed as timestamp column") + else: + print(f"[DEBUG] {col} rejected as timestamp column") + except Exception as e: + print(f"[WARN] Error checking {col}: {e}") + continue + + print(f"[INFO] Confirmed timestamp columns: {timestamp_cols}") + if not timestamp_cols: + print(f"[ERROR] No valid timestamp columns found in {data_type} features") + return None + best_col = self.select_best_timestamp_column(df, timestamp_cols) + if best_col is None: + print(f"[ERROR] Could not select a valid timestamp column from {timestamp_cols}") + return None + converted = self.convert_timestamp_column(df, best_col) + if converted is None: + print(f"[ERROR] Failed to convert selected timestamp column {best_col}") + return None + df['feature_timestamp'] = converted + print(f"[INFO] Selected timestamp column: {best_col}") + print(f"[INFO] Date range: {converted.min()} to {converted.max()}") + return df + except Exception as e: + import traceback + print(f"[ERROR] Error loading {data_type} features: {e}") + print(f"[ERROR] Traceback: {traceback.format_exc()}") + return None + + def merge_sentiment_to_features(self, features_df, sentiment_df, tolerance_minutes=60*12): + """Merge sentiment data INTO features data based on closest timestamp, with tolerance window""" + features_sorted = features_df.sort_values(by='feature_timestamp') + sentiment_sorted = sentiment_df.sort_values(by='sentiment_timestamp') + + # Use a tolerance window for timestamp matching + tolerance = pd.Timedelta(minutes=tolerance_minutes) + merged_df = pd.merge_asof( + features_sorted, + sentiment_sorted, + left_on='feature_timestamp', + right_on='sentiment_timestamp', + direction='nearest', + tolerance=tolerance + ) + + # If no sentiment match within tolerance, sentiment_score will be NaN + if 'sentiment_score' in merged_df.columns: + unmatched = merged_df['sentiment_score'].isna().sum() + print(f"[INFO] Rows with no sentiment match (NaN sentiment_score): {unmatched}") + + print(f"[INFO] Merged {len(features_df)} feature rows with {len(sentiment_df)} sentiment rows using tolerance {tolerance_minutes} min") + print(f"[INFO] Result: {len(merged_df)} rows") + return merged_df + + def process_stocks_data(self): + """Process all stocks data by merging finviz sentiment into stock features""" + print("[INFO] Processing stocks data...") + + # Load stocks features first (this is the base dataset) + stocks_df = self.load_features_data('stocks') + if stocks_df is None: + print("[ERROR] Failed to load stocks features data") + return None + + # Check what columns are available and what symbols are in the data + if 'symbol' in stocks_df.columns: + unique_symbols = stocks_df['symbol'].unique() + elif 'ticker' in stocks_df.columns: + unique_symbols = stocks_df['ticker'].unique() + + print(f"[INFO] Available symbols in stocks features: {unique_symbols}") 
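+        # Note: if the finviz sentiment directory (or every per-ticker
+        # sentiment file) is missing, the checks below save the stock features
+        # unchanged so downstream steps still find stocks_features.parquet.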
+ + # Check if any sentiment files exist + if not self.finviz_path.exists(): + print(f"[WARN] Finviz sentiment directory does not exist: {self.finviz_path}") + print(f"[WARN] Proceeding without sentiment data merge for stocks") + # Save features as-is without sentiment merge + output_file = self.output_path / "stocks_features.parquet" + stocks_df.to_parquet(output_file) + print(f"[INFO] Stocks features saved without sentiment to: {output_file}") + return stocks_df + + # Check if any sentiment files exist for our tickers + sentiment_files_exist = any( + (self.finviz_path / f"{ticker.upper()}_sentiment.parquet").exists() + for ticker in self.stock_tickers + ) + + if not sentiment_files_exist: + print(f"[WARN] No sentiment files found for any stock tickers: {self.stock_tickers}") + print(f"[WARN] Proceeding without sentiment data merge for stocks") + # Save features as-is without sentiment merge + output_file = self.output_path / "stocks_features.parquet" + stocks_df.to_parquet(output_file) + print(f"[INFO] Stocks features saved without sentiment to: {output_file}") + return stocks_df + + merged_stocks_list = [] + + for ticker in self.stock_tickers: + print(f"[INFO] Processing stock ticker: {ticker}") + + # Load sentiment data for this ticker + sentiment_df = self.load_sentiment_data(ticker) + if sentiment_df is None: + print(f"[WARN] No sentiment data for {ticker}, skipping...") + continue + + # Filter stocks features for this ticker + ticker_stocks = None + if 'symbol' in stocks_df.columns: + ticker_stocks = stocks_df[stocks_df['symbol'] == ticker].copy() + elif 'ticker' in stocks_df.columns: + ticker_stocks = stocks_df[stocks_df['ticker'] == ticker].copy() + + if ticker_stocks is None or len(ticker_stocks) == 0: + print(f"[WARN] No feature data found for ticker {ticker} - skipping this ticker") + continue + + print(f"[INFO] Found {len(ticker_stocks)} feature rows for {ticker}") + + # Merge sentiment INTO features + merged_ticker = self.merge_sentiment_to_features(ticker_stocks, sentiment_df) + + # Remove symbol_y and replace symbol_x with symbol + if 'symbol_y' in merged_ticker.columns: + merged_ticker = merged_ticker.drop(columns=['symbol_y']) + if 'symbol_x' in merged_ticker.columns: + merged_ticker = merged_ticker.rename(columns={'symbol_x': 'symbol'}) + + # Re-order columns: symbol first, interval_timestamp second (if present) + cols = list(merged_ticker.columns) + if 'symbol' in cols: + cols.remove('symbol') + new_order = ['symbol'] + if 'interval_timestamp' in cols: + cols.remove('interval_timestamp') + new_order.append('interval_timestamp') + new_order += cols + merged_ticker = merged_ticker[new_order] + merged_stocks_list.append(merged_ticker) + + if not merged_stocks_list: + print("[WARN] No stocks data was successfully merged with sentiment") + print("[WARN] Saving original stocks features without sentiment") + output_file = self.output_path / "stocks_features.parquet" + stocks_df.to_parquet(output_file) + print(f"[INFO] Stocks features saved without sentiment to: {output_file}") + return stocks_df + + # Combine all merged stock data + final_stocks_df = pd.concat(merged_stocks_list, ignore_index=True) + + # Save the result + output_file = self.output_path / "stocks_features.parquet" + final_stocks_df.to_parquet(output_file) + print(f"[INFO] Stocks data with sentiment saved to: {output_file}") + + return final_stocks_df + + def process_crypto_data(self): + """Process all crypto data by merging finviz sentiment into crypto features""" + print("[INFO] Processing crypto data...") + + 
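+        # Crypto rows are matched to FinViz sentiment per ticker in
+        # self.crypto_ticker_mapping; the filtering below tries an exact ticker
+        # match, then a full-name match, then a case-insensitive partial match
+        # before skipping a coin.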
# Load crypto features first (this is the base dataset) + crypto_df = self.load_features_data('crypto') + if crypto_df is None: + print("[ERROR] Failed to load crypto features data") + return None + + # Check for various possible symbol/ticker columns + symbol_columns = [col for col in crypto_df.columns if any(keyword in col.lower() + for keyword in ['symbol', 'ticker', 'name', 'id', 'coin'])] + + print(f"[INFO] Available symbol columns in crypto: {symbol_columns}") + + # Try to identify unique values in potential symbol columns + for col in symbol_columns: + if crypto_df[col].dtype == 'object': + unique_values = crypto_df[col].unique()[:10] # Show first 10 unique values + print(f"[INFO] Sample values in {col}: {unique_values}") + + # Check if any sentiment files exist + if not self.finviz_path.exists(): + print(f"[WARN] Finviz sentiment directory does not exist: {self.finviz_path}") + print(f"[WARN] Proceeding without sentiment data merge for crypto") + # Save features as-is without sentiment merge + output_file = self.output_path / "crypto_features.parquet" + crypto_df.to_parquet(output_file) + print(f"[INFO] Crypto features saved without sentiment to: {output_file}") + return crypto_df + + # Check if any sentiment files exist for our crypto tickers + sentiment_files_exist = any( + (self.finviz_path / f"{ticker.upper()}_sentiment.parquet").exists() + for ticker in self.crypto_ticker_mapping.keys() + ) + + if not sentiment_files_exist: + print(f"[WARN] No sentiment files found for any crypto tickers: {list(self.crypto_ticker_mapping.keys())}") + print(f"[WARN] Proceeding without sentiment data merge for crypto") + # Save features as-is without sentiment merge + output_file = self.output_path / "crypto_features.parquet" + crypto_df.to_parquet(output_file) + print(f"[INFO] Crypto features saved without sentiment to: {output_file}") + return crypto_df + + merged_crypto_list = [] + + for crypto_ticker, crypto_name in self.crypto_ticker_mapping.items(): + print(f"[INFO] Processing crypto ticker: {crypto_ticker} (name: {crypto_name})") + + # Load sentiment data for this crypto ticker + sentiment_df = self.load_sentiment_data(crypto_ticker) + if sentiment_df is None: + print(f"[WARN] No sentiment data for {crypto_ticker}, skipping...") + continue + + # Try different approaches to filter crypto features + ticker_crypto = None + + # Approach 1: Try exact ticker match + for col in ['symbol', 'ticker', 'coin_id', 'id']: + if col in crypto_df.columns: + matches = crypto_df[crypto_df[col].str.upper() == crypto_ticker].copy() + if len(matches) > 0: + ticker_crypto = matches + print(f"[INFO] Found {len(matches)} rows matching {crypto_ticker} in column '{col}'") + break + + # Approach 2: Try crypto name match + if ticker_crypto is None or len(ticker_crypto) == 0: + for col in ['name', 'coin_name']: + if col in crypto_df.columns: + matches = crypto_df[crypto_df[col].str.lower() == crypto_name.lower()].copy() + if len(matches) > 0: + ticker_crypto = matches + print(f"[INFO] Found {len(matches)} rows matching {crypto_name} in column '{col}'") + break + + # Approach 3: Try partial matching (in case of different formats) + if ticker_crypto is None or len(ticker_crypto) == 0: + for col in symbol_columns: + if crypto_df[col].dtype == 'object': + # Try case-insensitive contains match + matches = crypto_df[crypto_df[col].str.contains(crypto_ticker, case=False, na=False)].copy() + if len(matches) > 0: + ticker_crypto = matches + print(f"[INFO] Found {len(matches)} rows with partial match for {crypto_ticker} in 
column '{col}'") + break + + # Try crypto name partial match + matches = crypto_df[crypto_df[col].str.contains(crypto_name, case=False, na=False)].copy() + if len(matches) > 0: + ticker_crypto = matches + print(f"[INFO] Found {len(matches)} rows with partial match for {crypto_name} in column '{col}'") + break + + if ticker_crypto is None or len(ticker_crypto) == 0: + print(f"[WARN] No feature data found for crypto {crypto_ticker} ({crypto_name}) - skipping this crypto") + continue + + # Merge sentiment INTO features + merged_ticker = self.merge_sentiment_to_features(ticker_crypto, sentiment_df) + + # Remove symbol_x and replace symbol_y with symbol + if 'symbol_x' in merged_ticker.columns: + merged_ticker = merged_ticker.drop(columns=['symbol_x']) + if 'symbol_y' in merged_ticker.columns: + merged_ticker = merged_ticker.rename(columns={'symbol_y': 'symbol'}) + + # Remove duplicate 'symbol' columns if any + symbol_cols = [col for col in merged_ticker.columns if col == 'symbol'] + if len(symbol_cols) > 1: + # Keep only the first 'symbol' column + # This will drop all but the first occurrence + merged_ticker = merged_ticker.loc[:, ~merged_ticker.columns.duplicated()] + + # Re-order columns: symbol first, interval_timestamp second (if present) + cols = list(merged_ticker.columns) + if 'symbol' in cols: + cols.remove('symbol') + new_order = ['symbol'] + if 'interval_timestamp' in cols: + cols.remove('interval_timestamp') + new_order.append('interval_timestamp') + new_order += cols + merged_ticker = merged_ticker[new_order] + merged_crypto_list.append(merged_ticker) + + if not merged_crypto_list: + print("[WARN] No crypto data was successfully merged with sentiment") + print("[WARN] Saving original crypto features without sentiment") + output_file = self.output_path / "crypto_features.parquet" + crypto_df.to_parquet(output_file) + print(f"[INFO] Crypto features saved without sentiment to: {output_file}") + return crypto_df + + # Combine all merged crypto data + final_crypto_df = pd.concat(merged_crypto_list, ignore_index=True) + + # Save the result + output_file = self.output_path / "crypto_features.parquet" + final_crypto_df.to_parquet(output_file) + print(f"[INFO] Crypto data with sentiment saved to: {output_file}") + + return final_crypto_df + + def process_all_data(self): + """Process both stocks and crypto data""" + #logging.info("Starting data processing for all assets...") + + stocks_result = self.process_stocks_data() + crypto_result = self.process_crypto_data() + + if stocks_result is not None: + print(f"[OK] Stocks processing completed: {len(stocks_result)} rows") + else: + print("[ERROR] Stocks processing failed") + + if crypto_result is not None: + print(f"[OK] Crypto processing completed: {len(crypto_result)} rows") + else: + print("[ERROR] Crypto processing failed") + + return stocks_result, crypto_result + +# Example usage +if __name__ == "__main__": + handler = FixedTimestampHandler() + + # Test individual components + #logging.info("Testing sentiment data loading...") + sentiment_df = handler.load_sentiment_data("AAPL") + + stocks_df = handler.load_features_data('stocks') + + # Test merge process + # handler.test_merge() + + # Process all data + handler.process_all_data() \ No newline at end of file diff --git a/src/merge/merge_7.py b/src/merge/merge_7.py new file mode 100644 index 0000000000000000000000000000000000000000..0f41d0e8bc3a3cc9e67e40018217dc083d4bcff4 --- /dev/null +++ b/src/merge/merge_7.py @@ -0,0 +1,28 @@ +import importlib.util +import os + +def 
run_module(module_path, module_name): + spec = importlib.util.spec_from_file_location(module_name, module_path) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + if hasattr(mod, 'main'): + mod.main() + else: + print(f"[WARN] {module_name} has no main() function.") + +def main(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + finhub_dir = os.path.join(this_dir, 'finhub') + modules = [ + ('company_info.py', 'company_info'), + ('sentiment.py', 'sentiment'), + ('ratings.py', 'ratings'), + ('quote.py', 'quote'), + ] + for fname, mname in modules: + print(f"[INFO] Merging {mname.replace('_', ' ')}...") + run_module(os.path.join(finhub_dir, fname), mname) + print("[INFO] All merges complete.") + +if __name__ == "__main__": + main() diff --git a/src/merge/merge_sant.py b/src/merge/merge_sant.py new file mode 100644 index 0000000000000000000000000000000000000000..0782551feb6c2decf19598793fe68ac9636ed16b --- /dev/null +++ b/src/merge/merge_sant.py @@ -0,0 +1,909 @@ +""" +Santiment Data Merger +===================== + +This script merges all Santiment data files into a unified features dataset. +It reads all parquet files from data/santiment/, merges them by slug and datetime +with 1-hour interval tolerance, and creates merged_features.parquet. + +Features: +- Reads all Santiment parquet files automatically +- Merges by slug and datetime with 1-hour tolerance +- Handles different data formats (financial, ohlcv, prices, etc.) +- Creates comprehensive feature dataset +- Robust error handling and logging + +Author: AI Assistant +Date: August 2025 +""" + +import os +import sys +import pandas as pd +import numpy as np +from pathlib import Path +from datetime import datetime, timedelta +import logging +import glob +from typing import List, Dict, Optional, Tuple +import warnings + +# Resolve data directory base +try: + from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR + except Exception: + CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: + p = Path(path_like) + if p.is_absolute(): + return p + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return Path(CFG_DATA_DIR) / rel + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +class SantimentDataMerger: + """ + Comprehensive Santiment Data Merger + + Merges all Santiment parquet files into a unified features dataset + with proper handling of different data formats and time alignment. 
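+
+    Typical usage (a condensed version of main() at the bottom of this module):
+
+        merger = SantimentDataMerger(source_dir="data/santiment",
+                                     output_dir="data/santiment",
+                                     time_tolerance_hours=1)
+        if merger.process_all_files():
+            merger.save_merged_features("merged_features.parquet")
+            merger.print_summary()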
+ """ + + def __init__(self, + source_dir: str = "data/santiment", + output_dir: str = "data/santiment", + time_tolerance_hours: int = 1): + """ + Initialize the Santiment Data Merger + + Args: + source_dir: Directory containing Santiment parquet files + output_dir: Directory to save merged features + time_tolerance_hours: Tolerance for datetime matching (hours) + """ + # Resolve under DATA_DIR for portability + self.source_dir = _resolve_under_data(source_dir) + self.output_dir = _resolve_under_data(output_dir) + self.time_tolerance = timedelta(hours=time_tolerance_hours) + + # Ensure directories exist + self.source_dir.mkdir(parents=True, exist_ok=True) + self.output_dir.mkdir(parents=True, exist_ok=True) + + # Storage for processed data + self.dataframes: Dict[str, pd.DataFrame] = {} + self.merged_data: Optional[pd.DataFrame] = None + self.processing_stats = { + 'files_found': 0, + 'files_processed': 0, + 'files_failed': 0, + 'total_records': 0, + 'unique_slugs': set(), + 'date_range': {}, + 'categories': set() + } + + # Track placeholder mode (no input files) + self.placeholder_created = False + + # Initialize symbol normalizer + self.symbol_normalizer = self._setup_symbol_normalizer() + + def _setup_symbol_normalizer(self): + """ + Set up symbol normalization mapping for consistent asset identification + + Returns: + Dictionary mapping various symbol formats to canonical slugs + """ + # Canonical mapping for major crypto assets + # Maps various symbols/names to the official uppercase symbols + symbol_mapping = { + # Bitcoin variants + 'bitcoin': 'BTC', + 'btc': 'BTC', + 'Bitcoin': 'BTC', + 'BTC': 'BTC', + + # Ethereum variants + 'ethereum': 'ETH', + 'eth': 'ETH', + 'Ethereum': 'ETH', + 'ETH': 'ETH', + + # Ripple/XRP variants + 'ripple': 'XRP', + 'xrp': 'XRP', + 'Ripple': 'XRP', + 'XRP': 'XRP', + + # Solana variants + 'solana': 'SOL', + 'sol': 'SOL', + 'Solana': 'SOL', + 'SOL': 'SOL', + + # Cardano variants + 'cardano': 'ADA', + 'ada': 'ADA', + 'Cardano': 'ADA', + 'ADA': 'ADA', + + # Polkadot variants + 'polkadot': 'DOT', + 'dot': 'DOT', + 'Polkadot': 'DOT', + 'DOT': 'DOT', + + # Chainlink variants + 'chainlink': 'LINK', + 'link': 'LINK', + 'Chainlink': 'LINK', + 'LINK': 'LINK', + + # Litecoin variants + 'litecoin': 'LTC', + 'ltc': 'LTC', + 'Litecoin': 'LTC', + 'LTC': 'LTC', + + # Bitcoin Cash variants + 'bitcoin-cash': 'BCH', + 'bch': 'BCH', + 'Bitcoin Cash': 'BCH', + 'BCH': 'BCH', + + # Stellar variants + 'stellar': 'XLM', + 'xlm': 'XLM', + 'Stellar': 'XLM', + 'XLM': 'XLM', + + # Ethereum Classic variants + 'ethereum-classic': 'ETC', + 'etc': 'ETC', + 'Ethereum Classic': 'ETC', + 'ETC': 'ETC', + + # EOS variants + 'eos': 'EOS', + 'EOS': 'EOS', + } + + logger.info(f"Initialized symbol normalizer with {len(symbol_mapping)} mappings") + return symbol_mapping + + def normalize_symbol(self, symbol: str) -> str: + """ + Normalize a symbol to its canonical uppercase format + + Args: + symbol: Symbol to normalize + + Returns: + Canonical uppercase symbol (e.g., BTC, ETH, SOL) + """ + if symbol in self.symbol_normalizer: + canonical = self.symbol_normalizer[symbol] + if symbol != canonical: + logger.debug(f"Normalized '{symbol}' -> '{canonical}'") + return canonical + + # If not found in mapping, return uppercase version and log warning + logger.warning(f"Unknown symbol '{symbol}' not found in normalization mapping, using uppercase") + return symbol.upper() + + def find_parquet_files(self) -> List[Path]: + """ + Find all parquet files in the source directory + + Returns: + List of parquet 
file paths + """ + parquet_files = list(self.source_dir.glob("*.parquet")) + + # Filter out non-Santiment files and already merged files + santiment_files = [] + for file_path in parquet_files: + filename = file_path.name.lower() + # Include Santiment files but exclude already merged ones + if ('santiment_' in filename or 'ohlcv' in filename or 'prices' in filename) and 'merged' not in filename: + santiment_files.append(file_path) + + self.processing_stats['files_found'] = len(santiment_files) + logger.info(f"Found {len(santiment_files)} Santiment parquet files") + + return santiment_files + + def parse_filename(self, file_path: Path) -> Dict[str, str]: + """ + Parse filename to extract metadata + + Args: + file_path: Path to the parquet file + + Returns: + Dictionary with parsed metadata + """ + filename = file_path.stem + parts = filename.split('_') + + metadata = { + 'source': 'santiment', + 'category': 'unknown', + 'metric': 'unknown', + 'asset': 'unknown', + 'timestamp': 'unknown' + } + + try: + if filename.startswith('santiment_'): + # Format: santiment_category_metric_timestamp + if len(parts) >= 4: + metadata['category'] = parts[1] + metadata['metric'] = parts[2] + metadata['timestamp'] = '_'.join(parts[3:]) + elif 'ohlcv' in filename: + # Format: santiment_ohlcv_asset_timestamp + if len(parts) >= 4: + metadata['category'] = 'ohlcv' + metadata['metric'] = 'ohlcv' + metadata['asset'] = parts[2] + metadata['timestamp'] = '_'.join(parts[3:]) + elif 'prices' in filename: + # Format: santiment_prices_asset_timestamp + if len(parts) >= 4: + metadata['category'] = 'prices' + metadata['metric'] = 'prices_detailed' + metadata['asset'] = parts[2] + metadata['timestamp'] = '_'.join(parts[3:]) + + except Exception as e: + logger.warning(f"Failed to parse filename {filename}: {e}") + + return metadata + + def load_and_standardize_dataframe(self, file_path: Path) -> Optional[pd.DataFrame]: + """ + Load and standardize a parquet file + + Args: + file_path: Path to the parquet file + + Returns: + Standardized DataFrame or None if failed + """ + try: + df = pd.read_parquet(file_path) + + if df.empty: + logger.warning(f"Empty dataframe: {file_path.name}") + return None + + # Parse filename for metadata + metadata = self.parse_filename(file_path) + + # Standardize datetime index + if 'datetime' in df.columns: + df['datetime'] = pd.to_datetime(df['datetime']) + df.set_index('datetime', inplace=True) + elif df.index.name == 'datetime' or pd.api.types.is_datetime64_any_dtype(df.index): + df.index = pd.to_datetime(df.index) + df.index.name = 'datetime' + else: + # Try to find a datetime column + datetime_cols = [col for col in df.columns if 'date' in col.lower() or 'time' in col.lower()] + if datetime_cols: + df[datetime_cols[0]] = pd.to_datetime(df[datetime_cols[0]]) + df.set_index(datetime_cols[0], inplace=True) + df.index.name = 'datetime' + else: + logger.warning(f"No datetime column found in {file_path.name}") + return None + + # Ensure slug column exists + if 'slug' not in df.columns: + if metadata['asset'] != 'unknown': + # Normalize the asset symbol before assigning + normalized_asset = self.normalize_symbol(metadata['asset']) + df['slug'] = normalized_asset + if metadata['asset'] != normalized_asset: + logger.info(f"Normalized asset '{metadata['asset']}' -> '{normalized_asset}' in {file_path.name}") + else: + logger.warning(f"No slug information found in {file_path.name}") + return None + else: + # Normalize existing slug column + df['slug'] = df['slug'].apply(self.normalize_symbol) + 
logger.debug(f"Normalized existing slug column in {file_path.name}") + + # Add metadata columns + df['source_file'] = file_path.name + df['category'] = metadata['category'] + + # Rename columns to avoid conflicts and add prefixes + value_columns = [col for col in df.columns if col not in ['slug', 'metric', 'source_file', 'category']] + + # Add category prefix to value columns + category = metadata['category'] + metric = metadata['metric'] + + column_mapping = {} + for col in value_columns: + if col in ['slug', 'source_file', 'category']: + continue + + # Create meaningful column name + if col == 'value': + new_col = f"{category}_{metric}" + elif col in ['open', 'high', 'low', 'close', 'volume']: + new_col = f"{category}_{col}" + else: + new_col = f"{category}_{col}" + + column_mapping[col] = new_col + + df.rename(columns=column_mapping, inplace=True) + + # Update stats + self.processing_stats['unique_slugs'].update(df['slug'].unique()) + self.processing_stats['categories'].add(category) + + logger.info(f"Loaded {file_path.name}: {len(df)} records, {len(df.columns)} columns") + + return df + + except Exception as e: + logger.error(f"Failed to load {file_path.name}: {e}") + return None + + def merge_dataframes_by_slug_datetime(self, dataframes: List[pd.DataFrame]) -> pd.DataFrame: + """ + Merge multiple dataframes by slug and datetime with tolerance + + Args: + dataframes: List of DataFrames to merge + + Returns: + Merged DataFrame + """ + if not dataframes: + return pd.DataFrame() + + logger.info(f"Merging {len(dataframes)} dataframes...") + + # Start with the first dataframe + merged = dataframes[0].copy() + logger.info(f"Starting with base dataframe: {len(merged)} records") + + # Merge each subsequent dataframe + for i, df in enumerate(dataframes[1:], 1): + logger.info(f"Merging dataframe {i+1}/{len(dataframes)}: {len(df)} records") + + try: + # Merge on slug and datetime index with tolerance + merged = self._merge_with_time_tolerance(merged, df) + logger.info(f"After merge {i}: {len(merged)} records") + + except Exception as e: + logger.error(f"Failed to merge dataframe {i+1}: {e}") + continue + + return merged + + def _merge_with_time_tolerance(self, left_df: pd.DataFrame, right_df: pd.DataFrame) -> pd.DataFrame: + """ + Merge two dataframes with time tolerance + + Args: + left_df: Left DataFrame + right_df: Right DataFrame + + Returns: + Merged DataFrame + """ + # Reset index to make datetime a column for merging + left_reset = left_df.reset_index() + right_reset = right_df.reset_index() + + # Perform merge on slug first + common_slugs = set(left_reset['slug'].unique()) & set(right_reset['slug'].unique()) + + if not common_slugs: + # No common slugs, concatenate vertically + logger.warning("No common slugs found, concatenating dataframes") + combined = pd.concat([left_df, right_df], axis=0, sort=False) + return combined.sort_index() + + merged_parts = [] + + for slug in common_slugs: + left_slug = left_reset[left_reset['slug'] == slug].copy() + right_slug = right_reset[right_reset['slug'] == slug].copy() + + if left_slug.empty or right_slug.empty: + continue + + # Sort by datetime + left_slug = left_slug.sort_values('datetime') + right_slug = right_slug.sort_values('datetime') + + # Merge with time tolerance using pandas merge_asof + try: + merged_slug = pd.merge_asof( + left_slug, + right_slug, + on='datetime', + by='slug', + tolerance=self.time_tolerance, + direction='nearest', + suffixes=('', '_right') + ) + + # Remove duplicate columns + duplicate_cols = [col for col in 
merged_slug.columns if col.endswith('_right')] + for col in duplicate_cols: + base_col = col.replace('_right', '') + if base_col in merged_slug.columns: + # Keep non-null values, preferring left side + merged_slug[base_col] = merged_slug[base_col].fillna(merged_slug[col]) + else: + # Rename the right column + merged_slug[base_col] = merged_slug[col] + merged_slug.drop(columns=[col], inplace=True) + + merged_parts.append(merged_slug) + + except Exception as e: + logger.warning(f"Failed to merge slug {slug}: {e}") + # Fallback: simple concatenation for this slug + slug_combined = pd.concat([left_slug, right_slug], axis=0, sort=False) + merged_parts.append(slug_combined) + + # Handle slugs that exist in only one dataframe + left_only_slugs = set(left_reset['slug'].unique()) - common_slugs + right_only_slugs = set(right_reset['slug'].unique()) - common_slugs + + for slug in left_only_slugs: + merged_parts.append(left_reset[left_reset['slug'] == slug]) + + for slug in right_only_slugs: + merged_parts.append(right_reset[right_reset['slug'] == slug]) + + # Combine all parts + if merged_parts: + final_merged = pd.concat(merged_parts, axis=0, sort=False, ignore_index=True) + # Set datetime as index + final_merged.set_index('datetime', inplace=True) + return final_merged.sort_index() + else: + return left_df + + def fill_missing_values(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Comprehensive null filling strategy for the merged dataset + + Args: + df: DataFrame with potential null values + + Returns: + DataFrame with filled null values + """ + logger.info("Applying comprehensive null filling strategy...") + + filled_df = df.copy() + null_counts_before = filled_df.isnull().sum().sum() + + # Strategy 1: Forward fill within each asset (time-based continuity) + logger.info("Step 1: Forward filling within each asset...") + for slug in filled_df['slug'].unique(): + slug_mask = filled_df['slug'] == slug + filled_df.loc[slug_mask] = filled_df.loc[slug_mask].ffill() + + # Strategy 2: Backward fill within each asset (fill initial nulls) + logger.info("Step 2: Backward filling within each asset...") + for slug in filled_df['slug'].unique(): + slug_mask = filled_df['slug'] == slug + filled_df.loc[slug_mask] = filled_df.loc[slug_mask].bfill() + + # Strategy 3: Fill specific column types with appropriate defaults + logger.info("Step 3: Filling remaining nulls with type-specific defaults...") + + for col in filled_df.columns: + if filled_df[col].isnull().any(): + # Price and financial metrics: use median of the column + if any(keyword in col.lower() for keyword in ['price', 'usd', 'btc', 'eth', 'marketcap', 'volume']): + median_val = filled_df[col].median() + filled_df[col] = filled_df[col].fillna(median_val) + logger.debug(f"Filled {col} nulls with median: {median_val}") + + # Address and network metrics: use 0 (no activity) + elif any(keyword in col.lower() for keyword in ['address', 'network', 'active', 'transaction']): + filled_df[col] = filled_df[col].fillna(0) + logger.debug(f"Filled {col} nulls with 0") + + # Exchange metrics: use 0 (no flow) + elif any(keyword in col.lower() for keyword in ['exchange', 'inflow', 'outflow', 'balance']): + filled_df[col] = filled_df[col].fillna(0) + logger.debug(f"Filled {col} nulls with 0") + + # Supply metrics: forward fill or use mean + elif any(keyword in col.lower() for keyword in ['supply', 'circulation', 'velocity']): + mean_val = filled_df[col].mean() + filled_df[col] = filled_df[col].fillna(mean_val) + logger.debug(f"Filled {col} nulls with mean: {mean_val}") 
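+                # Note: these keyword groups are checked in order, so a column that matches an
+                # earlier group (e.g. anything containing 'volume', including social volume
+                # columns) is filled by that earlier rule and never reaches the later branches.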
+ + # Development metrics: use 0 (no activity) + elif any(keyword in col.lower() for keyword in ['dev', 'github', 'contributors']): + filled_df[col] = filled_df[col].fillna(0) + logger.debug(f"Filled {col} nulls with 0") + + # Social metrics: use 0 (no mentions) + elif any(keyword in col.lower() for keyword in ['social', 'sentiment', 'volume_4chan', 'volume_reddit']): + filled_df[col] = filled_df[col].fillna(0) + logger.debug(f"Filled {col} nulls with 0") + + # OHLCV metrics: use forward fill or interpolation + elif any(keyword in col.lower() for keyword in ['open', 'high', 'low', 'close', 'ohlcv']): + filled_df[col] = filled_df[col].ffill().bfill() + logger.debug(f"Filled {col} nulls with forward/backward fill") + + # Derivatives and whale metrics: use 0 + elif any(keyword in col.lower() for keyword in ['funding', 'interest', 'whale', 'holders']): + filled_df[col] = filled_df[col].fillna(0) + logger.debug(f"Filled {col} nulls with 0") + + # String columns: use 'unknown' or most frequent value + elif filled_df[col].dtype == 'object': + if col in ['slug', 'category', 'source_file', 'metric', 'development_alternative_slug_used']: + # Skip these columns as they will be removed or are handled separately + continue + else: + mode_val = filled_df[col].mode() + if len(mode_val) > 0: + filled_df[col] = filled_df[col].fillna(mode_val[0]) + else: + filled_df[col] = filled_df[col].fillna('unknown') + logger.debug(f"Filled {col} nulls with mode/unknown") + + # Any remaining numeric nulls: use median + elif pd.api.types.is_numeric_dtype(filled_df[col]): + median_val = filled_df[col].median() + if pd.notna(median_val): + filled_df[col] = filled_df[col].fillna(median_val) + logger.debug(f"Filled {col} nulls with median: {median_val}") + else: + filled_df[col] = filled_df[col].fillna(0) + logger.debug(f"Filled {col} nulls with 0 (median was NaN)") + + null_counts_after = filled_df.isnull().sum().sum() + nulls_filled = null_counts_before - null_counts_after + + logger.info(f"Null filling completed:") + logger.info(f" Nulls before: {null_counts_before:,}") + logger.info(f" Nulls after: {null_counts_after:,}") + logger.info(f" Nulls filled: {nulls_filled:,}") + + return filled_df + + def process_all_files(self) -> bool: + """ + Process all Santiment parquet files + + Returns: + True if successful, False otherwise + """ + try: + # Find all parquet files + parquet_files = self.find_parquet_files() + + if not parquet_files: + logger.warning("No Santiment parquet files found") + # Graceful fallback: create minimal placeholder merged file to unblock pipeline + try: + # Create an explicitly typed empty DF with expected columns + placeholder = pd.DataFrame({'slug': pd.Series(dtype='object')}) + # Set an empty datetime index (naive) with the expected name + placeholder.index = pd.DatetimeIndex([], name='datetime') + # Ensure output directory exists + self.output_dir.mkdir(parents=True, exist_ok=True) + out_path = self.output_dir / "merged_features.parquet" + # Save directly, bypassing save_merged_features constraints + placeholder.to_parquet(out_path, index=True) + # Mark placeholder state and keep merged_data None + self.placeholder_created = True + logger.info(f"Created placeholder Santiment merged_features.parquet with 0 rows at {out_path}") + return True + except Exception as e: + logger.error(f"Failed to create placeholder Santiment file: {e}") + return False + + # Load and standardize all dataframes + dataframes = [] + + for file_path in parquet_files: + try: + df = 
self.load_and_standardize_dataframe(file_path) + if df is not None: + dataframes.append(df) + self.processing_stats['files_processed'] += 1 + self.processing_stats['total_records'] += len(df) + else: + self.processing_stats['files_failed'] += 1 + + except Exception as e: + logger.error(f"Failed to process {file_path.name}: {e}") + self.processing_stats['files_failed'] += 1 + + if not dataframes: + logger.error("No dataframes were successfully loaded") + return False + + # Merge all dataframes + logger.info("Starting merge process...") + self.merged_data = self.merge_dataframes_by_slug_datetime(dataframes) + + if self.merged_data.empty: + logger.error("Merged dataframe is empty") + return False + + # Update final stats + self.processing_stats['date_range'] = { + 'start': str(self.merged_data.index.min()), + 'end': str(self.merged_data.index.max()), + 'total_days': (self.merged_data.index.max() - self.merged_data.index.min()).days + } + + logger.info("All files processed successfully") + return True + + except Exception as e: + logger.error(f"Failed to process files: {e}") + return False + + def save_merged_features(self, filename: str = "merged_features.parquet") -> bool: + """ + Save the merged features to a parquet file with comprehensive null filling + + Args: + filename: Output filename + + Returns: + True if successful, False otherwise + """ + if self.merged_data is None or self.merged_data.empty: + logger.error("No merged data to save") + return False + + try: + output_path = self.output_dir / filename + + # Clean up the dataframe before saving + cleaned_df = self.merged_data.copy() + + # Remove any completely null columns + null_columns = cleaned_df.columns[cleaned_df.isnull().all()].tolist() + if null_columns: + logger.info(f"Removing {len(null_columns)} completely null columns: {null_columns}") + cleaned_df = cleaned_df.dropna(axis=1, how='all') + + # Apply comprehensive null filling strategy + logger.info("Applying comprehensive null filling...") + cleaned_df = self.fill_missing_values(cleaned_df) + + # Remove unwanted columns + columns_to_remove = ['metric', 'source_file', 'category', 'development_alternative_slug_used'] + existing_cols_to_remove = [col for col in columns_to_remove if col in cleaned_df.columns] + if existing_cols_to_remove: + logger.info(f"Removing unwanted columns: {existing_cols_to_remove}") + cleaned_df = cleaned_df.drop(columns=existing_cols_to_remove) + + # Ensure all slugs are in uppercase format + logger.info("Ensuring all slugs are in uppercase format...") + cleaned_df['slug'] = cleaned_df['slug'].apply(lambda x: x.upper() if isinstance(x, str) else x) + + # Fix data type issues for parquet compatibility + logger.info("Fixing data types for parquet compatibility...") + for col in cleaned_df.columns: + if cleaned_df[col].dtype == 'object': + # Check if column contains mixed types + sample_values = cleaned_df[col].dropna().head(100) + if len(sample_values) > 0: + # If it looks like it should be numeric, convert it + try: + pd.to_numeric(sample_values, errors='raise') + # If no error, convert the entire column + cleaned_df[col] = pd.to_numeric(cleaned_df[col], errors='coerce') + logger.debug(f"Converted {col} to numeric") + except (ValueError, TypeError): + # If conversion fails, ensure it's all strings + cleaned_df[col] = cleaned_df[col].astype(str) + logger.debug(f"Converted {col} to string") + + # Sort by datetime and slug + cleaned_df = cleaned_df.sort_index() + cleaned_df = cleaned_df.sort_values(['slug'], kind='mergesort') + + # Final data quality 
check + remaining_nulls = cleaned_df.isnull().sum().sum() + if remaining_nulls > 0: + logger.warning(f"Warning: {remaining_nulls} null values remain after filling") + # Log columns with remaining nulls + null_cols = cleaned_df.columns[cleaned_df.isnull().any()].tolist() + logger.warning(f"Columns with remaining nulls: {null_cols}") + else: + logger.info("✓ All null values successfully filled") + + # Save to parquet with error handling + try: + cleaned_df.to_parquet(output_path, compression='snappy') + except Exception as parquet_error: + logger.error(f"Parquet save failed: {parquet_error}") + # Try to identify problematic columns + logger.info("Analyzing columns for parquet compatibility...") + for col in cleaned_df.columns: + try: + test_df = cleaned_df[[col]].copy() + test_df.to_parquet(output_path.with_suffix('.test.parquet')) + output_path.with_suffix('.test.parquet').unlink() # Clean up test file + except Exception as col_error: + logger.error(f"Column {col} causing issues: {col_error}") + # Force convert problematic column to string + cleaned_df[col] = cleaned_df[col].astype(str) + logger.info(f"Converted problematic column {col} to string") + + # Try saving again + cleaned_df.to_parquet(output_path, compression='snappy') + + logger.info(f"Merged features saved to {output_path}") + logger.info(f"Final dataset: {len(cleaned_df)} records, {len(cleaned_df.columns)} columns") + logger.info(f"Data completeness: {100 - (remaining_nulls / (len(cleaned_df) * len(cleaned_df.columns)) * 100):.2f}%") + + return True + + except Exception as e: + logger.error(f"Failed to save merged features: {e}") + return False + + def generate_summary_report(self) -> Dict: + """ + Generate a comprehensive summary report + + Returns: + Summary dictionary + """ + summary = { + 'processing_timestamp': datetime.now().isoformat(), + 'files_statistics': { + 'files_found': self.processing_stats['files_found'], + 'files_processed': self.processing_stats['files_processed'], + 'files_failed': self.processing_stats['files_failed'], + 'success_rate': f"{(self.processing_stats['files_processed'] / max(1, self.processing_stats['files_found'])) * 100:.1f}%" + }, + 'data_statistics': { + 'total_records': self.processing_stats['total_records'], + 'unique_slugs': list(self.processing_stats['unique_slugs']), + 'categories_found': list(self.processing_stats['categories']), + 'date_range': self.processing_stats['date_range'] + } + } + + if self.merged_data is not None: + summary['merged_statistics'] = { + 'final_records': len(self.merged_data), + 'final_columns': len(self.merged_data.columns), + 'memory_usage_mb': f"{self.merged_data.memory_usage(deep=True).sum() / 1024 / 1024:.2f}", + 'slug_distribution': self.merged_data['slug'].value_counts().to_dict(), + 'null_percentage': f"{(self.merged_data.isnull().sum().sum() / (len(self.merged_data) * len(self.merged_data.columns))) * 100:.2f}%" + } + + return summary + + def print_summary(self): + """Print a comprehensive summary of the merge process""" + summary = self.generate_summary_report() + + print("\n" + "="*60) + print("SANTIMENT DATA MERGER SUMMARY") + print("="*60) + + # File statistics + print(f"\nFile Processing:") + print(f" Files found: {summary['files_statistics']['files_found']}") + print(f" Files processed: {summary['files_statistics']['files_processed']}") + print(f" Files failed: {summary['files_statistics']['files_failed']}") + print(f" Success rate: {summary['files_statistics']['success_rate']}") + + # Data statistics + print(f"\nData Overview:") + print(f" Total 
records processed: {summary['data_statistics']['total_records']:,}") + print(f" Unique assets (slugs): {len(summary['data_statistics']['unique_slugs'])}") + print(f" Categories found: {', '.join(summary['data_statistics']['categories_found'])}") + + if summary['data_statistics']['date_range']: + print(f" Date range: {summary['data_statistics']['date_range']['start']} to {summary['data_statistics']['date_range']['end']}") + print(f" Total days: {summary['data_statistics']['date_range']['total_days']}") + + # Merged statistics + if 'merged_statistics' in summary: + print(f"\nMerged Dataset:") + print(f" Final records: {summary['merged_statistics']['final_records']:,}") + print(f" Final columns: {summary['merged_statistics']['final_columns']}") + print(f" Memory usage: {summary['merged_statistics']['memory_usage_mb']} MB") + print(f" Data completeness: {100 - float(summary['merged_statistics']['null_percentage'].rstrip('%')):.1f}%") + + # Show top assets by record count + print(f"\nTop Assets by Record Count:") + slug_dist = summary['merged_statistics']['slug_distribution'] + for slug, count in list(slug_dist.items())[:5]: + print(f" {slug}: {count:,} records") + + print("="*60) + + +def main(): + """Main function to run the Santiment data merger""" + logger.info("Starting Santiment Data Merger...") + + # Initialize the merger + merger = SantimentDataMerger( + source_dir="data/santiment", + output_dir="data/santiment", + time_tolerance_hours=1 + ) + + try: + # Process all files + success = merger.process_all_files() + + if not success: + logger.error("Failed to process Santiment files") + return False + + # If we only created a placeholder, treat as successful and skip saving/summary + if merger.placeholder_created: + logger.info("Placeholder Santiment dataset created; skipping save and summary.") + return True + + # Save merged features + save_success = merger.save_merged_features("merged_features.parquet") + + if not save_success: + logger.error("Failed to save merged features") + return False + + # Print summary + merger.print_summary() + + # Save summary report + summary = merger.generate_summary_report() + summary_path = Path("data/santiment") / "merge_summary.json" + + import json + with open(summary_path, 'w') as f: + json.dump(summary, f, indent=2, default=str) + + logger.info(f"Summary report saved to {summary_path}") + logger.info("Santiment data merge completed successfully!") + + return True + + except Exception as e: + logger.error(f"Santiment data merge failed: {e}") + return False + + +if __name__ == "__main__": + main() diff --git a/src/merge/merge_santiment_time_shifted.py b/src/merge/merge_santiment_time_shifted.py new file mode 100644 index 0000000000000000000000000000000000000000..bc2ca0e82db3e526825ce4a42d3c382b111e485e --- /dev/null +++ b/src/merge/merge_santiment_time_shifted.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +""" +Time-Shifted Santiment-Crypto Merger +=================================== + +This script handles the case where Santiment data and crypto data have different date ranges +due to API limitations. It performs a time-shifted merge using pattern matching. + +Approaches: +1. Offset-based: Map August crypto data to July Santiment data with consistent offset +2. Day-of-week matching: Match same weekdays/times across different months +3. 
Pattern-based: Use similar market patterns from different time periods +""" + +import pandas as pd +import numpy as np +from datetime import datetime, timedelta +import os +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +def load_data(): + """Load crypto and Santiment data""" + logger.info("Loading data files...") + + # Load crypto features + crypto_file = 'data/merged/features/crypto_features.parquet' + crypto_df = pd.read_parquet(crypto_file) + crypto_df['datetime'] = pd.to_datetime(crypto_df['interval_timestamp'], unit='ms', utc=True) + + # Load Santiment features + santiment_file = 'data/santiment/merged_features.parquet' + santiment_df = pd.read_parquet(santiment_file) + + logger.info(f"Crypto: {len(crypto_df)} records from {crypto_df['datetime'].min()} to {crypto_df['datetime'].max()}") + logger.info(f"Santiment: {len(santiment_df)} records from {santiment_df.index.min()} to {santiment_df.index.max()}") + + return crypto_df, santiment_df + +def calculate_time_offset(crypto_df, santiment_df): + """Calculate the time offset between datasets""" + crypto_start = crypto_df['datetime'].min() + santiment_start = santiment_df.index.min() + + offset = crypto_start - santiment_start + logger.info(f"Time offset: {offset.days} days") + + return offset + +def merge_with_time_shift(crypto_df, santiment_df, method='offset'): + """ + Merge crypto and Santiment data using time-shift techniques + + Args: + crypto_df: Crypto features DataFrame + santiment_df: Santiment features DataFrame + method: 'offset', 'day_of_week', or 'pattern' + """ + logger.info(f"Starting time-shifted merge using method: {method}") + + merged_results = [] + symbol_mapping = {'BTC': 'BTC', 'ETH': 'ETH', 'ADA': 'ADA', 'SOL': 'SOL', 'XRP': 'XRP'} + + if method == 'offset': + # Calculate consistent time offset + offset = calculate_time_offset(crypto_df, santiment_df) + + for symbol, slug in symbol_mapping.items(): + logger.info(f"Processing {symbol} → {slug} with offset method") + + crypto_symbol = crypto_df[crypto_df['symbol'] == symbol].copy() + santiment_slug = santiment_df[santiment_df['slug'] == slug].copy() + + if crypto_symbol.empty or santiment_slug.empty: + logger.warning(f"Skipping {symbol} - missing data") + continue + + # Apply offset to match timeframes + merged_symbol = merge_with_offset(crypto_symbol, santiment_slug, offset) + merged_results.append(merged_symbol) + + elif method == 'day_of_week': + # Match same day-of-week and time patterns + for symbol, slug in symbol_mapping.items(): + logger.info(f"Processing {symbol} → {slug} with day-of-week method") + + crypto_symbol = crypto_df[crypto_df['symbol'] == symbol].copy() + santiment_slug = santiment_df[santiment_df['slug'] == slug].copy() + + if crypto_symbol.empty or santiment_slug.empty: + logger.warning(f"Skipping {symbol} - missing data") + continue + + merged_symbol = merge_by_day_pattern(crypto_symbol, santiment_slug) + merged_results.append(merged_symbol) + + # Combine results + if merged_results: + merged_df = pd.concat(merged_results, ignore_index=True) + logger.info(f"Merge completed: {len(merged_df)} records") + return merged_df + else: + logger.error("No data could be merged!") + return None + +def merge_with_offset(crypto_symbol, santiment_slug, offset): + """Merge using consistent time offset""" + merged_records = [] + + for _, crypto_row in crypto_symbol.iterrows(): + # Shift crypto timestamp back by offset to match Santiment 
timeframe + shifted_time = crypto_row['datetime'] - offset + + # Find closest Santiment record + time_diffs = np.abs(santiment_slug.index - shifted_time) + closest_idx = time_diffs.argmin() + closest_idx = santiment_slug.index[closest_idx] + + # Check if match is reasonable (within 1 hour) + if time_diffs.min() <= pd.Timedelta(hours=1): + santiment_row = santiment_slug.loc[closest_idx] + + # Combine data + combined_row = crypto_row.copy() + for col in santiment_slug.columns: + if col != 'slug': + combined_row[f'santiment_{col}'] = santiment_row[col] + + merged_records.append(combined_row) + + return pd.DataFrame(merged_records) + +def merge_by_day_pattern(crypto_symbol, santiment_slug): + """Merge by matching day-of-week and time patterns""" + merged_records = [] + + for _, crypto_row in crypto_symbol.iterrows(): + crypto_time = crypto_row['datetime'] + + # Find Santiment records with same day-of-week and similar time + santiment_same_weekday = santiment_slug[ + santiment_slug.index.dayofweek == crypto_time.dayofweek + ] + + if not santiment_same_weekday.empty: + # Find closest time-of-day match + crypto_time_of_day = crypto_time.time() + + time_diffs = santiment_same_weekday.index.map( + lambda x: abs((x.time().hour * 60 + x.time().minute) - + (crypto_time_of_day.hour * 60 + crypto_time_of_day.minute)) + ) + + closest_idx = time_diffs.argmin() + closest_idx = santiment_same_weekday.index[closest_idx] + santiment_row = santiment_same_weekday.loc[closest_idx] + + # Combine data + combined_row = crypto_row.copy() + for col in santiment_slug.columns: + if col != 'slug': + combined_row[f'santiment_{col}'] = santiment_row[col] + + merged_records.append(combined_row) + + return pd.DataFrame(merged_records) + +def analyze_merge_quality(merged_df, method): + """Analyze merge quality and provide statistics""" + if merged_df is None or merged_df.empty: + return {"error": "No merged data"} + + santiment_cols = [col for col in merged_df.columns if col.startswith('santiment_')] + + analysis = { + 'method_used': method, + 'total_records': len(merged_df), + 'santiment_features_added': len(santiment_cols), + 'symbols_processed': sorted(merged_df['symbol'].unique()), + 'completeness_by_symbol': {} + } + + # Calculate completeness by symbol + for symbol in analysis['symbols_processed']: + symbol_data = merged_df[merged_df['symbol'] == symbol] + non_null_counts = symbol_data[santiment_cols].notna().sum(axis=1) + records_with_santiment = (non_null_counts > 0).sum() + + analysis['completeness_by_symbol'][symbol] = { + 'total_records': len(symbol_data), + 'records_with_santiment': records_with_santiment, + 'completeness_pct': records_with_santiment / len(symbol_data) * 100 + } + + return analysis + +def save_results(merged_df, analysis, method): + """Save merged results with method identifier""" + if merged_df is None: + logger.error("Cannot save - no merged data") + return None, None + + logger.info("Saving time-shifted merge results...") + + # Create output directory + output_dir = 'data/merged/features' + os.makedirs(output_dir, exist_ok=True) + + # Save with method identifier + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_file = os.path.join(output_dir, f'crypto_with_santiment_{method}_{timestamp}.parquet') + + merged_df.to_parquet(output_file, index=False) + logger.info(f"Merged features saved to: {output_file}") + + # Save analysis + analysis_file = os.path.join(output_dir, f'santiment_merge_analysis_{method}_{timestamp}.json') + import json + with open(analysis_file, 'w') as f: + 
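+        # default=str lets json serialize values the analysis dict may contain that the json
+        # module cannot handle natively (e.g. numpy integers or pandas timestamps)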
json.dump(analysis, f, indent=2, default=str) + + logger.info(f"Analysis saved to: {analysis_file}") + + return output_file, analysis_file + +def main(): + """Main time-shifted merge process""" + logger.info("Starting time-shifted Santiment-Crypto merge...") + + try: + # Load data + crypto_df, santiment_df = load_data() + + # Try different merge methods + methods = ['offset', 'day_of_week'] + results = {} + + for method in methods: + logger.info(f"\n{'='*50}") + logger.info(f"TRYING METHOD: {method.upper()}") + logger.info(f"{'='*50}") + + merged_df = merge_with_time_shift(crypto_df, santiment_df, method=method) + analysis = analyze_merge_quality(merged_df, method) + + if merged_df is not None: + output_file, analysis_file = save_results(merged_df, analysis, method) + results[method] = { + 'success': True, + 'records': len(merged_df), + 'completeness': analysis.get('completeness_by_symbol', {}), + 'output_file': output_file + } + else: + results[method] = {'success': False} + + # Print summary + print("\n" + "="*60) + print("TIME-SHIFTED MERGE SUMMARY") + print("="*60) + + for method, result in results.items(): + print(f"\n{method.upper()} METHOD:") + if result['success']: + print(f" ✅ Success: {result['records']} records merged") + print(f" 📁 File: {result['output_file']}") + for symbol, stats in result['completeness'].items(): + print(f" {symbol}: {stats['completeness_pct']:.1f}% complete") + else: + print(f" ❌ Failed") + + print("="*60) + + except Exception as e: + logger.error(f"Time-shifted merge failed: {e}") + raise + +if __name__ == "__main__": + main() diff --git a/src/merge/merge_santiment_to_crypto.py b/src/merge/merge_santiment_to_crypto.py new file mode 100644 index 0000000000000000000000000000000000000000..58b896be3a68ae3b8f6d93478465f69ad8b82cd6 --- /dev/null +++ b/src/merge/merge_santiment_to_crypto.py @@ -0,0 +1,422 @@ +#!/usr/bin/env python3 +""" +Merge Santiment Features with Crypto Features +============================================ + +This script merges Santiment data with existing crypto features by matching: +- symbol (crypto) = slug (santiment) +- interval_timestamp (crypto) = datetime (santiment) with ±1 hour tolerance + +The result includes all original crypto features plus all Santiment features. 
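+
+Run it as a script (see main() below): it reads data/merged/features/crypto_features.parquet
+and data/santiment/merged_features.parquet (both resolved under the configured DATA_DIR), and
+writes a timestamped crypto_with_santiment_features_<YYYYMMDD_HHMMSS>.parquet plus a JSON
+merge-quality report under data/merged/features/.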
+""" + +import pandas as pd +import numpy as np +from datetime import datetime, timedelta +import os +from pathlib import Path +import logging + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# Resolve data directory base +try: + from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: + try: + from config import DATA_DIR as CFG_DATA_DIR + except Exception: + CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: + p = Path(path_like) + if p.is_absolute(): + return p + parts = p.parts + if parts and parts[0].lower() == "data": + rel = Path(*parts[1:]) if len(parts) > 1 else Path() + else: + rel = p + return Path(CFG_DATA_DIR) / rel + +def convert_timestamp_to_datetime(timestamp_ms): + """ + Convert millisecond timestamp to datetime + + Args: + timestamp_ms: Timestamp in milliseconds + + Returns: + Datetime object + """ + return pd.to_datetime(timestamp_ms, unit='ms', utc=True) + +def normalize_symbol_mapping(): + """ + Create symbol mapping between crypto symbols and Santiment slugs + + Returns: + Dictionary mapping crypto symbols to Santiment slugs + """ + # Both crypto and Santiment use the same symbol names + return { + 'BTC': 'BTC', + 'ETH': 'ETH', + 'ADA': 'ADA', + 'SOL': 'SOL', + 'XRP': 'XRP' + } + +def load_data(): + """ + Load crypto features and Santiment features + + Returns: + Tuple of (crypto_df, santiment_df) + """ + logger.info("Loading data files...") + + # Load crypto features + crypto_file = _resolve_under_data('data/merged/features/crypto_features.parquet') + if not os.path.exists(crypto_file): + raise FileNotFoundError(f"Crypto features file not found: {crypto_file}") + + crypto_df = pd.read_parquet(crypto_file) + logger.info(f"Loaded crypto features: {crypto_df.shape[0]} rows, {crypto_df.shape[1]} columns") + + # Load Santiment features + santiment_file = _resolve_under_data('data/santiment/merged_features.parquet') + if not os.path.exists(santiment_file): + logger.warning(f"Santiment features file not found: {santiment_file}") + logger.warning("Proceeding without Santiment features (crypto-only output)") + return crypto_df, None + + santiment_df = pd.read_parquet(santiment_file) + logger.info(f"Loaded Santiment features: {santiment_df.shape[0]} rows, {santiment_df.shape[1]} columns") + + return crypto_df, santiment_df + +def prepare_crypto_data(crypto_df): + """ + Prepare crypto data for merging + + Args: + crypto_df: Crypto features DataFrame + + Returns: + Prepared crypto DataFrame + """ + logger.info("Preparing crypto data...") + + # Convert interval_timestamp to datetime + crypto_df = crypto_df.copy() + crypto_df['datetime'] = convert_timestamp_to_datetime(crypto_df['interval_timestamp']) + + # Set datetime as index for easier merging + crypto_df.set_index('datetime', inplace=True) + + logger.info(f"Crypto date range: {crypto_df.index.min()} to {crypto_df.index.max()}") + logger.info(f"Crypto symbols: {sorted(crypto_df['symbol'].unique())}") + + return crypto_df + +def prepare_santiment_data(santiment_df): + """ + Prepare Santiment data for merging + + Args: + santiment_df: Santiment features DataFrame + + Returns: + Prepared Santiment DataFrame + """ + logger.info("Preparing Santiment data...") + + santiment_df = santiment_df.copy() + + # Ensure datetime index is timezone-aware (convert to UTC if needed) + if santiment_df.index.tz is None: + santiment_df.index = pd.to_datetime(santiment_df.index, utc=True) + elif 
str(santiment_df.index.tz) != 'UTC': +        santiment_df.index = santiment_df.index.tz_convert('UTC') + +    logger.info(f"Santiment date range: {santiment_df.index.min()} to {santiment_df.index.max()}") +    logger.info(f"Santiment slugs: {sorted(santiment_df['slug'].unique())}") + +    return santiment_df + +def merge_with_tolerance(crypto_df, santiment_df, symbol_mapping, tolerance_hours=1): +    """ +    Merge crypto and Santiment data with time tolerance + +    Args: +        crypto_df: Prepared crypto DataFrame +        santiment_df: Prepared Santiment DataFrame +        symbol_mapping: Dict mapping crypto symbols to Santiment slugs +        tolerance_hours: Time tolerance in hours for matching + +    Returns: +        Merged DataFrame +    """ +    logger.info(f"Starting merge with ±{tolerance_hours} hour tolerance...") + +    merged_results = [] +    tolerance = pd.Timedelta(hours=tolerance_hours) + +    # Track merge statistics +    total_crypto_records = len(crypto_df) +    successful_matches = 0 + +    for symbol, slug in symbol_mapping.items(): +        logger.info(f"Processing {symbol} → {slug}") + +        # Filter data for current symbol/slug +        crypto_symbol = crypto_df[crypto_df['symbol'] == symbol].copy() +        santiment_slug = santiment_df[santiment_df['slug'] == slug].copy() + +        if crypto_symbol.empty: +            logger.warning(f"No crypto data found for symbol: {symbol}") +            continue + +        if santiment_slug.empty: +            logger.warning(f"No Santiment data found for slug: {slug}") +            # Add crypto data with null Santiment features +            crypto_symbol_with_nulls = add_null_santiment_features(crypto_symbol, santiment_df.columns) +            merged_results.append(crypto_symbol_with_nulls) +            continue + +        # Perform time-tolerance merge (all crypto rows are kept; unmatched rows get NaN Santiment features) +        merged_symbol = merge_by_time_tolerance(crypto_symbol, santiment_slug, tolerance) +        merged_results.append(merged_symbol) + +        matches = len(merged_symbol) +        successful_matches += matches +        logger.info(f"  Merged {matches} records for {symbol}") + +    # Combine all results +    if merged_results: +        merged_df = pd.concat(merged_results, ignore_index=False) +        logger.info(f"Merge completed: {successful_matches}/{total_crypto_records} crypto records carried through ({successful_matches/total_crypto_records*100:.1f}%)") +    else: +        logger.error("No data could be merged!") +        return None + +    return merged_df + +def merge_by_time_tolerance(crypto_symbol, santiment_slug, tolerance): +    """ +    Merge crypto and Santiment data for a single symbol with time tolerance + +    Args: +        crypto_symbol: Crypto data for one symbol +        santiment_slug: Santiment data for one slug +        tolerance: Time tolerance as Timedelta + +    Returns: +        Merged DataFrame for this symbol +    """ +    merged_records = [] + +    for crypto_time, crypto_row in crypto_symbol.iterrows(): +        # Find Santiment records within tolerance +        time_diff = np.abs(santiment_slug.index - crypto_time) +        within_tolerance = time_diff <= tolerance + +        if within_tolerance.any(): +            # Get the closest match within tolerance (argmin returns a position, not an index label) +            closest_pos = time_diff.argmin() +            santiment_row = santiment_slug.iloc[closest_pos] + +            # Combine crypto and Santiment features +            combined_row = crypto_row.copy() + +            # Add Santiment features (excluding 'slug' to avoid duplication) +            for col in santiment_slug.columns: +                if col != 'slug':  # Don't overwrite symbol with slug +                    combined_row[f'santiment_{col}'] = santiment_row[col] + +            merged_records.append(combined_row) +        else: +            # No match found - add with null Santiment features +            combined_row = crypto_row.copy() +            for col in santiment_slug.columns: +                if col != 'slug': +                    combined_row[f'santiment_{col}'] = np.nan + 
merged_records.append(combined_row) + + return pd.DataFrame(merged_records, index=crypto_symbol.index) + +def add_null_santiment_features(crypto_df, santiment_columns): + """ + Add null Santiment features to crypto data when no Santiment data exists + + Args: + crypto_df: Crypto DataFrame + santiment_columns: Santiment column names + + Returns: + Crypto DataFrame with null Santiment features + """ + crypto_with_nulls = crypto_df.copy() + + for col in santiment_columns: + if col != 'slug': # Don't add slug column + crypto_with_nulls[f'santiment_{col}'] = np.nan + + return crypto_with_nulls + +def analyze_merge_quality(merged_df): + """ + Analyze the quality of the merge + + Args: + merged_df: Merged DataFrame + + Returns: + Dictionary with merge quality metrics + """ + logger.info("Analyzing merge quality...") + + # Count Santiment features (exclude slug) + santiment_cols = [col for col in merged_df.columns if col.startswith('santiment_')] + + analysis = { + 'total_records': len(merged_df), + 'santiment_features_added': len(santiment_cols), + 'symbols_processed': sorted(merged_df['symbol'].unique()), + 'completeness_by_symbol': {}, + 'overall_completeness': 0.0 + } + + # Analyze completeness by symbol + for symbol in analysis['symbols_processed']: + symbol_data = merged_df[merged_df['symbol'] == symbol] + + # Calculate how many records have non-null Santiment data + non_null_counts = symbol_data[santiment_cols].notna().sum(axis=1) + records_with_santiment = (non_null_counts > 0).sum() + + completeness = records_with_santiment / len(symbol_data) * 100 + analysis['completeness_by_symbol'][symbol] = { + 'total_records': len(symbol_data), + 'records_with_santiment': records_with_santiment, + 'completeness_pct': completeness + } + + # Overall completeness + all_santiment_data = merged_df[santiment_cols].notna().sum(axis=1) + records_with_any_santiment = (all_santiment_data > 0).sum() + analysis['overall_completeness'] = records_with_any_santiment / len(merged_df) * 100 + + return analysis + +def save_results(merged_df, analysis): + """ + Save merged results and analysis + + Args: + merged_df: Merged DataFrame + analysis: Merge quality analysis + """ + logger.info("Saving results...") + + # Create output directory + output_dir = 'data/merged/features' + os.makedirs(output_dir, exist_ok=True) + + # Save merged features + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + output_file = os.path.join(output_dir, f'crypto_with_santiment_features_{timestamp}.parquet') + + # Reset index to include datetime as column + merged_df_export = merged_df.reset_index() + merged_df_export.to_parquet(output_file, index=False) + + logger.info(f"Merged features saved to: {output_file}") + + # Save analysis report + analysis_file = os.path.join(output_dir, f'santiment_merge_analysis_{timestamp}.json') + import json + with open(analysis_file, 'w') as f: + json.dump(analysis, f, indent=2, default=str) + + logger.info(f"Analysis saved to: {analysis_file}") + + return output_file, analysis_file + +def main(): + """ + Main merge process + """ + logger.info("Starting Santiment-Crypto merge process...") + + try: + # Load data + crypto_df, santiment_df = load_data() + + # Prepare data + crypto_prepared = prepare_crypto_data(crypto_df) + if santiment_df is None: + logger.warning("No Santiment data available; exporting crypto-only dataset") + # Export crypto-only with datetime included + output_dir = 'data/merged/features' + os.makedirs(output_dir, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + 
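+            # Fallback path: with no Santiment data the crypto features are exported unchanged
+            # (plus the datetime column from reset_index), under the same timestamped file
+            # naming scheme as the merged output.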
output_file = os.path.join(output_dir, f'crypto_with_santiment_features_{timestamp}.parquet') +            crypto_prepared.reset_index().to_parquet(output_file, index=False) +            logger.info(f"Crypto-only features saved to: {output_file}") +            return +        santiment_prepared = prepare_santiment_data(santiment_df) + +        # Define symbol mapping +        symbol_mapping = normalize_symbol_mapping() +        logger.info(f"Symbol mapping: {symbol_mapping}") + +        # Perform merge +        merged_df = merge_with_tolerance( +            crypto_prepared, +            santiment_prepared, +            symbol_mapping, +            tolerance_hours=1 +        ) + +        if merged_df is None: +            logger.error("Merge failed!") +            return + +        # Analyze results +        analysis = analyze_merge_quality(merged_df) + +        # Print summary +        print("\n" + "="*60) +        print("SANTIMENT-CRYPTO MERGE SUMMARY") +        print("="*60) +        print(f"Total records: {analysis['total_records']}") +        print(f"Santiment features added: {analysis['santiment_features_added']}") +        print(f"Overall completeness: {analysis['overall_completeness']:.1f}%") +        print(f"Symbols processed: {analysis['symbols_processed']}") + +        print(f"\nCompleteness by symbol:") +        for symbol, stats in analysis['completeness_by_symbol'].items(): +            print(f"  {symbol}: {stats['records_with_santiment']}/{stats['total_records']} " +                  f"({stats['completeness_pct']:.1f}%)") + +        # Save results +        output_file, analysis_file = save_results(merged_df, analysis) + +        print(f"\nFiles saved:") +        print(f"  Merged data: {output_file}") +        print(f"  Analysis: {analysis_file}") +        print("="*60) + +        logger.info("Merge process completed successfully!") + +    except Exception as e: +        logger.error(f"Merge process failed: {e}") +        raise + +if __name__ == "__main__": +    main() diff --git a/src/merge/merge_santiment_with_crypto.py b/src/merge/merge_santiment_with_crypto.py new file mode 100644 index 0000000000000000000000000000000000000000..766dbf5c4c47d6588c32cd36162c6ae8817c1e20 --- /dev/null +++ b/src/merge/merge_santiment_with_crypto.py @@ -0,0 +1,586 @@ +""" +Santiment-Crypto Features Merger +=============================== + +This script merges the Santiment merged features with the existing crypto features. +It reads data/santiment/merged_features.parquet and data/merged/features/crypto_features.parquet, +aligns them by symbol and datetime, and creates a unified feature set. 
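+The merged result is written back to data/merged/features/crypto_features.parquet (replacing
+the existing crypto features file in place), with a ±1 hour tolerance when matching timestamps.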
+ +Features: +- Loads Santiment merged features (parquet) +- Loads existing crypto features (parquet) +- Symbol alignment and normalization +- Time-based merging with tolerance +- Feature name conflict resolution +- Creates a unified feature set + +Author: AI Assistant +Date: August 2025 +""" + +import os +import sys +import pandas as pd +import numpy as np +import pickle +from pathlib import Path +from datetime import datetime, timedelta +import logging +from typing import List, Dict, Optional, Tuple, Union + +# Resolve data directory base +try: +    from src.config import DATA_DIR as CFG_DATA_DIR +except Exception: +    try: +        from config import DATA_DIR as CFG_DATA_DIR +    except Exception: +        CFG_DATA_DIR = "/data" + + +def _resolve_under_data(path_like: str | os.PathLike) -> Path: +    p = Path(path_like) +    if p.is_absolute(): +        return p +    parts = p.parts +    if parts and parts[0].lower() == "data": +        rel = Path(*parts[1:]) if len(parts) > 1 else Path() +    else: +        rel = p +    return Path(CFG_DATA_DIR) / rel + +# Configure logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +class SantimentCryptoMerger: +    """ +    Merger for combining Santiment features with existing crypto features +    """ + +    def __init__(self, +                 santiment_file: str = "data/santiment/merged_features.parquet", +                 crypto_file: str = "data/merged/features/crypto_features.parquet", +                 output_file: str = "data/merged/features/crypto_features.parquet", +                 time_tolerance_hours: int = 1): +        """ +        Initialize the merger + +        Args: +            santiment_file: Path to original Santiment merged features parquet file +            crypto_file: Path to original crypto features file (crypto_features.parquet) +            output_file: Path for the final merged output file (will replace crypto_features.parquet) +            time_tolerance_hours: Time tolerance for merging (hours) +        """ +        self.santiment_file = _resolve_under_data(santiment_file) +        self.crypto_file = _resolve_under_data(crypto_file) +        self.output_file = _resolve_under_data(output_file) +        self.time_tolerance = timedelta(hours=time_tolerance_hours) + +        # Ensure output directory exists +        self.output_file.parent.mkdir(parents=True, exist_ok=True) + +        # Data storage +        self.santiment_data: Optional[pd.DataFrame] = None +        self.crypto_data: Optional[pd.DataFrame] = None +        self.merged_data: Optional[pd.DataFrame] = None + +        # Processing stats +        self.stats = { +            'santiment_records': 0, +            'crypto_records': 0, +            'common_symbols': 0, +            'merged_records': 0, +            'santiment_features': 0, +            'crypto_features': 0, +            'total_features': 0, +            'time_range': {} +        } + +        # Symbol normalizer +        self.symbol_normalizer = self._setup_symbol_normalizer() + +    def _setup_symbol_normalizer(self): +        """Set up the symbol normalization mapping""" +        return { +            # Common crypto symbols +            'bitcoin': 'BTC', 'btc': 'BTC', 'Bitcoin': 'BTC', 'BTC': 'BTC', +            'ethereum': 'ETH', 'eth': 'ETH', 'Ethereum': 'ETH', 'ETH': 'ETH', +            'ripple': 'XRP', 'xrp': 'XRP', 'Ripple': 'XRP', 'XRP': 'XRP', +            'solana': 'SOL', 'sol': 'SOL', 'Solana': 'SOL', 'SOL': 'SOL', +            'cardano': 'ADA', 'ada': 'ADA', 'Cardano': 'ADA', 'ADA': 'ADA', +            'polkadot': 'DOT', 'dot': 'DOT', 'Polkadot': 'DOT', 'DOT': 'DOT', +            'chainlink': 'LINK', 'link': 'LINK', 'Chainlink': 'LINK', 'LINK': 'LINK', +            'litecoin': 'LTC', 'ltc': 'LTC', 'Litecoin': 'LTC', 'LTC': 'LTC', +            'bitcoin-cash': 'BCH', 'bch': 'BCH', 'Bitcoin Cash': 'BCH', 'BCH': 'BCH', +            'stellar': 'XLM', 'xlm': 'XLM', 'Stellar': 'XLM', 'XLM': 'XLM', +            'ethereum-classic': 'ETC', 'etc':
'ETC', 'Ethereum Classic': 'ETC', 'ETC': 'ETC', + 'eos': 'EOS', 'EOS': 'EOS' + } + + def normalize_symbol(self, symbol: str) -> str: + """Normalize a symbol to canonical format""" + if symbol in self.symbol_normalizer: + return self.symbol_normalizer[symbol] + return symbol.upper() + + def load_santiment_data(self) -> bool: + """ + Load original Santiment merged features and apply time-shift logic + + Returns: + True if successful, False otherwise + """ + try: + if not self.santiment_file.exists(): + logger.error(f"Santiment file not found: {self.santiment_file}") + return False + + logger.info(f"Loading Santiment data from {self.santiment_file}") + self.santiment_data = pd.read_parquet(self.santiment_file) + + # Ensure datetime index + if not isinstance(self.santiment_data.index, pd.DatetimeIndex): + if 'datetime' in self.santiment_data.columns: + self.santiment_data.set_index('datetime', inplace=True) + else: + logger.error("No datetime index found in Santiment data") + return False + + # Ensure timezone consistency (convert to UTC) + if self.santiment_data.index.tz is None: + self.santiment_data.index = self.santiment_data.index.tz_localize('UTC') + else: + self.santiment_data.index = self.santiment_data.index.tz_convert('UTC') + + # Normalize symbol column + if 'slug' in self.santiment_data.columns: + self.santiment_data['symbol'] = self.santiment_data['slug'].apply(self.normalize_symbol) + self.santiment_data.drop(columns=['slug'], inplace=True) + elif 'symbol' in self.santiment_data.columns: + self.santiment_data['symbol'] = self.santiment_data['symbol'].apply(self.normalize_symbol) + else: + logger.error("No symbol/slug column found in Santiment data") + return False + + # Add feature prefix to avoid conflicts + feature_cols = [col for col in self.santiment_data.columns if col != 'symbol'] + rename_dict = {col: f"santiment_{col}" for col in feature_cols} + self.santiment_data.rename(columns=rename_dict, inplace=True) + + self.stats['santiment_records'] = len(self.santiment_data) + self.stats['santiment_features'] = len([col for col in self.santiment_data.columns if col != 'symbol']) + + logger.info(f"Loaded Santiment data: {len(self.santiment_data)} records, {len(self.santiment_data.columns)} columns") + logger.info(f"Santiment symbols: {sorted(self.santiment_data['symbol'].unique())}") + logger.info(f"Santiment date range: {self.santiment_data.index.min()} to {self.santiment_data.index.max()}") + + return True + + except Exception as e: + logger.error(f"Failed to load Santiment data: {e}") + return False + + def load_crypto_data(self) -> bool: + """ + Load existing crypto features + + Returns: + True if successful, False otherwise + """ + try: + if not self.crypto_file.exists(): + logger.error(f"Crypto file not found: {self.crypto_file}") + return False + + logger.info(f"Loading crypto data from {self.crypto_file}") + + # Load parquet file + self.crypto_data = pd.read_parquet(self.crypto_file) + + # Don't modify the index - work with interval_timestamp column directly + # The data is already clean and properly formatted from previous pipeline steps + if 'interval_timestamp' not in self.crypto_data.columns: + logger.error("No interval_timestamp column found in crypto data") + return False + + # Check for symbol column + symbol_col = None + for col in ['symbol', 'Symbol', 'ticker', 'asset', 'slug']: + if col in self.crypto_data.columns: + symbol_col = col + break + + if symbol_col is None: + logger.error("No symbol column found in crypto data") + logger.info(f"Available columns: 
{list(self.crypto_data.columns)}") + return False + + # Normalize symbol column + if symbol_col != 'symbol': + self.crypto_data['symbol'] = self.crypto_data[symbol_col] + self.crypto_data.drop(columns=[symbol_col], inplace=True) + + self.crypto_data['symbol'] = self.crypto_data['symbol'].apply(self.normalize_symbol) + + self.stats['crypto_records'] = len(self.crypto_data) + self.stats['crypto_features'] = len([col for col in self.crypto_data.columns if col != 'symbol']) + + logger.info(f"Loaded crypto data: {len(self.crypto_data)} records, {len(self.crypto_data.columns)} columns") + logger.info(f"Crypto symbols: {sorted(self.crypto_data['symbol'].unique())}") + logger.info(f"Crypto date range: {self.crypto_data['interval_timestamp'].min()} to {self.crypto_data['interval_timestamp'].max()}") + + return True + + except Exception as e: + logger.error(f"Failed to load crypto data: {e}") + return False + + def apply_time_shift_merge(self, crypto_df, santiment_df, symbol): + """ + Apply time-shifted merge for a specific symbol using day-of-week matching + This function preserves ALL crypto records and adds Santiment features where possible + + Args: + crypto_df: Crypto data for one symbol + santiment_df: Santiment data for one symbol + symbol: Symbol being processed + + Returns: + Merged DataFrame with ALL crypto records plus Santiment features + """ + logger.info(f" Time-shift merging {len(crypto_df)} crypto records for {symbol}") + + # Start with all crypto records + result_df = crypto_df.copy() + + # Initialize all Santiment columns with NaN + for col in santiment_df.columns: + if col != 'symbol': + result_df[col] = np.nan + + # For each crypto record, try to find a matching Santiment record + for crypto_idx, crypto_row in crypto_df.iterrows(): + # Convert crypto timestamp to datetime for comparison + crypto_timestamp_ms = crypto_row['interval_timestamp'] + crypto_time = pd.to_datetime(crypto_timestamp_ms, unit='ms', utc=True) + + # Find Santiment records with same day-of-week and similar time + santiment_same_weekday = santiment_df[ + santiment_df.index.dayofweek == crypto_time.dayofweek + ] + + if not santiment_same_weekday.empty: + # Find closest time-of-day match + crypto_time_of_day = crypto_time.time() + + time_diffs = santiment_same_weekday.index.map( + lambda x: abs((x.time().hour * 60 + x.time().minute) - + (crypto_time_of_day.hour * 60 + crypto_time_of_day.minute)) + ) + + closest_idx = time_diffs.argmin() + closest_idx = santiment_same_weekday.index[closest_idx] + santiment_row = santiment_same_weekday.loc[closest_idx] + + # Update the result DataFrame with Santiment features for this record + for col in santiment_df.columns: + if col != 'symbol': + result_df.loc[crypto_idx, col] = santiment_row[col] + + logger.info(f" Preserved all {len(result_df)} crypto records for {symbol}") + + # Count how many got Santiment data + santiment_cols = [col for col in santiment_df.columns if col != 'symbol'] + if santiment_cols: + non_null_count = result_df[santiment_cols[0]].notna().sum() + logger.info(f" Added Santiment features to {non_null_count}/{len(result_df)} records ({non_null_count/len(result_df)*100:.1f}%)") + + return result_df + def merge_datasets(self) -> bool: + """ + Merge Santiment and crypto datasets using time-shift logic + + Returns: + True if successful, False otherwise + """ + try: + if self.santiment_data is None or self.crypto_data is None: + logger.error("Both datasets must be loaded before merging") + return False + + logger.info("Starting time-shifted merge process...") 
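+
+            # Descriptive note: the time-shift merge below keeps every crypto row and
+            # attaches the Santiment snapshot that falls on the same day of the week
+            # with the closest time of day, even if it comes from a different calendar
+            # week (see apply_time_shift_merge). For example, a crypto bar stamped
+            # Tuesday 14:05 UTC is paired with the Tuesday Santiment record nearest to
+            # 14:05; rows with no same-weekday match keep NaN Santiment features.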
+ + # Check date ranges + # Convert crypto interval_timestamp to datetime for comparison + try: + crypto_timestamps = pd.to_datetime(self.crypto_data['interval_timestamp'], unit='ms', utc=True) + crypto_start, crypto_end = crypto_timestamps.min(), crypto_timestamps.max() + sant_start, sant_end = self.santiment_data.index.min(), self.santiment_data.index.max() + + logger.info(f"Crypto date range: {crypto_start} to {crypto_end}") + logger.info(f"Santiment date range: {sant_start} to {sant_end}") + except Exception as e: + logger.warning(f"Could not calculate date ranges for comparison: {e}") + # Use simple range instead + crypto_start = crypto_end = None + sant_start, sant_end = self.santiment_data.index.min(), self.santiment_data.index.max() + logger.info(f"Santiment date range: {sant_start} to {sant_end}") + + # Check for overlap + if crypto_start and crypto_end: + overlap = (crypto_start <= sant_end) and (sant_start <= crypto_end) + if not overlap: + logger.warning("No date overlap detected - using time-shift merge strategy") + else: + logger.warning("Using time-shift merge strategy (date comparison skipped)") + + # Find common symbols + santiment_symbols = set(self.santiment_data['symbol'].unique()) + crypto_symbols = set(self.crypto_data['symbol'].unique()) + common_symbols = santiment_symbols & crypto_symbols + + self.stats['common_symbols'] = len(common_symbols) + + logger.info(f"Common symbols found: {len(common_symbols)} - {sorted(common_symbols)}") + + if not common_symbols: + logger.error("No common symbols found between datasets") + # Fallback: produce crypto-only dataset with santiment_* columns as NaN + logger.info("Falling back to crypto-only merged output with empty Santiment features") + crypto_only = self.crypto_data.copy() + # If santiment_data is present but symbols mismatch, create placeholder santiment columns + sant_cols = [] + if self.santiment_data is not None: + sant_cols = [col for col in self.santiment_data.columns if col != 'symbol'] + # Prefix and add NaN columns + for col in sant_cols: + crypto_only[col] = np.nan + # Ensure we keep interval_timestamp and symbol ordering + self.merged_data = crypto_only.reset_index(drop=True) + self.stats['merged_records'] = len(self.merged_data) + self.stats['total_features'] = len([c for c in self.merged_data.columns if c != 'symbol']) + start_time = pd.to_datetime(self.merged_data['interval_timestamp'].min(), unit='ms', utc=True) + end_time = pd.to_datetime(self.merged_data['interval_timestamp'].max(), unit='ms', utc=True) + self.stats['time_range'] = { + 'start': str(start_time), + 'end': str(end_time), + 'total_days': (end_time - start_time).days + } + return True + + # Process each common symbol with time-shift merge + merged_parts = [] + total_merged_records = 0 + + for symbol in common_symbols: + logger.info(f"Processing {symbol} with time-shift merge...") + + sant_symbol = self.santiment_data[self.santiment_data['symbol'] == symbol].copy() + crypto_symbol = self.crypto_data[self.crypto_data['symbol'] == symbol].copy() + + if crypto_symbol.empty: + logger.warning(f"Skipping {symbol} - no crypto data") + continue + + if sant_symbol.empty: + logger.warning(f"No Santiment data for {symbol} - adding with null Santiment features") + # Add null Santiment columns to crypto data + sant_cols = [col for col in self.santiment_data.columns if col != 'symbol'] + for col in sant_cols: + crypto_symbol[col] = np.nan + # Reset index to avoid conflicts + crypto_symbol = crypto_symbol.reset_index(drop=True) + 
merged_parts.append(crypto_symbol) + total_merged_records += len(crypto_symbol) + else: + # Apply time-shift merge + merged_symbol = self.apply_time_shift_merge(crypto_symbol, sant_symbol, symbol) + # Reset index to avoid conflicts + merged_symbol = merged_symbol.reset_index(drop=True) + merged_parts.append(merged_symbol) + total_merged_records += len(merged_symbol) + + logger.info(f" Processed {len(crypto_symbol)} crypto records for {symbol}") + + # Add crypto-only symbols (without Santiment features) + crypto_only_symbols = crypto_symbols - common_symbols + for symbol in crypto_only_symbols: + logger.info(f"Adding crypto-only symbol: {symbol}") + crypto_only = self.crypto_data[self.crypto_data['symbol'] == symbol].copy() + + # Add null Santiment columns + sant_cols = [col for col in self.santiment_data.columns if col != 'symbol'] + for col in sant_cols: + crypto_only[col] = np.nan + + # Reset index to avoid conflicts + crypto_only = crypto_only.reset_index(drop=True) + merged_parts.append(crypto_only) + total_merged_records += len(crypto_only) + + # Combine all parts + if merged_parts: + self.merged_data = pd.concat(merged_parts, axis=0, ignore_index=True) + # Sort by interval_timestamp instead of index + self.merged_data = self.merged_data.sort_values('interval_timestamp') + + self.stats['merged_records'] = len(self.merged_data) + self.stats['total_features'] = len([col for col in self.merged_data.columns if col != 'symbol']) + + # Update time range using interval_timestamp + start_time = pd.to_datetime(self.merged_data['interval_timestamp'].min(), unit='ms', utc=True) + end_time = pd.to_datetime(self.merged_data['interval_timestamp'].max(), unit='ms', utc=True) + self.stats['time_range'] = { + 'start': str(start_time), + 'end': str(end_time), + 'total_days': (end_time - start_time).days + } + + logger.info(f"Total crypto records processed: {total_merged_records}") + logger.info("Time-shifted merge completed successfully!") + return True + else: + logger.error("No data to merge") + return False + + except Exception as e: + logger.error(f"Failed to merge datasets: {e}") + return False + + def save_merged_data(self) -> bool: + """ + Save the merged dataset, backing up the original crypto file + + Returns: + True if successful, False otherwise + """ + try: + if self.merged_data is None or self.merged_data.empty: + logger.error("No merged data to save") + return False + + # Backup original crypto file if it exists and is different from output + if self.crypto_file != self.output_file and self.crypto_file.exists(): + backup_file = self.crypto_file.with_suffix('.backup.parquet') + import shutil + shutil.copy2(self.crypto_file, backup_file) + logger.info(f"Backed up original crypto file to: {backup_file}") + + logger.info(f"Saving merged data to {self.output_file}") + + # Save with regular index since we're using interval_timestamp column + # Save as parquet (primary format) - this will replace crypto_features.parquet + self.merged_data.to_parquet(self.output_file, index=False, compression='snappy') + + # Don't create pickle file to avoid clutter + # pickle_file = self.output_file.with_suffix('.pkl') + # with open(pickle_file, 'wb') as f: + # pickle.dump(self.merged_data, f) + + logger.info(f"Merged data saved successfully!") + logger.info(f"Enhanced crypto file: {self.output_file}") + # logger.info(f"Pickle file: {pickle_file}") + + return True + + except Exception as e: + logger.error(f"Failed to save merged data: {e}") + return False + + def print_summary(self): + """Print merge summary""" 
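+        # Note: the figures printed here come from self.stats, which is populated by
+        # load_santiment_data(), load_crypto_data() and merge_datasets(); calling this
+        # before a merge has run simply reports zero counts and an empty time range.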
+ print("\n" + "="*70) + print("SANTIMENT-CRYPTO MERGER SUMMARY") + print("="*70) + + print(f"\nInput Data:") + print(f" Santiment records: {self.stats['santiment_records']:,}") + print(f" Santiment features: {self.stats['santiment_features']}") + print(f" Crypto records: {self.stats['crypto_records']:,}") + print(f" Crypto features: {self.stats['crypto_features']}") + + print(f"\nMerge Results:") + print(f" Common symbols: {self.stats['common_symbols']}") + print(f" Final records: {self.stats['merged_records']:,}") + print(f" Total features: {self.stats['total_features']}") + + if self.stats['time_range']: + print(f"\nTime Range:") + print(f" Start: {self.stats['time_range']['start']}") + print(f" End: {self.stats['time_range']['end']}") + print(f" Total days: {self.stats['time_range']['total_days']}") + + if self.merged_data is not None: + print(f"\nFinal Dataset:") + print(f" Memory usage: {self.merged_data.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB") + print(f" Null percentage: {(self.merged_data.isnull().sum().sum() / (len(self.merged_data) * len(self.merged_data.columns))) * 100:.2f}%") + + # Show symbol distribution + symbol_dist = self.merged_data['symbol'].value_counts() + print(f"\nSymbol Distribution:") + for symbol, count in symbol_dist.head(10).items(): + print(f" {symbol}: {count:,} records") + + print("="*70) + + def run_merge(self) -> bool: + """ + Run the complete merge process + + Returns: + True if successful, False otherwise + """ + try: + logger.info("Starting Santiment-Crypto merge process...") + + # Load data + sant_ok = self.load_santiment_data() + crypto_ok = self.load_crypto_data() + + if not crypto_ok: + return False + if not sant_ok: + logger.warning("Proceeding without Santiment data; emitting crypto-only output") + self.merged_data = self.crypto_data.copy() + # Save results immediately + if not self.save_merged_data(): + return False + self.print_summary() + logger.info("Santiment-Crypto merge completed successfully with crypto-only output") + return True + + # Merge datasets + if not self.merge_datasets(): + return False + + # Save results + if not self.save_merged_data(): + return False + + # Print summary + self.print_summary() + + logger.info("Santiment-Crypto merge completed successfully!") + return True + + except Exception as e: + logger.error(f"Merge process failed: {e}") + return False + + +def main(): + """Main function""" + merger = SantimentCryptoMerger( + santiment_file="data/santiment/merged_features.parquet", + # crypto_file="data/merged/features/crypto_features.parquet", + output_file="data/merged/features/crypto_features.parquet", # Replace original file + time_tolerance_hours=1 + ) + + success = merger.run_merge() + return success + + +if __name__ == "__main__": + main() diff --git a/src/merge/merge_temp.py b/src/merge/merge_temp.py new file mode 100644 index 0000000000000000000000000000000000000000..91f0c927385fa797635d6a631a11dee96b297871 --- /dev/null +++ b/src/merge/merge_temp.py @@ -0,0 +1,444 @@ +import pandas as pd +import os +import numpy as np +from datetime import datetime, timedelta + +DAYS_OLD = 7 +MERGED_DIR = "data/merged/features" +TEMP_DIR = "data/merged/temp" + +# Helper: safely cast a value to match a target column's dtype (e.g., drop tz on datetimes) +def _cast_value_for_column(target_series: pd.Series, value): + try: + # If target is datetime64[ns], ensure assigned value is tz-naive + if pd.api.types.is_datetime64_any_dtype(target_series.dtype): + v = pd.to_datetime(value, errors='coerce', utc=True) + if 
isinstance(v, pd.Timestamp): + return v.tz_localize(None) + return v + return value + except Exception: + return value + +def fill_nulls_from_temp(df_merged, df_temp): + """ + Fill null values in df_merged using non-null values from df_temp + for the same symbol + interval_timestamp combination. + Returns the number of null values filled. + """ + nulls_filled = 0 + + if df_merged.empty or df_temp.empty: + return nulls_filled + + # Create lookup key for efficient matching + key_cols = ["symbol", "interval_timestamp"] + + # Check if key columns exist in both dataframes + if not all(col in df_merged.columns for col in key_cols): + print("[WARN] Key columns missing in merged data, skipping null filling") + return nulls_filled + if not all(col in df_temp.columns for col in key_cols): + print("[WARN] Key columns missing in temp data, skipping null filling") + return nulls_filled + + # Create a lookup dictionary from temp data + # Format: {(symbol, timestamp): {column: value, ...}} + temp_lookup = {} + for _, row in df_temp.iterrows(): + key = (row['symbol'], row['interval_timestamp']) + temp_lookup[key] = row.to_dict() + + # Find common columns between merged and temp (excluding keys) + common_cols = [col for col in df_merged.columns + if col in df_temp.columns and col not in key_cols] + + if not common_cols: + print("[WARN] No common columns found for null filling") + return nulls_filled + + # Track columns with null values before processing + null_cols_before = [] + for col in common_cols: + if df_merged[col].isnull().any(): + null_cols_before.append(col) + + if not null_cols_before: + print("[INFO] No null values found in common columns") + return nulls_filled + + print(f"[INFO] Attempting to fill nulls in {len(null_cols_before)} columns: {null_cols_before}") + + # Fill null values row by row + for idx, row in df_merged.iterrows(): + key = (row['symbol'], row['interval_timestamp']) + + # Check if we have corresponding temp data for this key + if key in temp_lookup: + temp_row = temp_lookup[key] + + # Fill null values for each column + for col in null_cols_before: + try: + # Use more robust null checking to handle arrays/scalars + row_val = row[col] + temp_val = temp_row.get(col) + + # Check if row value is null (handle both scalar and array cases) + row_is_null = pd.isnull(row_val) + if hasattr(row_is_null, '__len__') and len(row_is_null) > 1: + row_is_null = row_is_null.any() # For arrays, check if any are null + + # Check if temp value is not null + temp_is_not_null = not pd.isnull(temp_val) + if hasattr(temp_is_not_null, '__len__') and len(temp_is_not_null) > 1: + temp_is_not_null = temp_is_not_null.all() # For arrays, check if all are not null + + if row_is_null and temp_is_not_null: + # Fill the null value with dtype-compatible casting + df_merged.at[idx, col] = _cast_value_for_column(df_merged[col], temp_val) + nulls_filled += 1 + except Exception as e: + # Skip problematic columns with a warning + print(f"[WARN] Could not process column '{col}' for null filling: {e}") + continue + + if nulls_filled > 0: + print(f"[INFO] Successfully filled {nulls_filled} null values from temp data") + + # Report which columns were improved + for col in null_cols_before: + nulls_remaining = df_merged[col].isnull().sum() + print(f"[INFO] Column '{col}': {nulls_remaining} nulls remaining") + + return nulls_filled + +# Helper to filter new records (DISABLED - now keeps ALL data for accumulative merging) +def filter_new(df): + # IMPORTANT: Return ALL data instead of filtering by days + # This ensures 
accumulative merging from day one + return df.copy() + +def merge_temp_to_merged(temp_name, merged_name): + temp_path = os.path.join(TEMP_DIR, temp_name) + merged_path = os.path.join(MERGED_DIR, merged_name) + if not os.path.exists(temp_path): + print(f"[WARN] Temp file missing: {temp_path}") + return + if not os.path.exists(merged_path): + print(f"[WARN] Merged file missing: {merged_path}") + return + + df_temp = pd.read_parquet(temp_path) + df_merged = pd.read_parquet(merged_path) + + # Check if required columns exist + required_cols = ["symbol", "interval_timestamp"] + missing_cols_temp = [col for col in required_cols if col not in df_temp.columns] + missing_cols_merged = [col for col in required_cols if col not in df_merged.columns] + + if missing_cols_temp: + print(f"[ERROR] Missing columns in temp file {temp_name}: {missing_cols_temp}") + print(f"[INFO] Available columns in temp: {list(df_temp.columns)}") + return + + if missing_cols_merged: + print(f"[ERROR] Missing columns in merged file {merged_name}: {missing_cols_merged}") + print(f"[INFO] Available columns in merged: {list(df_merged.columns)}") + return + + new_temp = filter_new(df_temp) + + # Step 1: Fill null values in merged data using temp data for same symbol+timestamp + nulls_filled = fill_nulls_from_temp(df_merged, df_temp) + + # Step 2: Only add truly new rows (not already in merged) + key_cols = ["symbol", "interval_timestamp"] + merged_keys = set(tuple(row) for row in df_merged[key_cols].values) + new_rows = new_temp[~new_temp[key_cols].apply(tuple, axis=1).isin(merged_keys)] + + if new_rows.empty and nulls_filled == 0: + print(f"[INFO] No new records to add from {temp_name} and no nulls filled") + return + + df_final = pd.concat([df_merged, new_rows], ignore_index=True) + df_final.to_parquet(merged_path, index=False) + print(f"[OK] Added {len(new_rows)} new records from {temp_name} to {merged_name}, filled {nulls_filled} null values") + +def merge_all_to_train(archive_name, features_name, temp_name, train_name): + """ + Merge archive, features, and temp files into a deduplicated train file under merge/train/. + Uniqueness is enforced on (symbol, interval_timestamp). + Also performs null filling between different sources. + """ + ARCHIVE_DIR = os.path.join(MERGED_DIR, "archive") + TRAIN_DIR = os.path.join("data", "merged", "train") + os.makedirs(TRAIN_DIR, exist_ok=True) + features_path = os.path.join(MERGED_DIR, features_name) + temp_path = os.path.join(TEMP_DIR, temp_name) + train_path = os.path.join(TRAIN_DIR, train_name) + + dfs = [] + df_sources = {} # Track which dataframe came from which source + + # 1. Read all relevant archive files (recursively) + archive_dfs = [] + if os.path.isdir(ARCHIVE_DIR): + for root, dirs, files in os.walk(ARCHIVE_DIR): + for fname in files: + # Only include files matching the asset (e.g., crypto_features_archived_*.parquet) + if fname.startswith(archive_name.replace('.parquet', '_archived_')) and fname.endswith('.parquet'): + fpath = os.path.join(root, fname) + try: + archive_dfs.append(pd.read_parquet(fpath)) + except Exception as e: + print(f"[WARN] Could not read archive file {fpath}: {e}") + if archive_dfs: + df_archive = pd.concat(archive_dfs, ignore_index=True) + dfs.append(df_archive) + df_sources['archive'] = df_archive + else: + print(f"[WARN] No archive files found for {archive_name}") + + # 2. 
Read features and temp + if os.path.exists(features_path): + df_features = pd.read_parquet(features_path) + dfs.append(df_features) + df_sources['features'] = df_features + else: + print(f"[WARN] Missing: {features_path}") + + if os.path.exists(temp_path): + df_temp = pd.read_parquet(temp_path) + dfs.append(df_temp) + df_sources['temp'] = df_temp + else: + print(f"[WARN] Missing: {temp_path}") + + if not dfs: + print("[ERROR] No input files found.") + return + + # 3. Merge all data + df_all = pd.concat(dfs, ignore_index=True) + + # 4. Before deduplication, try to fill nulls using data from different sources + total_nulls_filled = 0 + if len(df_sources) > 1: + print(f"[INFO] Attempting cross-source null filling for {train_name}") + + # Create a comprehensive lookup from all sources + all_data_lookup = {} + for source_name, df_source in df_sources.items(): + for _, row in df_source.iterrows(): + key = (row['symbol'], row['interval_timestamp']) + if key not in all_data_lookup: + all_data_lookup[key] = {} + + # Add non-null values from this source + for col in df_source.columns: + try: + # Use more robust null checking to handle arrays/scalars + col_val = row[col] + + # Check if value is not null (handle both scalar and array cases) + is_not_null = not pd.isnull(col_val) + if hasattr(is_not_null, '__len__') and len(is_not_null) > 1: + is_not_null = is_not_null.all() # For arrays, check if all are not null + + if is_not_null: + all_data_lookup[key][col] = col_val + except Exception as e: + # Skip problematic columns with a warning + print(f"[WARN] Could not process column '{col}' for train lookup: {e}") + continue + + # Fill nulls in the combined dataframe + for idx, row in df_all.iterrows(): + key = (row['symbol'], row['interval_timestamp']) + if key in all_data_lookup: + lookup_row = all_data_lookup[key] + for col in df_all.columns: + try: + # Use more robust null checking + row_val = row[col] + + # Check if row value is null (handle both scalar and array cases) + row_is_null = pd.isnull(row_val) + if hasattr(row_is_null, '__len__') and len(row_is_null) > 1: + row_is_null = row_is_null.any() # For arrays, check if any are null + + if row_is_null and col in lookup_row: + df_all.at[idx, col] = _cast_value_for_column(df_all[col], lookup_row[col]) + total_nulls_filled += 1 + except Exception as e: + # Skip problematic columns with a warning + print(f"[WARN] Could not process column '{col}' for train null filling: {e}") + continue + + # 5. Deduplicate by symbol+interval_timestamp, keeping the last occurrence + df_all = df_all.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last") + + # 6. 
Handle problematic columns that can't be serialized to parquet + problematic_cols = [] + for col in df_all.columns: + try: + # Test if column can be converted to parquet-compatible format + sample = df_all[col].iloc[0] if len(df_all) > 0 else None + if sample is not None and hasattr(sample, '__len__') and not isinstance(sample, str): + # If it's an array-like object (but not string), it might cause issues + if len(sample) > 1: # Multi-dimensional array + problematic_cols.append(col) + except: + # If we can't even check the sample, it's definitely problematic + problematic_cols.append(col) + + if problematic_cols: + print(f"[WARN] Dropping problematic columns that can't be serialized: {problematic_cols}") + df_all = df_all.drop(columns=problematic_cols) + + # Save to parquet + df_all.to_parquet(train_path, index=False) + + if total_nulls_filled > 0: + print(f"[OK] Created train file: {train_path} with {len(df_all)} records, filled {total_nulls_filled} nulls") + else: + print(f"[OK] Created train file: {train_path} with {len(df_all)} records.") + +def create_merged_features(): + """ + Create the main merged_features.parquet file by combining crypto and stock features + with intelligent null filling between the two datasets. + """ + crypto_path = os.path.join(MERGED_DIR, "crypto_features.parquet") + stocks_path = os.path.join(MERGED_DIR, "stocks_features.parquet") + merged_path = os.path.join(MERGED_DIR, "merged_features.parquet") + + dfs_to_merge = [] + + # Read crypto features + if os.path.exists(crypto_path): + df_crypto = pd.read_parquet(crypto_path) + dfs_to_merge.append(('crypto', df_crypto)) + print(f"[INFO] Loaded crypto features: {len(df_crypto)} rows, {len(df_crypto.columns)} columns") + else: + print(f"[WARN] Crypto features not found: {crypto_path}") + + # Read stock features + if os.path.exists(stocks_path): + df_stocks = pd.read_parquet(stocks_path) + dfs_to_merge.append(('stocks', df_stocks)) + print(f"[INFO] Loaded stock features: {len(df_stocks)} rows, {len(df_stocks.columns)} columns") + else: + print(f"[WARN] Stock features not found: {stocks_path}") + + if not dfs_to_merge: + print("[ERROR] No feature files found to merge") + return + + if len(dfs_to_merge) == 1: + # Only one dataset available, just copy it + df_merged = dfs_to_merge[0][1].copy() + print(f"[INFO] Only {dfs_to_merge[0][0]} features available") + else: + # Multiple datasets - merge with null filling + print("[INFO] Merging crypto and stock features with cross-dataset null filling") + + # Combine all dataframes + all_dfs = [df for _, df in dfs_to_merge] + df_merged = pd.concat(all_dfs, ignore_index=True, sort=False) + + # Perform cross-dataset null filling + # Create lookup from all datasets for same symbol+timestamp + lookup_data = {} + for dataset_name, df in dfs_to_merge: + for _, row in df.iterrows(): + key = (row['symbol'], row['interval_timestamp']) + if key not in lookup_data: + lookup_data[key] = {} + + # Add non-null values from this dataset + for col in df.columns: + try: + # Use more robust null checking to handle arrays/scalars + col_val = row[col] + + # Check if value is not null (handle both scalar and array cases) + is_not_null = not pd.isnull(col_val) + if hasattr(is_not_null, '__len__') and len(is_not_null) > 1: + is_not_null = is_not_null.all() # For arrays, check if all are not null + + if is_not_null: + lookup_data[key][col] = col_val + except Exception as e: + # Skip problematic columns with a warning + print(f"[WARN] Could not process column '{col}' for lookup: {e}") + continue + + # 
Fill nulls using the comprehensive lookup + nulls_filled = 0 + for idx, row in df_merged.iterrows(): + key = (row['symbol'], row['interval_timestamp']) + if key in lookup_data: + lookup_row = lookup_data[key] + for col in df_merged.columns: + try: + # Use more robust null checking + row_val = row[col] + + # Check if row value is null (handle both scalar and array cases) + row_is_null = pd.isnull(row_val) + if hasattr(row_is_null, '__len__') and len(row_is_null) > 1: + row_is_null = row_is_null.any() # For arrays, check if any are null + + if row_is_null and col in lookup_row: + df_merged.at[idx, col] = _cast_value_for_column(df_merged[col], lookup_row[col]) + nulls_filled += 1 + except Exception as e: + # Skip problematic columns with a warning + print(f"[WARN] Could not process column '{col}' for null filling: {e}") + continue + + if nulls_filled > 0: + print(f"[INFO] Cross-dataset null filling: {nulls_filled} values filled") + + # Remove duplicates if any (keeping last occurrence) + initial_len = len(df_merged) + df_merged = df_merged.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last") + final_len = len(df_merged) + + if initial_len != final_len: + print(f"[INFO] Removed {initial_len - final_len} duplicate records") + + # Save merged features + df_merged.to_parquet(merged_path, index=False) + print(f"[OK] Created merged features: {merged_path} with {len(df_merged)} rows, {len(df_merged.columns)} columns") + + # Report statistics + nulls_remaining = df_merged.isnull().sum().sum() + print(f"[INFO] Merged features null count: {nulls_remaining}") + + # Report symbol breakdown + if 'symbol' in df_merged.columns: + symbol_counts = df_merged['symbol'].value_counts() + print(f"[INFO] Top symbols: {dict(symbol_counts.head(10))}") + +def main(): + import sys + + # Check if this is being run as a test + if len(sys.argv) > 1 and sys.argv[1] == '--test-null-filling': + from test_null_filling_merge import main as test_main + sys.exit(test_main()) + + merge_temp_to_merged("crypto_features.parquet", "crypto_features.parquet") + merge_temp_to_merged("stocks_features.parquet", "stocks_features.parquet") + + # Create the main merged features file + create_merged_features() + + merge_all_to_train("crypto_features.parquet", "crypto_features.parquet", "crypto_features.parquet", "crypto_features_train.parquet") + merge_all_to_train("stocks_features.parquet", "stocks_features.parquet", "stocks_features.parquet", "stocks_features_train.parquet") + +if __name__ == "__main__": + main() diff --git a/src/merge/norm/crypto.py b/src/merge/norm/crypto.py new file mode 100644 index 0000000000000000000000000000000000000000..6f24497b9423a52ea8b77b32e1e47489e6e7ca86 --- /dev/null +++ b/src/merge/norm/crypto.py @@ -0,0 +1,618 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, PowerTransformer +import json +import pickle +from datetime import datetime +import warnings +warnings.filterwarnings('ignore') +import os + +class CryptoDataNormalizer: + """ + Enhanced normalization pipeline for cryptocurrency features data with crypto-specific handling + """ + + def __init__(self, preserve_symbol=True, handle_outliers=True, feature_engineering=True): + self.scalers = {} + self.encoders = {} + self.feature_info = {} + self.is_fitted = False + self.preserve_symbol = preserve_symbol + self.handle_outliers = handle_outliers + self.feature_engineering = feature_engineering + self.outlier_bounds = {} + + def _detect_outliers(self, df, 
column): + """Detect outliers using IQR method""" + Q1 = df[column].quantile(0.25) + Q3 = df[column].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + return lower_bound, upper_bound + + def _handle_outliers(self, df, column, method='clip'): + """Handle outliers in numerical data""" + if column not in self.outlier_bounds: + lower_bound, upper_bound = self._detect_outliers(df, column) + self.outlier_bounds[column] = (lower_bound, upper_bound) + else: + lower_bound, upper_bound = self.outlier_bounds[column] + + if method == 'clip': + return df[column].clip(lower_bound, upper_bound) + elif method == 'remove': + return df[column].where((df[column] >= lower_bound) & (df[column] <= upper_bound)) + return df[column] + + def _categorize_features(self, df): + """Enhanced feature categorization for crypto data""" + # Core identification features + id_features = ['symbol', 'backup_id', '__index_level_0__', 'cg_id'] + + # Timestamp features + timestamp_features = [col for col in df.columns if 'timestamp' in col.lower()] + + # Binary features (0/1, True/False, or boolean-like) + binary_features = [] + for col in df.columns: + if col not in id_features + timestamp_features: + unique_vals = set(df[col].dropna().unique()) + if (df[col].dtype == bool or + (len(unique_vals) <= 2 and unique_vals.issubset({0, 1, True, False, np.nan})) or + col in ['stable']): + binary_features.append(col) + + # Categorical features (strings, objects, or low cardinality integers) + categorical_features = [] + for col in df.columns: + if (col not in id_features + binary_features + timestamp_features and + (df[col].dtype == 'object' or + df[col].dtype.name == 'category' or + (df[col].nunique() < 20 and df[col].dtype in ['int64', 'int32']))): + categorical_features.append(col) + + # Crypto-specific features + crypto_specific_features = [] + crypto_keywords = ['dominance', 'rank'] + for col in df.columns: + if any(keyword in col.lower() for keyword in crypto_keywords): + if col not in id_features + timestamp_features + binary_features + categorical_features: + crypto_specific_features.append(col) + + # Price/volume/market features + price_volume_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['price', 'volume', 'marketcap', 'open']): + if col not in id_features + timestamp_features + binary_features + categorical_features + crypto_specific_features: + price_volume_features.append(col) + + # Exchange price features + exchange_features = [] + for col in df.columns: + if col.startswith('exchangePrices.'): + exchange_features.append(col) + + # Performance features + performance_features = [] + for col in df.columns: + if col.startswith('performance.'): + performance_features.append(col) + + # Rank difference features + rank_diff_features = [] + for col in df.columns: + if col.startswith('rankDiffs.'): + rank_diff_features.append(col) + + # Technical indicator features + technical_features = [] + tech_keywords = ['rsi', 'macd', 'ema', 'sma', 'bb_', 'cci', 'mfi', 'atr', 'stoch', 'roc'] + for col in df.columns: + if any(keyword in col.lower() for keyword in tech_keywords): + if col not in (id_features + timestamp_features + binary_features + categorical_features + + crypto_specific_features + price_volume_features + exchange_features + + performance_features + rank_diff_features): + technical_features.append(col) + + # Social sentiment features + social_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['social', 
'sentiment', 'confidence', 'pos', 'neg', 'neu']): + if col not in (id_features + timestamp_features + binary_features + categorical_features + + crypto_specific_features + price_volume_features + exchange_features + + performance_features + rank_diff_features + technical_features): + social_features.append(col) + + # Transaction/blockchain features + transaction_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['transaction', 'tx_', 'gas', 'fees']): + if col not in (id_features + timestamp_features + binary_features + categorical_features + + crypto_specific_features + price_volume_features + exchange_features + + performance_features + rank_diff_features + technical_features + social_features): + transaction_features.append(col) + + # Data quality features + quality_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['completeness', 'quality', 'correlation']): + if col not in (id_features + timestamp_features + binary_features + categorical_features + + crypto_specific_features + price_volume_features + exchange_features + + performance_features + rank_diff_features + technical_features + + social_features + transaction_features): + quality_features.append(col) + + # Remaining numerical features + numerical_features = [] + all_categorized = (id_features + timestamp_features + binary_features + categorical_features + + crypto_specific_features + price_volume_features + exchange_features + + performance_features + rank_diff_features + technical_features + + social_features + transaction_features + quality_features) + + for col in df.columns: + if (col not in all_categorized and + pd.api.types.is_numeric_dtype(df[col])): + numerical_features.append(col) + + return { + 'id_features': id_features, + 'timestamp_features': timestamp_features, + 'binary_features': binary_features, + 'categorical_features': categorical_features, + 'crypto_specific_features': crypto_specific_features, + 'price_volume_features': price_volume_features, + 'exchange_features': exchange_features, + 'performance_features': performance_features, + 'rank_diff_features': rank_diff_features, + 'technical_features': technical_features, + 'social_features': social_features, + 'transaction_features': transaction_features, + 'quality_features': quality_features, + 'numerical_features': numerical_features + } + + def _engineer_crypto_features(self, df, normalized_df): + """Create crypto-specific engineered features""" + if not self.feature_engineering: + return normalized_df + + # Exchange price spread analysis + exchange_cols = [col for col in df.columns if col.startswith('exchangePrices.')] + if len(exchange_cols) > 1: + exchange_prices = df[exchange_cols].replace([np.inf, -np.inf], np.nan) + if not exchange_prices.empty and exchange_prices.notna().any().any(): + price_mean = exchange_prices.mean(axis=1) + price_max = exchange_prices.max(axis=1) + price_min = exchange_prices.min(axis=1) + price_std = exchange_prices.std(axis=1) + + # Only calculate if we have valid data + valid_mask = (price_mean > 0) & price_mean.notna() + if valid_mask.any(): + normalized_df['exchange_price_spread'] = ((price_max - price_min) / price_mean).fillna(0) + normalized_df['exchange_price_std'] = (price_std / price_mean).fillna(0) + + # Performance momentum + perf_short_cols = [col for col in df.columns if col.startswith('performance.') and any(timeframe in col for timeframe in ['min1', 'min5', 'min15', 'hour'])] + perf_long_cols = [col for col in df.columns if 
col.startswith('performance.') and any(timeframe in col for timeframe in ['day', 'week', 'month'])] + + if perf_short_cols: + short_perf = df[perf_short_cols].replace([np.inf, -np.inf], np.nan) + normalized_df['short_term_momentum'] = short_perf.mean(axis=1).fillna(0) + if perf_long_cols: + long_perf = df[perf_long_cols].replace([np.inf, -np.inf], np.nan) + normalized_df['long_term_momentum'] = long_perf.mean(axis=1).fillna(0) + + # Rank stability + rank_diff_cols = [col for col in df.columns if col.startswith('rankDiffs.')] + if rank_diff_cols: + rank_diffs = df[rank_diff_cols].replace([np.inf, -np.inf], np.nan).fillna(0) + normalized_df['rank_stability'] = 1 / (1 + rank_diffs.abs().sum(axis=1) + 1e-8) # Add small epsilon to avoid division by zero + + # Social sentiment aggregation + social_sentiment_cols = [col for col in df.columns if 'social_sentiment' in col.lower()] + if social_sentiment_cols: + social_data = df[social_sentiment_cols].replace([np.inf, -np.inf], np.nan) + normalized_df['avg_social_sentiment'] = social_data.mean(axis=1).fillna(0.5) # Neutral sentiment + + # Technical strength (similar to stocks but crypto-focused) + tech_cols = [col for col in df.columns if any(tech in col.lower() for tech in ['rsi', 'macd', 'cci'])] + if tech_cols: + tech_data = df[tech_cols].replace([np.inf, -np.inf], np.nan) + normalized_df['technical_strength'] = tech_data.mean(axis=1).fillna(0) + + # Volume-price relationship + if 'volume' in df.columns and 'price' in df.columns: + volume = df['volume'].replace([np.inf, -np.inf], np.nan) + price = df['price'].replace([np.inf, -np.inf], np.nan) + valid_mask = (price > 0) & price.notna() & volume.notna() + if valid_mask.any(): + ratio = volume / price + normalized_df['volume_price_ratio'] = ratio.fillna(0) + + # Market dominance relative to rank + if 'dominance' in df.columns and 'rank' in df.columns: + dominance = df['dominance'].replace([np.inf, -np.inf], np.nan).fillna(0) + rank = df['rank'].replace([np.inf, -np.inf], np.nan).fillna(1000) # High rank for unknown + # Avoid division by zero + rank_reciprocal = 1 / (rank + 1e-8) + normalized_df['dominance_rank_ratio'] = (dominance / rank_reciprocal).fillna(0) + + return normalized_df + + def fit(self, df): + """Fit the normalizer on training data with crypto-specific preprocessing""" + if isinstance(df, dict): + df = pd.DataFrame([df]) + + self.feature_info = self._categorize_features(df) + + # Fit scalers for different feature types + feature_types = { + 'crypto_specific_features': RobustScaler(), # Rank and dominance can have outliers + 'price_volume_features': RobustScaler(), # Price and volume data often has outliers + 'exchange_features': StandardScaler(), # Exchange prices should be similar + 'performance_features': StandardScaler(), # Performance percentages + 'rank_diff_features': StandardScaler(), # Rank differences are usually small integers + 'technical_features': StandardScaler(), # Technical indicators are usually normalized + 'social_features': StandardScaler(), # Sentiment scores + 'transaction_features': PowerTransformer(), # Transaction data can be very skewed + 'quality_features': MinMaxScaler(), # Quality scores are usually 0-1 + 'numerical_features': PowerTransformer() # General numerical features + } + + for feature_type, scaler in feature_types.items(): + features = self.feature_info[feature_type] + if features: + # Filter existing columns + existing_features = [col for col in features if col in df.columns] + if existing_features: + # Handle outliers if enabled + if 
self.handle_outliers and feature_type in ['crypto_specific_features', 'price_volume_features']: + df_clean = df.copy() + for col in existing_features: + df_clean[col] = self._handle_outliers(df_clean, col) + else: + df_clean = df.copy() + + # Comprehensive data cleaning for fitting + try: + # Replace inf/-inf with NaN + df_clean[existing_features] = df_clean[existing_features].replace([np.inf, -np.inf], np.nan) + + # Fill NaN with appropriate strategy based on feature type + if feature_type in ['crypto_specific_features', 'price_volume_features']: + # For price/volume data, use forward fill then median + for col in existing_features: + df_clean[col] = df_clean[col].fillna(method='ffill').fillna(df_clean[col].median()).fillna(0) + elif feature_type in ['performance_features', 'rank_diff_features']: + # Performance and rank diffs can be 0 when no change + df_clean[existing_features] = df_clean[existing_features].fillna(0) + elif feature_type == 'quality_features': + # Quality features should default to reasonable values + df_clean[existing_features] = df_clean[existing_features].fillna(0.5) + else: + # General strategy: median then 0 + for col in existing_features: + df_clean[col] = df_clean[col].fillna(df_clean[col].median()).fillna(0) + + # Ensure no infinite values remain + df_clean[existing_features] = df_clean[existing_features].replace([np.inf, -np.inf], 0) + + # Fit the scaler + scaler.fit(df_clean[existing_features]) + self.scalers[feature_type] = scaler + self.feature_info[f'{feature_type}_existing'] = existing_features + + except Exception as e: + print(f"Warning: Could not fit scaler for {feature_type}: {e}") + # Skip this feature type if fitting fails + continue + + # Fit encoders for categorical features + for col in self.feature_info['categorical_features']: + if col in df.columns: + self.encoders[col] = LabelEncoder() + self.encoders[col].fit(df[col].astype(str).fillna('unknown')) + + self.is_fitted = True + return self + + def transform(self, data): + """Transform data using fitted normalizers with crypto-specific handling""" + if not self.is_fitted: + raise ValueError("Normalizer must be fitted before transform") + + if isinstance(data, dict): + df = pd.DataFrame([data]) + else: + df = data.copy() + + normalized_df = pd.DataFrame(index=df.index) + + # 1. Preserve symbol if requested + if self.preserve_symbol and 'symbol' in df.columns: + normalized_df['symbol'] = df['symbol'] + + # 2. Enhanced timestamp features + for col in self.feature_info['timestamp_features']: + if col in df.columns: + ts = pd.to_datetime(df[col], unit='ms', errors='coerce') + # Crypto markets are 24/7, so different time features + normalized_df[f'{col}_hour'] = ts.dt.hour / 23.0 + normalized_df[f'{col}_day_of_week'] = ts.dt.dayofweek / 6.0 + normalized_df[f'{col}_month'] = (ts.dt.month - 1) / 11.0 + normalized_df[f'{col}_quarter'] = (ts.dt.quarter - 1) / 3.0 + normalized_df[f'{col}_is_weekend'] = (ts.dt.dayofweek >= 5).astype(int) + # For crypto, we might want to track different time patterns + normalized_df[f'{col}_is_asian_hours'] = ((ts.dt.hour >= 0) & (ts.dt.hour <= 8)).astype(int) + normalized_df[f'{col}_is_european_hours'] = ((ts.dt.hour >= 8) & (ts.dt.hour <= 16)).astype(int) + normalized_df[f'{col}_is_american_hours'] = ((ts.dt.hour >= 16) & (ts.dt.hour <= 24)).astype(int) + + # 3. Binary features (keep as is, fill NaN with 0) + for col in self.feature_info['binary_features']: + if col in df.columns: + normalized_df[col] = df[col].fillna(0).astype(int) + + # 4. 
Categorical features with better encoding + for col in self.feature_info['categorical_features']: + if col in df.columns and col in self.encoders: + try: + # Handle unknown categories + values = df[col].astype(str).fillna('unknown') + encoded_values = [] + for val in values: + try: + encoded_values.append(self.encoders[col].transform([val])[0]) + except ValueError: + # Unknown category, assign most frequent class + encoded_values.append(0) + normalized_df[f'{col}_encoded'] = encoded_values + except Exception: + normalized_df[f'{col}_encoded'] = 0 + + # 5. Scale different feature types with appropriate scalers + feature_types = ['crypto_specific_features', 'price_volume_features', 'exchange_features', + 'performance_features', 'rank_diff_features', 'technical_features', + 'social_features', 'transaction_features', 'quality_features', 'numerical_features'] + + for feature_type in feature_types: + if feature_type in self.scalers: + existing_features = self.feature_info.get(f'{feature_type}_existing', []) + available_features = [col for col in existing_features if col in df.columns] + if available_features: + try: + # Handle outliers if enabled + if (self.handle_outliers and + feature_type in ['crypto_specific_features', 'price_volume_features']): + df_clean = df.copy() + for col in available_features: + if col in self.outlier_bounds: + lower_bound, upper_bound = self.outlier_bounds[col] + df_clean[col] = df_clean[col].clip(lower_bound, upper_bound) + else: + df_clean = df.copy() + + # Comprehensive data cleaning for transform + # Replace inf/-inf with NaN + df_clean[available_features] = df_clean[available_features].replace([np.inf, -np.inf], np.nan) + + # Fill NaN with appropriate strategy based on feature type + if feature_type in ['crypto_specific_features', 'price_volume_features']: + # For price/volume data, use forward fill then median from training + for col in available_features: + df_clean[col] = df_clean[col].fillna(method='ffill').fillna(method='bfill').fillna(0) + elif feature_type in ['performance_features', 'rank_diff_features']: + # Performance and rank diffs can be 0 when no change + df_clean[available_features] = df_clean[available_features].fillna(0) + elif feature_type == 'quality_features': + # Quality features should default to reasonable values + df_clean[available_features] = df_clean[available_features].fillna(0.5) + else: + # General strategy: 0 (since we don't have training medians in transform) + df_clean[available_features] = df_clean[available_features].fillna(0) + + # Ensure no infinite values remain + df_clean[available_features] = df_clean[available_features].replace([np.inf, -np.inf], 0) + + # Transform the data + scaled_data = self.scalers[feature_type].transform(df_clean[available_features]) + + # Add scaled features with descriptive names + scaler_name = type(self.scalers[feature_type]).__name__.lower().replace('scaler', '').replace('transformer', '') + for i, col in enumerate(available_features): + normalized_df[f'{col}_{scaler_name}_scaled'] = scaled_data[:, i] + + except Exception as e: + print(f"Warning: Could not transform {feature_type}: {e}") + # If transformation fails, add original features with minimal processing + for col in available_features: + if col in df.columns: + clean_col = df[col].replace([np.inf, -np.inf], np.nan).fillna(0) + normalized_df[f'{col}_raw'] = clean_col + + # 6. Crypto-specific feature engineering + normalized_df = self._engineer_crypto_features(df, normalized_df) + + # 7. 
Final comprehensive cleanup of any remaining issues + # Replace any infinite values that might have been created + normalized_df = normalized_df.replace([np.inf, -np.inf], np.nan) + + # Fill remaining NaN values with appropriate defaults + for col in normalized_df.columns: + if normalized_df[col].isna().any(): + if col == 'symbol': + continue # Don't fill symbol + elif 'sentiment' in col.lower(): + normalized_df[col] = normalized_df[col].fillna(0.5) # Neutral sentiment + elif 'ratio' in col.lower() or 'momentum' in col.lower(): + normalized_df[col] = normalized_df[col].fillna(0) # No change/neutral + elif 'hour' in col or 'day_of_week' in col or 'month' in col or 'quarter' in col: + normalized_df[col] = normalized_df[col].fillna(0) # Time features + elif col.endswith('_encoded'): + normalized_df[col] = normalized_df[col].fillna(0) # Encoded categories + else: + normalized_df[col] = normalized_df[col].fillna(0) # General fallback + + # Final validation - ensure no NaN or infinite values remain + try: + assert not normalized_df.isnull().any().any(), "Still contains NaN values after cleanup" + assert not np.isinf(normalized_df.select_dtypes(include=[np.number])).any().any(), "Still contains infinite values after cleanup" + except AssertionError as e: + print(f"Warning: {e}") + # Emergency cleanup + normalized_df = normalized_df.fillna(0).replace([np.inf, -np.inf], 0) + + return normalized_df + + def fit_transform(self, data): + """Fit and transform in one step""" + return self.fit(data).transform(data) + + def get_feature_importance_info(self): + """Return information about feature categories for model interpretation""" + return { + 'feature_categories': self.feature_info, + 'scalers_used': {k: type(v).__name__ for k, v in self.scalers.items()}, + 'total_features': sum(len(features) for features in self.feature_info.values() if isinstance(features, list)) + } + + def save(self, filepath): + """Save the fitted normalizer""" + with open(filepath, 'wb') as f: + pickle.dump({ + 'scalers': self.scalers, + 'encoders': self.encoders, + 'feature_info': self.feature_info, + 'is_fitted': self.is_fitted, + 'preserve_symbol': self.preserve_symbol, + 'handle_outliers': self.handle_outliers, + 'feature_engineering': self.feature_engineering, + 'outlier_bounds': self.outlier_bounds + }, f) + + def load(self, filepath): + """Load a fitted normalizer""" + with open(filepath, 'rb') as f: + data = pickle.load(f) + self.scalers = data['scalers'] + self.encoders = data['encoders'] + self.feature_info = data['feature_info'] + self.is_fitted = data['is_fitted'] + self.preserve_symbol = data.get('preserve_symbol', True) + self.handle_outliers = data.get('handle_outliers', True) + self.feature_engineering = data.get('feature_engineering', True) + self.outlier_bounds = data.get('outlier_bounds', {}) + return self + +def normalize_crypto_data_file(input_file, output_file, save_normalizer=True, **kwargs): + """ + Enhanced normalization function for crypto data + """ + # Load data + if input_file.endswith('.parquet'): + df = pd.read_parquet(input_file) + print(f"Loaded {len(df)} records with {len(df.columns)} features from parquet") + else: + data = [] + with open(input_file, 'r') as f: + for line in f: + data.append(json.loads(line.strip())) + df = pd.DataFrame(data) + print(f"Loaded {len(df)} records with {len(df.columns)} features from jsonl") + + # Initialize crypto normalizer + normalizer = CryptoDataNormalizer(**kwargs) + + # Show feature categorization + feature_info = normalizer._categorize_features(df) + 
print("\nCrypto Feature Categorization:") + for category, features in feature_info.items(): + if features: + print(f" {category}: {len(features)} features") + + # Fit and transform + normalized_df = normalizer.fit_transform(df) + + print(f"\nNormalized to {len(normalized_df.columns)} features") + print(f"Data shape: {normalized_df.shape}") + + # Show feature importance info + importance_info = normalizer.get_feature_importance_info() + print(f"\nScalers used: {importance_info['scalers_used']}") + + # Ensure output directory exists + import os + output_dir = os.path.dirname(output_file) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir, exist_ok=True) + + # Save normalized data as pickle instead of CSV + pkl_output_file = output_file.replace('.csv', '.pkl') + normalized_df.to_pickle(pkl_output_file) + print(f"Saved normalized data to {pkl_output_file}") + + # Save normalizer + if save_normalizer: + normalizer_file = output_file.replace('.csv', '_crypto_normalizer.pkl') + normalizer.save(normalizer_file) + print(f"Saved normalizer to {normalizer_file}") + + return normalized_df, normalizer + +# CLI function +import argparse + +def main(): + parser = argparse.ArgumentParser( + description="Enhanced normalization for cryptocurrency features with crypto-specific handling" + ) + parser.add_argument('input', nargs='?', default='data/merged/features/crypto_features.parquet', + help='Input file (.parquet or .jsonl)') + parser.add_argument('output', nargs='?', default='data/merged/features/norm/crypto_features_normalized.pkl', + help='Output PKL file for normalized features') + parser.add_argument('--no-save-normalizer', action='store_true', + help='Do not save the normalizer pickle') + parser.add_argument('--no-preserve-symbol', action='store_true', + help='Do not preserve symbol column') + parser.add_argument('--no-handle-outliers', action='store_true', + help='Do not handle outliers') + parser.add_argument('--no-feature-engineering', action='store_true', + help='Do not create engineered features') + parser.add_argument('--train', action='store_true', + help='Normalize the train file and save under train/norm/') + + args = parser.parse_args() + + kwargs = { + 'preserve_symbol': not args.no_preserve_symbol, + 'handle_outliers': not args.no_handle_outliers, + 'feature_engineering': not args.no_feature_engineering + } + + if args.train: + train_input = 'data/merged/train/crypto_features_train.parquet' + train_norm_dir = 'data/merged/train/norm' + os.makedirs(train_norm_dir, exist_ok=True) + train_output = os.path.join(train_norm_dir, 'crypto_features_train_normalized.pkl') + print(f"[INFO] Normalizing train file: {train_input} -> {train_output}") + normalize_crypto_data_file( + train_input, + train_output, + save_normalizer=not args.no_save_normalizer, + **kwargs + ) + else: + print(f"[INFO] Enhanced crypto normalizing: {args.input} -> {args.output}") + print(f"[INFO] Options: {kwargs}") + normalize_crypto_data_file( + args.input, + args.output, + save_normalizer=not args.no_save_normalizer, + **kwargs + ) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/norm/stocks.py b/src/merge/norm/stocks.py new file mode 100644 index 0000000000000000000000000000000000000000..dc76264f8046c3dd567b0b90db0c5b080559a617 --- /dev/null +++ b/src/merge/norm/stocks.py @@ -0,0 +1,600 @@ +import pandas as pd +import numpy as np +from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, PowerTransformer +import json +import 
pickle +from datetime import datetime +import warnings +warnings.filterwarnings('ignore') + +class ImprovedStockDataNormalizer: + """ + Enhanced normalization pipeline for stock features data with better feature handling + """ + + def __init__(self, preserve_symbol=True, handle_outliers=True, feature_engineering=True): + self.scalers = {} + self.encoders = {} + self.feature_info = {} + self.is_fitted = False + self.preserve_symbol = preserve_symbol + self.handle_outliers = handle_outliers + self.feature_engineering = feature_engineering + self.outlier_bounds = {} + + def _detect_outliers(self, df, column): + """Detect outliers using IQR method""" + Q1 = df[column].quantile(0.25) + Q3 = df[column].quantile(0.75) + IQR = Q3 - Q1 + lower_bound = Q1 - 1.5 * IQR + upper_bound = Q3 + 1.5 * IQR + return lower_bound, upper_bound + + def _handle_outliers(self, df, column, method='clip'): + """Handle outliers in numerical data""" + if column not in self.outlier_bounds: + lower_bound, upper_bound = self._detect_outliers(df, column) + self.outlier_bounds[column] = (lower_bound, upper_bound) + else: + lower_bound, upper_bound = self.outlier_bounds[column] + + if method == 'clip': + return df[column].clip(lower_bound, upper_bound) + elif method == 'remove': + return df[column].where((df[column] >= lower_bound) & (df[column] <= upper_bound)) + return df[column] + + def _categorize_features(self, df): + """Enhanced feature categorization with better detection""" + # Core identification features + id_features = ['symbol', 'backup_id', '__index_level_0__'] + + # Timestamp features + timestamp_features = [col for col in df.columns if 'timestamp' in col.lower()] + + # Binary features (0/1, True/False, or boolean-like) + binary_features = [] + for col in df.columns: + if col not in id_features + timestamp_features: + # Skip columns with array-like values (unhashable) + try: + vals = df[col].dropna().unique() + # If any value is a list/array, skip this column + if any(isinstance(v, (list, np.ndarray)) for v in vals): + continue + unique_vals = set(vals) + except TypeError: + continue + if (df[col].dtype == bool or + (len(unique_vals) <= 2 and unique_vals.issubset({0, 1, True, False, np.nan})) or + col.startswith('is_')): + binary_features.append(col) + + # Categorical features (strings, objects, or low cardinality integers) + categorical_features = [] + for col in df.columns: + if (col not in id_features + binary_features + timestamp_features and + (df[col].dtype == 'object' or + df[col].dtype.name == 'category' or + (df[col].nunique() < 20 and df[col].dtype in ['int64', 'int32']))): + categorical_features.append(col) + + # Price/volume features (need special handling) + price_volume_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['price', 'volume', 'vwap', 'market', 'cap']): + if col not in id_features + timestamp_features + binary_features + categorical_features: + price_volume_features.append(col) + + # Technical indicator features + technical_features = [] + tech_keywords = ['rsi', 'macd', 'ema', 'sma', 'bb_', 'cci', 'mfi', 'atr', 'stoch', 'roc'] + for col in df.columns: + if any(keyword in col.lower() for keyword in tech_keywords): + if col not in id_features + timestamp_features + binary_features + categorical_features + price_volume_features: + technical_features.append(col) + + # News/sentiment features + news_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['news', 'sentiment', 'pos', 'neg', 'neu']): + if col not in 
id_features + timestamp_features + binary_features + categorical_features + price_volume_features + technical_features: + news_features.append(col) + + # Count/ratio features + count_features = [] + for col in df.columns: + if any(keyword in col.lower() for keyword in ['count', 'size', 'ratio', 'change']): + if col not in id_features + timestamp_features + binary_features + categorical_features + price_volume_features + technical_features + news_features: + count_features.append(col) + + # Remaining numerical features + numerical_features = [] + all_categorized = (id_features + timestamp_features + binary_features + + categorical_features + price_volume_features + + technical_features + news_features + count_features) + + for col in df.columns: + if (col not in all_categorized and + pd.api.types.is_numeric_dtype(df[col])): + numerical_features.append(col) + + return { + 'id_features': id_features, + 'timestamp_features': timestamp_features, + 'binary_features': binary_features, + 'categorical_features': categorical_features, + 'price_volume_features': price_volume_features, + 'technical_features': technical_features, + 'news_features': news_features, + 'count_features': count_features, + 'numerical_features': numerical_features + } + + def _engineer_features(self, df, normalized_df): + """Create additional engineered features""" + if not self.feature_engineering: + return normalized_df + + # Price momentum features + if 'close' in df.columns and 'prev_close' in df.columns: + close = df['close'].replace([np.inf, -np.inf], np.nan) + prev_close = df['prev_close'].replace([np.inf, -np.inf], np.nan) + valid_mask = (prev_close > 0) & prev_close.notna() & close.notna() + if valid_mask.any(): + momentum = (close - prev_close) / prev_close + normalized_df['price_momentum'] = momentum.fillna(0) + + # Volume-price relationship + if 'volume' in df.columns and 'close' in df.columns: + volume = df['volume'].replace([np.inf, -np.inf], np.nan) + close = df['close'].replace([np.inf, -np.inf], np.nan) + valid_mask = (close > 0) & close.notna() & volume.notna() + if valid_mask.any(): + ratio = volume / close + normalized_df['volume_price_ratio'] = ratio.fillna(0) + + # Volatility features + if 'high' in df.columns and 'low' in df.columns and 'close' in df.columns: + high = df['high'].replace([np.inf, -np.inf], np.nan) + low = df['low'].replace([np.inf, -np.inf], np.nan) + close = df['close'].replace([np.inf, -np.inf], np.nan) + valid_mask = (close > 0) & close.notna() & high.notna() & low.notna() + if valid_mask.any(): + daily_range = (high - low) / close + normalized_df['daily_range'] = daily_range.fillna(0) + + # News sentiment aggregation + sentiment_cols = [col for col in df.columns if 'sentiment' in col.lower() and 'mean' in col.lower()] + if sentiment_cols: + sentiment_data = df[sentiment_cols].replace([np.inf, -np.inf], np.nan) + normalized_df['avg_sentiment'] = sentiment_data.mean(axis=1).fillna(0.5) # Neutral sentiment + + # Technical indicator strength + tech_cols = [col for col in df.columns if any(tech in col.lower() for tech in ['rsi', 'macd', 'cci'])] + if tech_cols: + tech_data = df[tech_cols].replace([np.inf, -np.inf], np.nan) + normalized_df['technical_strength'] = tech_data.mean(axis=1).fillna(0) + + return normalized_df + + def fit(self, df): + """Fit the normalizer on training data with enhanced preprocessing""" + if isinstance(df, dict): + df = pd.DataFrame([df]) + + self.feature_info = self._categorize_features(df) + + # Fit scalers for different feature types + feature_types = 
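# --- Illustrative sketch (not part of the patch): the three engineered ratios built in
# _engineer_features above, computed on a single toy row. The input values are hypothetical.
import pandas as pd

row = pd.DataFrame({"close": [102.0], "prev_close": [100.0],
                    "volume": [2_040_000.0], "high": [104.0], "low": [99.0]})
print(((row["close"] - row["prev_close"]) / row["prev_close"]).iloc[0])  # price_momentum = 0.02
print((row["volume"] / row["close"]).iloc[0])                            # volume_price_ratio = 20000.0
print(((row["high"] - row["low"]) / row["close"]).iloc[0])               # daily_range ~= 0.049
# --- end sketch ---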
['price_volume_features', 'technical_features', 'news_features', + 'count_features', 'numerical_features'] + + for feature_type in feature_types: + features = self.feature_info[feature_type] + if features: + # Filter existing columns + existing_features = [col for col in features if col in df.columns] + if existing_features: + # Choose appropriate scaler based on feature type + if feature_type == 'price_volume_features': + scaler = RobustScaler() # Robust to outliers + elif feature_type == 'technical_features': + scaler = StandardScaler() # Most technical indicators are already normalized + elif feature_type in ['count_features', 'numerical_features']: + scaler = PowerTransformer(method='yeo-johnson') # Handle skewed distributions + else: + scaler = StandardScaler() + + try: + # Handle outliers if enabled + if self.handle_outliers: + df_clean = df.copy() + for col in existing_features: + df_clean[col] = self._handle_outliers(df_clean, col) + else: + df_clean = df.copy() + + # Comprehensive data cleaning for fitting + # Replace inf/-inf with NaN + df_clean[existing_features] = df_clean[existing_features].replace([np.inf, -np.inf], np.nan) + + # Fill NaN with appropriate strategy based on feature type + if feature_type == 'price_volume_features': + # For price/volume data, use forward fill then median + for col in existing_features: + df_clean[col] = df_clean[col].fillna(method='ffill').fillna(df_clean[col].median()).fillna(0) + elif feature_type == 'technical_features': + # Technical indicators: use median for each column + for col in existing_features: + median_val = df_clean[col].median() + df_clean[col] = df_clean[col].fillna(median_val if not pd.isna(median_val) else 0) + elif feature_type == 'news_features': + # News features: neutral values + for col in existing_features: + if 'sentiment' in col.lower(): + df_clean[col] = df_clean[col].fillna(0.5) # Neutral sentiment + elif 'count' in col.lower(): + df_clean[col] = df_clean[col].fillna(0) # No news + else: + df_clean[col] = df_clean[col].fillna(df_clean[col].median()).fillna(0) + else: + # General strategy: median then 0 + for col in existing_features: + df_clean[col] = df_clean[col].fillna(df_clean[col].median()).fillna(0) + + # Ensure no infinite values remain + df_clean[existing_features] = df_clean[existing_features].replace([np.inf, -np.inf], 0) + + # Fit the scaler + scaler.fit(df_clean[existing_features]) + self.scalers[feature_type] = scaler + self.feature_info[f'{feature_type}_existing'] = existing_features + + except Exception as e: + print(f"Warning: Could not fit scaler for {feature_type}: {e}") + # Skip this feature type if fitting fails + continue + + # Fit encoders for categorical features + for col in self.feature_info['categorical_features']: + if col in df.columns: + self.encoders[col] = LabelEncoder() + self.encoders[col].fit(df[col].astype(str).fillna('unknown')) + + self.is_fitted = True + return self + + def transform(self, data): + """Transform data using fitted normalizers with enhanced feature handling""" + if not self.is_fitted: + raise ValueError("Normalizer must be fitted before transform") + + if isinstance(data, dict): + df = pd.DataFrame([data]) + else: + df = data.copy() + + normalized_df = pd.DataFrame(index=df.index) + + # 1. Preserve symbol if requested + if self.preserve_symbol and 'symbol' in df.columns: + normalized_df['symbol'] = df['symbol'] + + # 2. 
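# --- Illustrative sketch (not part of the patch): the per-family scaler choice made in fit()
# (RobustScaler for price/volume, StandardScaler for technical indicators, Yeo-Johnson
# PowerTransformer for skewed counts). Column names are hypothetical; assumes scikit-learn.
import pandas as pd
from sklearn.preprocessing import RobustScaler, StandardScaler, PowerTransformer

toy = pd.DataFrame({"close": [10.0, 11.0, 10.5, 250.0],    # heavy-tailed price series
                    "rsi": [35.0, 50.0, 65.0, 70.0],       # roughly bounded indicator
                    "news_count": [0.0, 1.0, 1.0, 30.0]})  # skewed count
scaler_by_family = {"close": RobustScaler(),
                    "rsi": StandardScaler(),
                    "news_count": PowerTransformer(method="yeo-johnson")}
for col, scaler in scaler_by_family.items():
    toy[f"{col}_scaled"] = scaler.fit_transform(toy[[col]]).ravel()
print(toy.round(2))
# --- end sketch ---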
Enhanced timestamp features + for col in self.feature_info['timestamp_features']: + if col in df.columns: + ts = pd.to_datetime(df[col], unit='ms', errors='coerce') + # More comprehensive time features + normalized_df[f'{col}_hour'] = ts.dt.hour / 23.0 + normalized_df[f'{col}_day_of_week'] = ts.dt.dayofweek / 6.0 + normalized_df[f'{col}_month'] = (ts.dt.month - 1) / 11.0 + normalized_df[f'{col}_quarter'] = (ts.dt.quarter - 1) / 3.0 + normalized_df[f'{col}_is_weekend'] = (ts.dt.dayofweek >= 5).astype(int) + normalized_df[f'{col}_is_market_hours'] = ((ts.dt.hour >= 9) & (ts.dt.hour <= 16) & (ts.dt.dayofweek < 5)).astype(int) + + # 3. Binary features (keep as is, fill NaN with 0) + for col in self.feature_info['binary_features']: + if col in df.columns: + normalized_df[col] = df[col].fillna(0).astype(int) + + # 4. Categorical features with better encoding + for col in self.feature_info['categorical_features']: + if col in df.columns and col in self.encoders: + try: + # Handle unknown categories + values = df[col].astype(str).fillna('unknown') + encoded_values = [] + for val in values: + try: + encoded_values.append(self.encoders[col].transform([val])[0]) + except ValueError: + # Unknown category, assign most frequent class + encoded_values.append(0) + normalized_df[f'{col}_encoded'] = encoded_values + except Exception: + normalized_df[f'{col}_encoded'] = 0 + + # 5. Scale different feature types with appropriate scalers + feature_types = ['price_volume_features', 'technical_features', 'news_features', + 'count_features', 'numerical_features'] + + for feature_type in feature_types: + if feature_type in self.scalers: + existing_features = self.feature_info.get(f'{feature_type}_existing', []) + available_features = [col for col in existing_features if col in df.columns] + + if available_features: + try: + # Handle outliers if enabled + if self.handle_outliers: + df_clean = df.copy() + for col in available_features: + if col in self.outlier_bounds: + lower_bound, upper_bound = self.outlier_bounds[col] + df_clean[col] = df_clean[col].clip(lower_bound, upper_bound) + else: + df_clean = df.copy() + + # Comprehensive data cleaning for transform + # Replace inf/-inf with NaN + df_clean[available_features] = df_clean[available_features].replace([np.inf, -np.inf], np.nan) + + # Fill NaN with appropriate strategy based on feature type + if feature_type == 'price_volume_features': + # For price/volume data, use forward fill then back fill + for col in available_features: + df_clean[col] = df_clean[col].fillna(method='ffill').fillna(method='bfill').fillna(0) + elif feature_type == 'technical_features': + # Technical indicators: use neutral values + for col in available_features: + if 'rsi' in col.lower(): + df_clean[col] = df_clean[col].fillna(50) # Neutral RSI + elif any(indicator in col.lower() for indicator in ['macd', 'cci']): + df_clean[col] = df_clean[col].fillna(0) # Neutral MACD/CCI + else: + df_clean[col] = df_clean[col].fillna(0) + elif feature_type == 'news_features': + # News features: neutral values + for col in available_features: + if 'sentiment' in col.lower(): + df_clean[col] = df_clean[col].fillna(0.5) # Neutral sentiment + elif 'count' in col.lower(): + df_clean[col] = df_clean[col].fillna(0) # No news + else: + df_clean[col] = df_clean[col].fillna(0) + else: + # General strategy: 0 (since we don't have training medians in transform) + df_clean[available_features] = df_clean[available_features].fillna(0) + + # Ensure no infinite values remain + df_clean[available_features] = 
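# --- Illustrative sketch (not part of the patch): the scaled calendar features that
# transform() derives from an epoch-millisecond timestamp column.
import pandas as pd

ts = pd.to_datetime(pd.Series([1640995200000]), unit="ms", errors="coerce")  # 2022-01-01 00:00 UTC
print((ts.dt.hour / 23.0).iloc[0])                  # 0.0
print((ts.dt.dayofweek / 6.0).iloc[0])              # Saturday -> 5/6 ~= 0.833
print(int((ts.dt.dayofweek >= 5).iloc[0]))          # is_weekend -> 1
print(int(((ts.dt.hour >= 9) & (ts.dt.hour <= 16) & (ts.dt.dayofweek < 5)).iloc[0]))  # is_market_hours -> 0
# --- end sketch ---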
df_clean[available_features].replace([np.inf, -np.inf], 0) + + # Transform the data + scaled_data = self.scalers[feature_type].transform(df_clean[available_features]) + + # Add scaled features with descriptive names + scaler_name = type(self.scalers[feature_type]).__name__.lower().replace('scaler', '').replace('transformer', '') + for i, col in enumerate(available_features): + normalized_df[f'{col}_{scaler_name}_scaled'] = scaled_data[:, i] + + except Exception as e: + print(f"Warning: Could not transform {feature_type}: {e}") + # If transformation fails, add original features with minimal processing + for col in available_features: + if col in df.columns: + clean_col = df[col].replace([np.inf, -np.inf], np.nan).fillna(0) + normalized_df[f'{col}_raw'] = clean_col + + # 6. Feature engineering + normalized_df = self._engineer_features(df, normalized_df) + + # 7. Final comprehensive cleanup of any remaining issues + # Replace any infinite values that might have been created + normalized_df = normalized_df.replace([np.inf, -np.inf], np.nan) + + # Fill remaining NaN values with appropriate defaults + for col in normalized_df.columns: + if normalized_df[col].isna().any(): + if col == 'symbol': + continue # Don't fill symbol + elif 'sentiment' in col.lower(): + normalized_df[col] = normalized_df[col].fillna(0.5) # Neutral sentiment + elif 'ratio' in col.lower() or 'momentum' in col.lower(): + normalized_df[col] = normalized_df[col].fillna(0) # No change/neutral + elif 'hour' in col or 'day_of_week' in col or 'month' in col or 'quarter' in col: + normalized_df[col] = normalized_df[col].fillna(0) # Time features + elif col.endswith('_encoded'): + normalized_df[col] = normalized_df[col].fillna(0) # Encoded categories + else: + normalized_df[col] = normalized_df[col].fillna(0) # General fallback + + # Final validation - ensure no NaN or infinite values remain + try: + assert not normalized_df.isnull().any().any(), "Still contains NaN values after cleanup" + assert not np.isinf(normalized_df.select_dtypes(include=[np.number])).any().any(), "Still contains infinite values after cleanup" + except AssertionError as e: + print(f"Warning: {e}") + # Emergency cleanup + normalized_df = normalized_df.fillna(0).replace([np.inf, -np.inf], 0) + + return normalized_df + + def fit_transform(self, data): + """Fit and transform in one step""" + return self.fit(data).transform(data) + + def get_feature_importance_info(self): + """Return information about feature categories for model interpretation""" + return { + 'feature_categories': self.feature_info, + 'scalers_used': {k: type(v).__name__ for k, v in self.scalers.items()}, + 'total_features': sum(len(features) for features in self.feature_info.values() if isinstance(features, list)) + } + + def save(self, filepath): + """Save the fitted normalizer""" + with open(filepath, 'wb') as f: + pickle.dump({ + 'scalers': self.scalers, + 'encoders': self.encoders, + 'feature_info': self.feature_info, + 'is_fitted': self.is_fitted, + 'preserve_symbol': self.preserve_symbol, + 'handle_outliers': self.handle_outliers, + 'feature_engineering': self.feature_engineering, + 'outlier_bounds': self.outlier_bounds + }, f) + + def load(self, filepath): + """Load a fitted normalizer""" + with open(filepath, 'rb') as f: + data = pickle.load(f) + self.scalers = data['scalers'] + self.encoders = data['encoders'] + self.feature_info = data['feature_info'] + self.is_fitted = data['is_fitted'] + self.preserve_symbol = data.get('preserve_symbol', True) + self.handle_outliers = 
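# --- Illustrative sketch (not part of the patch): how the class above is intended to be
# driven end to end. The toy frame and its values are hypothetical.
import numpy as np
import pandas as pd

train_df = pd.DataFrame({"symbol": ["AAPL", "MSFT", "AAPL", "MSFT"],
                         "close": [150.0, 300.0, 152.0, np.nan],
                         "volume": [1e6, 5e5, 1.1e6, 4e5],
                         "rsi": [65.0, 45.0, np.nan, 50.0],
                         "interval_timestamp": [1640995200000] * 4})
normalizer = ImprovedStockDataNormalizer(preserve_symbol=True,
                                         handle_outliers=True,
                                         feature_engineering=True)
train_norm = normalizer.fit_transform(train_df)                   # fit scalers/encoders, then transform
print(normalizer.get_feature_importance_info()["scalers_used"])   # e.g. which scaler served each family
# --- end sketch ---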
data.get('handle_outliers', True) + self.feature_engineering = data.get('feature_engineering', True) + self.outlier_bounds = data.get('outlier_bounds', {}) + return self + +def cap_outliers(df, features=None, method='iqr', factor=1.5): + """ + Cap outliers in the DataFrame for the given features using the IQR method. + If features is None, all numeric columns are used. + """ + capped_df = df.copy() + if features is None: + features = capped_df.select_dtypes(include=[np.number]).columns + for col in features: + if col not in capped_df.columns: + continue + Q1 = capped_df[col].quantile(0.25) + Q3 = capped_df[col].quantile(0.75) + IQR = Q3 - Q1 + lower = Q1 - factor * IQR + upper = Q3 + factor * IQR + capped_df[col] = np.clip(capped_df[col], lower, upper) + print(f"Capped outliers in {col}: [{lower:.3g}, {upper:.3g}]") + return capped_df + +# Example usage after normalization: +# normalized_df = cap_outliers(normalized_df, features=['price_momentum', 'volume_price_ratio', 'daily_range', 'technical_strength']) +# (You can call this function in your pipeline after normalization, before saving or modeling.) + +def normalize_stock_data_file_improved(input_file, output_file, save_normalizer=True, **kwargs): + """ + Enhanced normalization function with better defaults + """ + # Load data + if input_file.endswith('.parquet'): + df = pd.read_parquet(input_file) + print(f"Loaded {len(df)} records with {len(df.columns)} features from parquet") + else: + data = [] + with open(input_file, 'r') as f: + for line in f: + data.append(json.loads(line.strip())) + df = pd.DataFrame(data) + print(f"Loaded {len(df)} records with {len(df.columns)} features from jsonl") + + # Initialize improved normalizer + normalizer = ImprovedStockDataNormalizer(**kwargs) + + # Show feature categorization + feature_info = normalizer._categorize_features(df) + print("\nFeature Categorization:") + for category, features in feature_info.items(): + if features: + print(f" {category}: {len(features)} features") + + # Fit and transform + normalized_df = normalizer.fit_transform(df) + + print(f"\nNormalized to {len(normalized_df.columns)} features") + print(f"Data shape: {normalized_df.shape}") + + # Cap outliers in engineered features + engineered_features = ['price_momentum', 'volume_price_ratio', 'daily_range', 'technical_strength'] + normalized_df = cap_outliers(normalized_df, features=[f for f in engineered_features if f in normalized_df.columns]) + + # Show feature importance info + importance_info = normalizer.get_feature_importance_info() + print(f"\nScalers used: {importance_info['scalers_used']}") + + # Ensure output directory exists + import os + output_dir = os.path.dirname(output_file) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir, exist_ok=True) + + # Save normalized data as pickle + pkl_output_file = output_file.replace('.csv', '.pkl') + normalized_df.to_pickle(pkl_output_file) + print(f"Saved normalized data to {pkl_output_file}") + + # Save normalizer + if save_normalizer: + normalizer_file = pkl_output_file.replace('.pkl', '_improved_normalizer.pkl') + normalizer.save(normalizer_file) + print(f"Saved normalizer to {normalizer_file}") + + return normalized_df, normalizer + +# CLI function +import argparse + +def main(): + parser = argparse.ArgumentParser( + description="Enhanced normalization for stock/crypto features with better handling of different feature types" + ) + parser.add_argument('input', nargs='?', default='data/merged/features/stocks_features.parquet', + help='Input file 
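# --- Illustrative sketch (not part of the patch): cap_outliers() applied to one engineered
# column, mirroring the post-normalization capping step in normalize_stock_data_file_improved().
import pandas as pd

toy = pd.DataFrame({"price_momentum": [0.01, -0.02, 0.015, 5.0]})   # 5.0 is a spurious spike
capped = cap_outliers(toy, features=["price_momentum"])             # prints the clipping bounds it used
print(capped["price_momentum"].tolist())                            # the spike is pulled back inside the IQR band
# --- end sketch ---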
(.parquet or .jsonl)') + parser.add_argument('output', nargs='?', default='data/merged/features/norm/stocks_features_improved_normalized.pkl', + help='Output pickle file for normalized features') + parser.add_argument('--no-save-normalizer', action='store_true', + help='Do not save the normalizer pickle') + parser.add_argument('--no-preserve-symbol', action='store_true', + help='Do not preserve symbol column') + parser.add_argument('--no-handle-outliers', action='store_true', + help='Do not handle outliers') + parser.add_argument('--no-feature-engineering', action='store_true', + help='Do not create engineered features') + parser.add_argument('--train', action='store_true', + help='Normalize the train file and save under train/norm/') + + args = parser.parse_args() + + kwargs = { + 'preserve_symbol': not args.no_preserve_symbol, + 'handle_outliers': not args.no_handle_outliers, + 'feature_engineering': not args.no_feature_engineering + } + + if args.train: + train_input = 'data/merged/train/stocks_features_train.parquet' + train_norm_dir = 'data/merged/train/norm' + import os + os.makedirs(train_norm_dir, exist_ok=True) + train_output = os.path.join(train_norm_dir, 'stocks_features_train_normalized.pkl') + print(f"[INFO] Normalizing train file: {train_input} -> {train_output}") + normalize_stock_data_file_improved( + train_input, + train_output, + save_normalizer=not args.no_save_normalizer, + **kwargs + ) + else: + print(f"[INFO] Enhanced normalizing: {args.input} -> {args.output}") + print(f"[INFO] Options: {kwargs}") + normalize_stock_data_file_improved( + args.input, + args.output, + save_normalizer=not args.no_save_normalizer, + **kwargs + ) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/merge/norm/test_null_handling.py b/src/merge/norm/test_null_handling.py new file mode 100644 index 0000000000000000000000000000000000000000..e98a837324b7dc4ec4f242f062ef576f647233f5 --- /dev/null +++ b/src/merge/norm/test_null_handling.py @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Test script to verify null handling improvements in normalization +""" + +import pandas as pd +import numpy as np +import sys +from pathlib import Path + +# Add the norm directory to path +sys.path.append(str(Path(__file__).parent)) + +# Import the normalizers +from crypto import CryptoDataNormalizer +from stocks import ImprovedStockDataNormalizer + +def create_test_crypto_data(): + """Create test crypto data with various null scenarios""" + data = { + 'symbol': ['bitcoin', 'ethereum', 'cardano'] * 10, + 'price': [50000, np.nan, 2000] * 10, + 'volume': [1000000, 2000000, np.inf] * 10, + 'dominance': [0.4, 0.15, np.nan] * 10, + 'rank': [1, 2, 8] * 10, + 'performance.day': [2.5, -1.2, np.nan] * 10, + 'performance.week': [-5.0, np.inf, 1.5] * 10, + 'exchangePrices.binance': [50001, 1601, np.nan] * 10, + 'exchangePrices.coinbase': [49999, np.nan, 2001] * 10, + 'rsi': [65, np.nan, 45] * 10, + 'macd': [100, -50, np.nan] * 10, + 'interval_timestamp': [1640995200000] * 30, + 'stable': [False, False, False] * 10, + 'transaction_count': [1000, np.nan, 500] * 10 + } + return pd.DataFrame(data) + +def create_test_stock_data(): + """Create test stock data with various null scenarios""" + data = { + 'symbol': ['AAPL', 'GOOGL', 'MSFT'] * 10, + 'close': [150, np.nan, 300] * 10, + 'prev_close': [148, 2850, np.inf] * 10, + 'volume': [1000000, 500000, np.nan] * 10, + 'high': [152, 2870, 305] * 10, + 'low': [147, np.nan, 295] * 10, + 'rsi': [65, 45, np.nan] * 10, + 'macd': [1.5, -0.8, np.nan] * 10, + 
'news_sentiment_mean_x': [0.7, np.nan, 0.3] * 10, + 'news_articles_count_x': [5, 0, np.nan] * 10, + 'marketCapitalization': [2500000000000, np.inf, 2000000000000] * 10, + 'interval_timestamp': [1640995200000] * 30 + } + return pd.DataFrame(data) + +def test_crypto_normalizer(): + """Test crypto normalizer with null handling""" + print("Testing Crypto Normalizer...") + + # Create test data + df = create_test_crypto_data() + print(f"Original data shape: {df.shape}") + print(f"Original nulls: {df.isnull().sum().sum()}") + print(f"Original infinite values: {np.isinf(df.select_dtypes(include=[np.number])).sum().sum()}") + + # Initialize and test normalizer + try: + normalizer = CryptoDataNormalizer() + normalized = normalizer.fit_transform(df) + + print(f"Normalized data shape: {normalized.shape}") + print(f"Remaining nulls: {normalized.isnull().sum().sum()}") + print(f"Remaining infinite values: {np.isinf(normalized.select_dtypes(include=[np.number])).sum().sum()}") + + if normalized.isnull().sum().sum() == 0 and np.isinf(normalized.select_dtypes(include=[np.number])).sum().sum() == 0: + print("✅ Crypto normalizer passed null handling test!") + return True + else: + print("❌ Crypto normalizer failed null handling test!") + return False + + except Exception as e: + print(f"❌ Crypto normalizer failed with error: {e}") + return False + +def test_stock_normalizer(): + """Test stock normalizer with null handling""" + print("\nTesting Stock Normalizer...") + + # Create test data + df = create_test_stock_data() + print(f"Original data shape: {df.shape}") + print(f"Original nulls: {df.isnull().sum().sum()}") + print(f"Original infinite values: {np.isinf(df.select_dtypes(include=[np.number])).sum().sum()}") + + # Initialize and test normalizer + try: + normalizer = ImprovedStockDataNormalizer() + normalized = normalizer.fit_transform(df) + + print(f"Normalized data shape: {normalized.shape}") + print(f"Remaining nulls: {normalized.isnull().sum().sum()}") + print(f"Remaining infinite values: {np.isinf(normalized.select_dtypes(include=[np.number])).sum().sum()}") + + if normalized.isnull().sum().sum() == 0 and np.isinf(normalized.select_dtypes(include=[np.number])).sum().sum() == 0: + print("✅ Stock normalizer passed null handling test!") + return True + else: + print("❌ Stock normalizer failed null handling test!") + return False + + except Exception as e: + print(f"❌ Stock normalizer failed with error: {e}") + return False + +def main(): + """Run all tests""" + print("="*60) + print("TESTING NULL HANDLING IMPROVEMENTS") + print("="*60) + + crypto_passed = test_crypto_normalizer() + stock_passed = test_stock_normalizer() + + print("\n" + "="*60) + print("TEST RESULTS SUMMARY") + print("="*60) + + if crypto_passed and stock_passed: + print("🎉 All tests passed! Null handling improvements are working correctly.") + return 0 + else: + print("❌ Some tests failed. 
Review the output above for details.") + return 1 + +if __name__ == "__main__": + exit_code = main() + sys.exit(exit_code) diff --git a/src/merge/normalize.py b/src/merge/normalize.py new file mode 100644 index 0000000000000000000000000000000000000000..54f954e3226e5326cb8cf202eed5eb99ddd90ee6 --- /dev/null +++ b/src/merge/normalize.py @@ -0,0 +1,23 @@ +# This script runs both the stock and crypto normalization pipelines from the norm/ directory +import sys +import os + +# Add norm directory to sys.path for imports +norm_dir = os.path.join(os.path.dirname(__file__), 'norm') +sys.path.insert(0, norm_dir) + +# Import and run stock normalization +try: + from norm import stocks + print("\n--- Running Stock Normalization ---") + stocks.main() +except Exception as e: + print(f"[ERROR] Stock normalization failed: {e}") + +# Import and run crypto normalization +try: + from norm import crypto + print("\n--- Running Crypto Normalization ---") + crypto.main() +except Exception as e: + print(f"[ERROR] Crypto normalization failed: {e}") diff --git a/src/merge/remove_null_symbols.py b/src/merge/remove_null_symbols.py new file mode 100644 index 0000000000000000000000000000000000000000..90085c50cfabfe6216d4619dfcee95cac06cb9cf --- /dev/null +++ b/src/merge/remove_null_symbols.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 +""" +Remove rows with null symbols from crypto and stock features. +This script ensures that all records have valid symbols for downstream processing. +""" + +import pandas as pd +from pathlib import Path + +def remove_null_symbols(): + """Remove rows with null symbols from crypto and stock features.""" + + # Process crypto features + crypto_path = Path("data/merged/features/crypto_features.parquet") + if crypto_path.exists(): + df_crypto = pd.read_parquet(crypto_path) + + initial_count = len(df_crypto) + null_count = df_crypto['symbol'].isnull().sum() + + if null_count > 0: + # Remove null symbol rows + df_crypto_clean = df_crypto[df_crypto['symbol'].notnull()].copy() + + final_count = len(df_crypto_clean) + removed_count = initial_count - final_count + + print(f"[CRYPTO] Removed {removed_count} rows with null symbols ({final_count} remaining)") + + # Save cleaned data + df_crypto_clean.to_parquet(crypto_path, index=False) + + # Verify no null symbols remain + remaining_nulls = df_crypto_clean['symbol'].isnull().sum() + if remaining_nulls > 0: + print(f"⚠️ Warning: {remaining_nulls} null symbols still remain") + + # Process stock features + stocks_path = Path("data/merged/features/stocks_features.parquet") + if stocks_path.exists(): + df_stocks = pd.read_parquet(stocks_path) + + initial_count = len(df_stocks) + null_count = df_stocks['symbol'].isnull().sum() + + if null_count > 0: + # Remove null symbol rows + df_stocks_clean = df_stocks[df_stocks['symbol'].notnull()].copy() + + final_count = len(df_stocks_clean) + removed_count = initial_count - final_count + + print(f"[STOCKS] Removed {removed_count} rows with null symbols ({final_count} remaining)") + + # Save cleaned data + df_stocks_clean.to_parquet(stocks_path, index=False) + + # Verify no null symbols remain + remaining_nulls = df_stocks_clean['symbol'].isnull().sum() + if remaining_nulls > 0: + print(f"⚠️ Warning: {remaining_nulls} null symbols still remain") + +if __name__ == "__main__": + remove_null_symbols() diff --git a/src/merge/run_final_null_handling.py b/src/merge/run_final_null_handling.py new file mode 100644 index 0000000000000000000000000000000000000000..2876dea12d002e0f34c2708f5570ec2528e3ab4c --- /dev/null +++ 
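# --- Illustrative sketch (not part of the patch): the row filter remove_null_symbols() applies,
# shown on a toy frame instead of the real parquet files.
import pandas as pd

toy = pd.DataFrame({"symbol": ["AAPL", None, "bitcoin"], "close": [150.0, 10.0, 50000.0]})
clean = toy[toy["symbol"].notnull()].copy()
print(len(toy) - len(clean), "rows with null symbols removed")   # 1
# --- end sketch ---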
b/src/merge/run_final_null_handling.py @@ -0,0 +1,281 @@ +#!/usr/bin/env python3 +""" +Final Null Handler Integration Script +Integrates the final null value handler into the existing merge pipeline. +""" + +import sys +import subprocess +from pathlib import Path +import numpy as np +import pandas as pd +from final_null_handler import FinalNullValueHandler, process_crypto_features_file, process_stock_features_file + +def run_final_null_handling(): + """Run the final null value handling on all feature files""" + + print("="*60) + print("STARTING FINAL NULL VALUE HANDLING") + print("="*60) + + base_path = Path("data/merged/features") + + files_to_process = [ + ("crypto_features.parquet", "crypto"), + ("stocks_features.parquet", "stock"), + ("merged_features.parquet", "merged") + ] + + results = {} + + for filename, file_type in files_to_process: + file_path = base_path / filename + + if not file_path.exists(): + print(f"[WARNING] {filename} not found, skipping...") + continue + + print(f"\n[INFO] Processing {filename}...") + + try: + if file_type == "crypto": + df_processed, report = process_crypto_features_file(file_path) + elif file_type == "stock": + df_processed, report = process_stock_features_file(file_path) + elif file_type == "merged": + # For merged file, determine type by content + df_processed, report = process_merged_features_file(file_path) + + results[file_type] = { + 'success': True, + 'file_path': file_path, + 'report': report, + 'rows': len(df_processed), + 'nulls_filled': report['total_nulls_filled'] + } + + print(f"[SUCCESS] {filename} processed successfully!") + print(f" - Rows: {len(df_processed):,}") + print(f" - Nulls filled: {report['total_nulls_filled']:,}") + + except Exception as e: + print(f"[ERROR] Error processing {filename}: {str(e)}") + results[file_type] = { + 'success': False, + 'error': str(e), + 'file_path': file_path + } + + return results + +def process_merged_features_file(file_path): + """Process merged features file (contains both crypto and stock data)""" + print(f"Loading merged features from {file_path}...") + df = pd.read_parquet(file_path) + + print(f"Loaded {len(df)} rows with {len(df.columns)} columns") + print(f"Null values before processing: {df.isnull().sum().sum()}") + + handler = FinalNullValueHandler() + + # Separate crypto and stock data if possible + if 'symbol' in df.columns: + # Detect crypto vs stock based on available columns + crypto_indicators = ['rank', 'dominance', 'performance.day', 'exchangePrices.binance'] + stock_indicators = ['news_activity_score_x', 'strongBuy', 'marketCapitalization'] + + has_crypto_cols = any(col in df.columns for col in crypto_indicators) + has_stock_cols = any(col in df.columns for col in stock_indicators) + + if has_crypto_cols and has_stock_cols: + # Mixed data - process intelligently + print("Detected mixed crypto/stock data - processing intelligently...") + + # Try to separate by symbol patterns or available data + crypto_mask = df['rank'].notna() | df['dominance'].notna() + if crypto_mask.any(): + print(f"Processing {crypto_mask.sum()} rows as crypto data...") + df_crypto = df[crypto_mask].copy() + df_crypto_processed = handler.process_crypto_features(df_crypto) + df.loc[crypto_mask] = df_crypto_processed + + stock_mask = ~crypto_mask + if stock_mask.any(): + print(f"Processing {stock_mask.sum()} rows as stock data...") + df_stock = df[stock_mask].copy() + df_stock_processed = handler.process_stock_features(df_stock) + df.loc[stock_mask] = df_stock_processed + + df_processed = df + + elif 
has_crypto_cols: + print("Detected crypto-only data...") + df_processed = handler.process_crypto_features(df) + elif has_stock_cols: + print("Detected stock-only data...") + df_processed = handler.process_stock_features(df) + else: + print("Could not determine data type, applying generic processing...") + df_processed = handler.process_stock_features(df) # Default to stock processing + else: + print("No symbol column found, applying generic processing...") + df_processed = handler.process_stock_features(df) + + print(f"Null values after processing: {df_processed.isnull().sum().sum()}") + + # Generate report + report = handler.generate_report(df, df_processed, 'merged') + + # Save processed data + df_processed.to_parquet(file_path, index=False) + print(f"Saved processed merged features to {file_path}") + + return df_processed, report + +def validate_data_quality(results): + """Validate that the data quality is maintained after null handling""" + print("\n" + "="*60) + print("DATA QUALITY VALIDATION") + print("="*60) + + validation_results = {} + + for file_type, result in results.items(): + if not result.get('success', False): + continue + + file_path = result['file_path'] + + try: + df = pd.read_parquet(file_path) + + # Basic validation checks + validation = { + 'total_rows': len(df), + 'total_columns': len(df.columns), + 'remaining_nulls': df.isnull().sum().sum(), + 'duplicate_rows': df.duplicated().sum(), + 'infinite_values': np.isinf(df.select_dtypes(include=[np.number])).sum().sum(), + 'data_types_consistent': True, # Could add more sophisticated checks + } + + # Check for unrealistic values + numeric_cols = df.select_dtypes(include=[np.number]).columns + extreme_values = {} + + for col in numeric_cols: + if col in df.columns: + col_data = df[col].dropna() + if len(col_data) > 0: + q1, q99 = col_data.quantile([0.01, 0.99]) + extreme_count = ((col_data < q1 - 10 * (q99 - q1)) | + (col_data > q99 + 10 * (q99 - q1))).sum() + if extreme_count > 0: + extreme_values[col] = extreme_count + + validation['extreme_values'] = extreme_values + validation['quality_score'] = calculate_quality_score(validation) + + validation_results[file_type] = validation + + print(f"\n{file_type.upper()} VALIDATION:") + print(f" ✓ Rows: {validation['total_rows']:,}") + print(f" ✓ Columns: {validation['total_columns']}") + print(f" ✓ Remaining nulls: {validation['remaining_nulls']}") + print(f" ✓ Duplicate rows: {validation['duplicate_rows']}") + print(f" ✓ Infinite values: {validation['infinite_values']}") + print(f" ✓ Quality score: {validation['quality_score']:.2%}") + + if extreme_values: + print(f" [WARNING] Extreme values detected in {len(extreme_values)} columns") + + except Exception as e: + print(f"[ERROR] Validation failed for {file_type}: {str(e)}") + validation_results[file_type] = {'error': str(e)} + + return validation_results + +def calculate_quality_score(validation): + """Calculate a simple quality score""" + score = 1.0 + + # Penalize remaining nulls + if validation['total_rows'] > 0: + null_ratio = validation['remaining_nulls'] / (validation['total_rows'] * validation['total_columns']) + score -= null_ratio * 0.5 + + # Penalize duplicates + if validation['total_rows'] > 0: + dup_ratio = validation['duplicate_rows'] / validation['total_rows'] + score -= dup_ratio * 0.3 + + # Penalize infinite values + if validation['infinite_values'] > 0: + score -= 0.1 + + # Penalize extreme values + extreme_columns = len(validation.get('extreme_values', {})) + if extreme_columns > 0: + score -= (extreme_columns 
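# --- Illustrative sketch (not part of the patch): the "extreme value" screen used in
# validate_data_quality(): anything beyond 10x the 1st-to-99th percentile span is flagged.
import numpy as np
import pandas as pd

col = pd.Series(np.concatenate([np.linspace(0.0, 1.0, 99), [1e6]]))   # one absurd value
q1, q99 = col.quantile(0.01), col.quantile(0.99)
span = q99 - q1
extreme = ((col < q1 - 10 * span) | (col > q99 + 10 * span)).sum()
print(extreme)   # 1 -> the 1e6 entry is counted as extreme
# --- end sketch ---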
/ validation['total_columns']) * 0.2 + + return max(0.0, score) + +def print_final_summary(results, validation_results): + """Print final summary of the null handling process""" + print("\n" + "="*60) + print("FINAL NULL HANDLING SUMMARY") + print("="*60) + + total_nulls_filled = sum(r.get('nulls_filled', 0) for r in results.values() if r.get('success')) + successful_files = sum(1 for r in results.values() if r.get('success')) + total_files = len(results) + + print(f"\n[INFO] PROCESSING RESULTS:") + print(f" Files processed: {successful_files}/{total_files}") + print(f" Total nulls filled: {total_nulls_filled:,}") + + print(f"\n[METRICS] QUALITY METRICS:") + for file_type, validation in validation_results.items(): + if 'error' not in validation: + print(f" {file_type}: {validation['quality_score']:.1%} quality score") + + if successful_files == total_files: + print(f"\n[SUCCESS] ALL FILES PROCESSED SUCCESSFULLY!") + else: + failed_files = total_files - successful_files + print(f"\n[WARNING] {failed_files} files failed to process") + + print("\n[TIPS] RECOMMENDATIONS:") + print(" - Review any remaining null columns in the reports") + print(" - Monitor data quality scores in production") + print(" - Consider additional validation rules if needed") + + print("\n" + "="*60) + +def main(): + """Main function""" + try: + # Import numpy for validation + import numpy as np + globals()['np'] = np + + # Run the null handling process + results = run_final_null_handling() + + # Validate data quality + validation_results = validate_data_quality(results) + + # Print final summary + print_final_summary(results, validation_results) + + # Return success if all files processed successfully + success_count = sum(1 for r in results.values() if r.get('success')) + return 0 if success_count == len(results) else 1 + + except Exception as e: + print(f"[ERROR] Fatal error in null handling process: {str(e)}") + return 1 + +if __name__ == "__main__": + exit_code = main() + sys.exit(exit_code) diff --git a/src/merge/separator.py b/src/merge/separator.py new file mode 100644 index 0000000000000000000000000000000000000000..0bb125c533beb27785944e609bc009d853898ba8 --- /dev/null +++ b/src/merge/separator.py @@ -0,0 +1,57 @@ +import pandas as pd +from pathlib import Path + +def separate_features(merged_path, crypto_path, stocks_path): + """ + Split merged_features.parquet into crypto_features and stocks_features using is_crypto attribute, + then drop any columns that are entirely null. 
+ """ + merged_path = Path(merged_path) + if not merged_path.exists(): + print(f"File not found: {merged_path}") + return + + df = pd.read_parquet(merged_path) + + # Ensure COIN and XRP are marked as crypto + if 'symbol' in df.columns: + xrp_mask = df['symbol'].str.upper() == 'RIPPLE' + df.loc[xrp_mask, 'is_crypto'] = 1 + + # Separate by is_crypto + crypto_df = df[df['is_crypto'] == 1].copy() + stocks_df = df[df['is_crypto'] == 0].copy() + + # Drop columns that are entirely null + def drop_all_null(df, name): + null_cols = df.columns[df.isna().all()] + if len(null_cols): + print(f"Dropping {len(null_cols)} all-null columns from {name}:") + # for c in null_cols: + # print(f" • {c}") + df.drop(columns=null_cols, inplace=True) + else: + print(f"No all-null columns in {name}.") + return df + + crypto_df = drop_all_null(crypto_df, "crypto_features") + stocks_df = drop_all_null(stocks_df, "stocks_features") + + # Save to parquet + crypto_df.to_parquet(crypto_path) + stocks_df.to_parquet(stocks_path) + print(f"Saved {len(crypto_df)} crypto features to {crypto_path}") + print(f"Saved {len(stocks_df)} stocks features to {stocks_path}") + + +if __name__ == "__main__": + try: + from src import config as app_config + base = Path(app_config.DATA_DIR) + except Exception: + from os import getenv + base = Path(getenv("DATA_DIR", "/data")) + merged_path = base / "merged" / "features" / "merged_features.parquet" + crypto_path = base / "merged" / "features" / "crypto_features.parquet" + stocks_path = base / "merged" / "features" / "stocks_features.parquet" + separate_features(merged_path, crypto_path, stocks_path) diff --git a/src/merge/stocks_data_filler.py b/src/merge/stocks_data_filler.py new file mode 100644 index 0000000000000000000000000000000000000000..888388f519833e26af14d8dc9cd2a76b991f314f --- /dev/null +++ b/src/merge/stocks_data_filler.py @@ -0,0 +1,438 @@ +import pandas as pd +import numpy as np +from sklearn.impute import KNNImputer +from sklearn.preprocessing import StandardScaler +import warnings +warnings.filterwarnings('ignore') + +class ImprovedStockDataImputer: + """ + Enhanced imputation that prevents data homogenization by using + symbol-specific patterns and relationships. 
+ """ + + def __init__(self, preserve_symbol_diversity=True): + self.preserve_symbol_diversity = preserve_symbol_diversity + self.symbol_profiles = {} + self.scalers = {} + + def _create_symbol_profiles(self, df): + """Create profiles for each symbol to guide imputation.""" + profiles = {} + + for symbol in df['symbol'].unique(): + symbol_data = df[df['symbol'] == symbol] + + # Calculate symbol-specific statistics with proper null handling + price_col = None + for col in ['price', 'close', 'close_alpaca', 'open', 'high', 'low']: + if col in symbol_data.columns and not symbol_data[col].isnull().all(): + price_col = col + break + + volume_col = None + for col in ['volume', 'volume_alpaca']: + if col in symbol_data.columns and not symbol_data[col].isnull().all(): + volume_col = col + break + + profile = { + 'symbol': symbol, + 'price_level': symbol_data[price_col].median() if price_col else 100.0, # Default to 100 + 'price_volatility': symbol_data[price_col].std() if price_col else 2.0, # Default volatility + 'volume_level': symbol_data[volume_col].median() if volume_col else 1000.0, # Default volume + 'is_crypto': symbol_data['is_crypto'].mode().iloc[0] if 'is_crypto' in symbol_data.columns and not symbol_data['is_crypto'].isnull().all() else 0, + 'typical_rsi': symbol_data['rsi'].median() if 'rsi' in symbol_data.columns and not symbol_data['rsi'].isnull().all() else 50.0, + 'data_availability': len(symbol_data) / len(df) if len(df) > 0 else 0 + } + + # Ensure no None values in profile + for key, value in profile.items(): + if value is None or (isinstance(value, float) and np.isnan(value)): + if key == 'price_level': + profile[key] = 100.0 + elif key == 'price_volatility': + profile[key] = 2.0 + elif key == 'volume_level': + profile[key] = 1000.0 + elif key == 'typical_rsi': + profile[key] = 50.0 + elif key == 'is_crypto': + profile[key] = 0 + else: + profile[key] = 0.0 + + profiles[symbol] = profile + + return profiles + + def _impute_with_symbol_context(self, df, column, symbol_profiles): + """Impute values using symbol-specific context to prevent homogenization.""" + + df_result = df.copy() + + for symbol in df['symbol'].unique(): + symbol_mask = df['symbol'] == symbol + symbol_data = df.loc[symbol_mask, column] + + if symbol_data.isnull().sum() == 0: + continue # No missing values for this symbol + + profile = symbol_profiles.get(symbol, {}) + + # Strategy depends on column type and symbol characteristics + if column in ['price', 'open', 'high', 'low', 'close']: + # Price data - use interpolation with symbol-specific bounds + interpolated = symbol_data.interpolate(method='linear', limit_direction='both') + + # If still missing, use symbol's typical price level with noise + if interpolated.isnull().any(): + base_price = profile.get('price_level', 100.0) + volatility = profile.get('price_volatility', base_price * 0.02) + + # Add symbol-specific noise to prevent identical values + symbol_hash = hash(symbol) % 1000 / 1000 # 0-1 range + noise_factor = (symbol_hash - 0.5) * 0.1 # -5% to +5% + adjusted_price = base_price * (1 + noise_factor) + + interpolated = interpolated.fillna(adjusted_price) + + df_result.loc[symbol_mask, column] = interpolated + + elif column in ['volume', 'volume_alpaca']: + # Volume data - use forward fill then symbol-specific median + filled = symbol_data.fillna(method='ffill').fillna(method='bfill') + + if filled.isnull().any(): + # Use symbol's typical volume with variation + base_volume = profile.get('volume_level', 1000.0) + symbol_hash = hash(symbol + column) % 
1000 / 1000 + volume_multiplier = 0.5 + symbol_hash # 0.5x to 1.5x variation + adjusted_volume = base_volume * volume_multiplier + filled = filled.fillna(adjusted_volume) + + df_result.loc[symbol_mask, column] = filled + + elif column in ['rsi', 'stoch_k', 'stoch_d']: + # Oscillator indicators - use symbol-specific typical values + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + # Use symbol-specific baseline with variation + symbol_hash = hash(symbol + column) % 1000 / 1000 + if column == 'rsi': + # RSI: 30-70 range with symbol variation + baseline = 30 + (symbol_hash * 40) # 30-70 range + else: # stochastic + baseline = 20 + (symbol_hash * 60) # 20-80 range + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + elif column in ['macd', 'macd_signal', 'macd_histogram']: + # MACD - can be positive/negative, use symbol-specific pattern + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + # Use price-level dependent MACD estimation with null safety + price_level = profile.get('price_level', 100.0) # Default to 100 if None + if price_level is None or np.isnan(price_level): + price_level = 100.0 + + symbol_hash = hash(symbol + column) % 2000 / 1000 - 1 # -1 to +1 + # Scale MACD relative to price level + baseline = (price_level * 0.001) * symbol_hash + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + else: + # Generic numeric imputation with symbol variation + symbol_median = symbol_data.median() + + if pd.isna(symbol_median): + # Use overall median but add symbol-specific variation + overall_median = df[column].median() + if pd.isna(overall_median): + overall_median = 0 + + # Add symbol-specific variation (±10%) + symbol_hash = hash(symbol + column) % 2000 / 1000 - 1 # -1 to +1 + variation = overall_median * 0.1 * symbol_hash + baseline = overall_median + variation + else: + baseline = symbol_median + + df_result.loc[symbol_mask, column] = symbol_data.fillna(baseline) + + return df_result[column] + + def fit_transform(self, df): + """Apply improved imputation with anti-homogenization measures.""" + + df_imputed = df.copy() + df_imputed = df_imputed.sort_values(['symbol', 'interval_timestamp']) + + # Create symbol profiles + self.symbol_profiles = self._create_symbol_profiles(df_imputed) + + print(f"Created profiles for {len(self.symbol_profiles)} unique symbols") + + # 1. Handle categorical/flag columns (same as before) + categorical_cols = [ + 'symbol', 'stock_market', 'is_crypto', 'is_stock', 'is_other', + 'alpaca_data_available', 'is_trading_hours', 'is_weekend' + ] + + for col in categorical_cols: + if col in df_imputed.columns: + df_imputed[col] = df_imputed.groupby('symbol')[col].fillna(method='ffill').fillna(method='bfill') + + # 2. Price and volume data - symbol-specific imputation + price_volume_cols = [ + 'price', 'open', 'high', 'low', 'close', 'volume', + 'open_alpaca', 'high_alpaca', 'low_alpaca', 'close_alpaca', 'volume_alpaca', + 'bid_price', 'ask_price', 'bid_price_alpaca', 'ask_price_alpaca', 'price_alpaca' + ] + + for col in price_volume_cols: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + print(f"Imputing {col} with symbol-specific context...") + df_imputed[col] = self._impute_with_symbol_context( + df_imputed, col, self.symbol_profiles + ) + + # 3. 
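# --- Illustrative sketch (not part of the patch): the per-symbol jitter above relies on
# Python's built-in hash(), which is salted per interpreter run for str inputs unless
# PYTHONHASHSEED is pinned, so the imputed fallback values can differ between runs.
# A stable alternative (an assumption, not part of this patch) derives the 0-1 factor from hashlib:
import hashlib

def stable_unit_hash(key: str) -> float:
    """Deterministic value in [0, 1) derived from an md5 digest of the key."""
    return int(hashlib.md5(key.encode("utf-8")).hexdigest()[:8], 16) / 0x100000000

noise_factor = (stable_unit_hash("AAPL" + "close") - 0.5) * 0.1   # reproducible -5%..+5% tweak
print(round(noise_factor, 4))
# --- end sketch ---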
Technical indicators - symbol-specific imputation + tech_indicators = [ + 'rsi', 'macd', 'macd_signal', 'macd_histogram', 'atr', 'bb_position', + 'stoch_k', 'stoch_d', 'cci', 'roc_5', 'roc_10', 'mfi', 'rsi_macd_signal', + 'ema_convergence', 'true_range_pct' + ] + + for col in tech_indicators: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + print(f"Imputing {col} with symbol-specific context...") + df_imputed[col] = self._impute_with_symbol_context( + df_imputed, col, self.symbol_profiles + ) + + # 4. Volume/price change features - symbol-specific + change_features = [ + 'price_change_1', 'price_change_7', 'price_change_14', 'volume_ratio', + 'volatility_7', 'price_volume_trend', 'volatility_consistency' + ] + + for col in change_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + df_imputed[col] = self._impute_with_symbol_context( + df_imputed, col, self.symbol_profiles + ) + + # 5. On-chain features (crypto only) + onchain_features = [ + 'total_fees', 'total_gas_used', 'avg_gas_price', 'tx_count_7d_change', + 'tx_count_sma_7', 'tx_volume_7d_change', 'tx_volume_sma_7', + 'gas_used_7d_change', 'gas_used_sma_7', 'gas_price_7d_change', + 'gas_price_sma_7', 'fees_7d_change', 'avg_tx_size' + ] + + for col in onchain_features: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + # Only impute for crypto assets + crypto_mask = df_imputed['is_crypto'] == 1 + non_crypto_mask = df_imputed['is_crypto'] != 1 + + if crypto_mask.any(): + crypto_data = df_imputed.loc[crypto_mask] + crypto_imputed = self._impute_with_symbol_context( + crypto_data, col, self.symbol_profiles + ) + df_imputed.loc[crypto_mask, col] = crypto_imputed + + # Fill non-crypto with 0 + df_imputed.loc[non_crypto_mask, col] = df_imputed.loc[non_crypto_mask, col].fillna(0) + + # 6. 
Handle remaining columns with simple strategies + remaining_strategies = { + 'quality_metrics': [ + 'data_quality_score', 'core_features_completeness', 'technical_indicators_completeness', + 'onchain_features_completeness', 'price_data_completeness', + 'overall_feature_completeness', 'data_completeness_score' + ], + 'news_sentiment': [ + 'news_sentiment_mean', 'news_sentiment_std', 'news_sentiment_min', + 'news_sentiment_max', 'news_sentiment_range', 'news_match_score_mean', + 'news_match_score_max', 'news_mentions_count', 'news_articles_count', + 'news_highlights_count', 'news_activity_score', 'sentiment_score' + ], + 'zero_fill': [ + 'trade_count', 'trade_count_alpaca', 'bid_size', 'ask_size', + 'bid_size_alpaca', 'ask_size_alpaca', 'size', 'size_alpaca' + ] + } + + # Quality metrics - use median but add small variation + for col in remaining_strategies['quality_metrics']: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + median_val = df_imputed[col].median() + if pd.isna(median_val): + median_val = 0.5 # Default for quality metrics + median_val = np.clip(median_val, 0, 1) + + # Add tiny symbol-specific variation + for symbol in df_imputed['symbol'].unique(): + mask = df_imputed['symbol'] == symbol + symbol_hash = hash(symbol + col) % 100 / 10000 # Very small variation + fill_val = np.clip(median_val + symbol_hash, 0, 1) + df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(fill_val) + + # News sentiment - neutral with symbol variation + for col in remaining_strategies['news_sentiment']: + if col in df_imputed.columns and df_imputed[col].isnull().any(): + if 'sentiment' in col.lower(): + # Slight variation around neutral + for symbol in df_imputed['symbol'].unique(): + mask = df_imputed['symbol'] == symbol + symbol_hash = (hash(symbol + col) % 200 / 1000) - 0.1 # -0.1 to +0.1 + df_imputed.loc[mask, col] = df_imputed.loc[mask, col].fillna(symbol_hash) + elif 'count' in col.lower(): + df_imputed[col] = df_imputed[col].fillna(0) + else: + median_val = df_imputed[col].median() + if pd.isna(median_val): + median_val = 0 + df_imputed[col] = df_imputed[col].fillna(median_val) + + # Zero fill + for col in remaining_strategies['zero_fill']: + if col in df_imputed.columns: + df_imputed[col] = df_imputed[col].fillna(0) + + # Handle any remaining columns + remaining_numeric = df_imputed.select_dtypes(include=[np.number]).columns + remaining_with_nulls = [col for col in remaining_numeric if df_imputed[col].isnull().any()] + + for col in remaining_with_nulls: + if col not in ['id', 'id_alpaca', 'backup_id']: + print(f"Imputing remaining column: {col}") + df_imputed[col] = self._impute_with_symbol_context( + df_imputed, col, self.symbol_profiles + ) + + print("[INFO] Imputation complete with anti-homogenization measures") + print(f"[INFO] Final null counts: {df_imputed.isnull().sum().sum()}") + return df_imputed + +# Usage function with validation +def impute_with_validation(file_path, output_path=None): + """Impute data and validate no homogenization occurred.""" + + try: + print(f"[INFO] Loading data from: {file_path}") + df = pd.read_parquet(file_path) + print(f"[INFO] Loaded data shape: {df.shape}") + print(f"[INFO] Initial null counts: {df.isnull().sum().sum()}") + except Exception as e: + print(f"[ERROR] Failed to load data: {e}") + return None + + # Sample symbols for validation + symbols_sample = df['symbol'].unique()[:5] + print(f"[INFO] Processing {len(df['symbol'].unique())} unique symbols") + + # Initialize and run imputer + imputer = ImprovedStockDataImputer() + 
df_imputed = imputer.fit_transform(df) + + # Combine alpaca data with main data where available + alpaca_combinations = [ + ('high', 'high_alpaca'), + ('low', 'low_alpaca'), + ('close', 'close_alpaca'), + ('open', 'open_alpaca'), + ('volume', 'volume_alpaca') + ] + + for main_col, alpaca_col in alpaca_combinations: + if main_col in df_imputed.columns and alpaca_col in df_imputed.columns: + df_imputed[main_col] = df_imputed[main_col].combine_first(df_imputed[alpaca_col]) + print(f"[INFO] Combined {main_col} with {alpaca_col}") + + # Drop unwanted columns before saving + drop_cols = [ + '_filename', '_original_format', 'alpaca_data_available', + 'ask_exchange', 'ask_exchange_alpaca', + 'bid_exchange', 'bid_exchange_alpaca', + 'conditions', 'conditions_alpaca', 'conditions_trade', 'conditions_trade_alpaca', + 'symbol_quote', 'symbol_quote_alpaca', 'symbol_trade', 'symbol_trade_alpaca', + 'tape', 'tape_alpaca', 'tape_trade', 'tape_trade_alpaca', + 'id', 'id_alpaca', + 'is_new_symbol', 'price', 'timestamp_dt', + 'alpaca_merge_timestamp', 'timestamp', 'timestamp_alpaca', + 'estimateCurrency', 'exchange', 'exchange_alpaca', 'exchange_company', + 'finnhubIndustry', 'headline', + 'sentiment_timestamp', 'logo', + 'ticker', 'stock_market', + 'weburl', 'latest_news_timestamp', 'day_of_week', 'feature_timestamp', + 'interval_timestamp_dt', 'is_crypto', 'is_other', 'is_stock', + 'country', 'currency', 'datetime', 'ipo', 'name', 'period', 'phone', + 'year', 'month', 'latest_news_timestamp_x', 'latest_news_timestamp_y' + ] + + original_cols = len(df_imputed.columns) + for col in drop_cols: + if col in df_imputed.columns: + df_imputed = df_imputed.drop(columns=col) + + print(f"[INFO] Dropped {original_cols - len(df_imputed.columns)} unwanted columns") + + # Reorder columns: 'symbol' first, 'interval_timestamp' second, rest follow + cols = list(df_imputed.columns) + if 'symbol' in cols and 'interval_timestamp' in cols: + rest = [c for c in cols if c not in ['symbol', 'interval_timestamp']] + df_imputed = df_imputed[['symbol', 'interval_timestamp'] + rest] + print("[INFO] Reordered columns with symbol and interval_timestamp first") + + # Save results + if output_path: + # Clean up data types + if 'backup_id' in df_imputed.columns: + df_imputed['backup_id'] = df_imputed['backup_id'].astype(str) + + try: + df_imputed.to_parquet(output_path, compression='snappy') + print(f"[INFO] Successfully saved imputed data to: {output_path}") + except Exception as e: + print(f"[ERROR] Failed to save data: {e}") + return None + + print(f"[INFO] Final dataset shape: {df_imputed.shape}") + return df_imputed + +# Example usage +def main(): + input_file = "data/merged/features/stocks_features.parquet" + output_file = input_file + + print("[INFO] Starting stock data imputation process...") + df_clean = impute_with_validation(input_file, output_file) + + if df_clean is not None: + print(f"[INFO] Data imputation completed successfully!") + print(f"[INFO] Final shape: {df_clean.shape}") + print(f"[INFO] Remaining nulls: {df_clean.isnull().sum().sum()}") + + # Quick validation + print("\n=== VALIDATION SUMMARY ===") + print(f"Unique symbols: {df_clean['symbol'].nunique()}") + if 'close' in df_clean.columns: + print(f"Price range: ${df_clean['close'].min():.2f} - ${df_clean['close'].max():.2f}") + if 'volume' in df_clean.columns: + print(f"Volume range: {df_clean['volume'].min():.0f} - {df_clean['volume'].max():.0f}") + else: + print("[ERROR] Failed to load or impute data.") + +if __name__ == "__main__": + main() diff --git 
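# --- Illustrative sketch (not part of the patch): how combine_first() backfills the primary
# OHLCV columns from their *_alpaca counterparts above, shown on two toy Series.
import numpy as np
import pandas as pd

close = pd.Series([150.0, np.nan, 300.0])
close_alpaca = pd.Series([149.9, 151.2, np.nan])
print(close.combine_first(close_alpaca).tolist())   # [150.0, 151.2, 300.0] -> gaps filled, existing values kept
# --- end sketch ---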
a/src/merge/test_enhanced_null_handling.py b/src/merge/test_enhanced_null_handling.py new file mode 100644 index 0000000000000000000000000000000000000000..61aa738bffb3604b995cb9174c564e5eaa3321e8 --- /dev/null +++ b/src/merge/test_enhanced_null_handling.py @@ -0,0 +1,209 @@ +#!/usr/bin/env python3 +""" +Test script for the enhanced symbol-first null handling strategy +""" + +import pandas as pd +import numpy as np +import sys +from pathlib import Path +import json + +# Add the merge directory to path +sys.path.append(str(Path(__file__).parent.parent)) + +from final_null_handler import FinalNullValueHandler + +def create_realistic_test_data(): + """Create realistic test data with temporal patterns and symbol-specific characteristics""" + + # Create timestamps for the last 30 days + timestamps = pd.date_range(start='2025-07-01', end='2025-07-30', freq='1H') + timestamp_ms = (timestamps.astype(np.int64) // 10**6).tolist() + + symbols = ['bitcoin', 'ethereum', 'AAPL', 'GOOGL'] + data = [] + + for symbol in symbols: + for i, ts in enumerate(timestamp_ms[:100]): # 100 records per symbol + + if symbol in ['bitcoin', 'ethereum']: + # Crypto data + base_price = 50000 if symbol == 'bitcoin' else 3000 + price_trend = i * 10 # Upward trend + price = base_price + price_trend + np.random.normal(0, 500) + + record = { + 'symbol': symbol, + 'interval_timestamp': ts, + 'price': price if np.random.random() > 0.2 else np.nan, # 20% nulls + 'volume': price * 1000 + np.random.normal(0, 100000) if np.random.random() > 0.15 else np.nan, + 'marketcap': price * 19000000 if np.random.random() > 0.3 else np.nan, + 'dominance': (0.4 if symbol == 'bitcoin' else 0.15) + np.random.normal(0, 0.02) if np.random.random() > 0.25 else np.nan, + 'rank': 1 if symbol == 'bitcoin' else 2, + 'performance.day': np.random.normal(0, 2) if np.random.random() > 0.2 else np.nan, + 'performance.week': np.random.normal(0, 5) if np.random.random() > 0.3 else np.nan, + 'exchangePrices.binance': price * 1.001 if np.random.random() > 0.4 else np.nan, + 'exchangePrices.coinbase': price * 0.999 if np.random.random() > 0.4 else np.nan, + 'rsi': 50 + np.random.normal(0, 10) if np.random.random() > 0.2 else np.nan, + 'macd': np.random.normal(0, 1) if np.random.random() > 0.25 else np.nan, + 'transaction_count': 1000 + i * 5 + np.random.normal(0, 100) if np.random.random() > 0.3 else np.nan, + 'stable': False + } + else: + # Stock data + base_price = 150 if symbol == 'AAPL' else 2800 + price_trend = i * 0.5 # Modest upward trend + price = base_price + price_trend + np.random.normal(0, 5) + + record = { + 'symbol': symbol, + 'interval_timestamp': ts, + 'close': price if np.random.random() > 0.2 else np.nan, + 'open': price * 0.995 if np.random.random() > 0.2 else np.nan, + 'high': price * 1.02 if np.random.random() > 0.15 else np.nan, + 'low': price * 0.98 if np.random.random() > 0.15 else np.nan, + 'volume': 1000000 + np.random.normal(0, 100000) if np.random.random() > 0.2 else np.nan, + 'prev_close': price * 0.99 if np.random.random() > 0.25 else np.nan, + 'marketCapitalization': price * 15000000000 if np.random.random() > 0.3 else np.nan, + 'shareOutstanding': 15000000000 if np.random.random() > 0.1 else np.nan, + 'rsi': 50 + np.random.normal(0, 15) if np.random.random() > 0.2 else np.nan, + 'macd': np.random.normal(0, 0.5) if np.random.random() > 0.25 else np.nan, + 'news_sentiment_mean_x': 0.5 + np.random.normal(0, 0.2) if np.random.random() > 0.4 else np.nan, + 'buy': np.random.randint(3, 8) if np.random.random() > 0.3 else np.nan, + 'hold': 
np.random.randint(8, 15) if np.random.random() > 0.3 else np.nan, + 'sell': np.random.randint(1, 4) if np.random.random() > 0.3 else np.nan, + } + + data.append(record) + + return pd.DataFrame(data) + +def test_symbol_first_strategy(): + """Test the symbol-first null handling strategy""" + print("="*70) + print("TESTING ENHANCED SYMBOL-FIRST NULL HANDLING STRATEGY") + print("="*70) + + # Create realistic test data + print("Creating realistic test data with temporal patterns...") + df = create_realistic_test_data() + + print(f"Created dataset with {len(df)} rows and {len(df.columns)} columns") + print(f"Symbols: {df['symbol'].unique()}") + print(f"Date range: {pd.to_datetime(df['interval_timestamp'], unit='ms').min()} to {pd.to_datetime(df['interval_timestamp'], unit='ms').max()}") + + # Analyze null patterns before processing + print(f"\nNULL ANALYSIS BEFORE PROCESSING:") + total_nulls_before = df.isnull().sum().sum() + print(f"Total nulls: {total_nulls_before}") + + symbol_nulls_before = {} + for symbol in df['symbol'].unique(): + symbol_data = df[df['symbol'] == symbol] + symbol_nulls = symbol_data.isnull().sum().sum() + symbol_nulls_before[symbol] = symbol_nulls + print(f" {symbol}: {symbol_nulls} nulls ({symbol_nulls/len(symbol_data)/len(df.columns)*100:.1f}% of symbol data)") + + # Test the enhanced handler + print(f"\nTESTING ENHANCED NULL HANDLER...") + handler = FinalNullValueHandler() + + # Separate crypto and stock data for targeted processing + crypto_mask = df['symbol'].isin(['bitcoin', 'ethereum']) + stock_mask = df['symbol'].isin(['AAPL', 'GOOGL']) + + results = {} + + if crypto_mask.any(): + print(f"\nProcessing crypto data ({crypto_mask.sum()} rows)...") + df_crypto = df[crypto_mask].copy() + df_crypto_processed = handler.process_crypto_features(df_crypto) + df.loc[crypto_mask] = df_crypto_processed + + crypto_nulls_after = df_crypto_processed.isnull().sum().sum() + results['crypto'] = { + 'nulls_before': df_crypto.isnull().sum().sum(), + 'nulls_after': crypto_nulls_after, + 'symbols': ['bitcoin', 'ethereum'] + } + + if stock_mask.any(): + print(f"\nProcessing stock data ({stock_mask.sum()} rows)...") + df_stock = df[stock_mask].copy() + df_stock_processed = handler.process_stock_features(df_stock) + df.loc[stock_mask] = df_stock_processed + + stock_nulls_after = df_stock_processed.isnull().sum().sum() + results['stock'] = { + 'nulls_before': df_stock.isnull().sum().sum(), + 'nulls_after': stock_nulls_after, + 'symbols': ['AAPL', 'GOOGL'] + } + + # Analyze results + print(f"\nRESULTS ANALYSIS:") + total_nulls_after = df.isnull().sum().sum() + print(f"Total nulls after: {total_nulls_after} (reduced by {total_nulls_before - total_nulls_after})") + + for asset_type, result in results.items(): + nulls_filled = result['nulls_before'] - result['nulls_after'] + fill_rate = (nulls_filled / result['nulls_before'] * 100) if result['nulls_before'] > 0 else 0 + print(f" {asset_type.upper()}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)") + + # Symbol-level analysis + print(f"\nSYMBOL-LEVEL ANALYSIS:") + for symbol in df['symbol'].unique(): + symbol_data = df[df['symbol'] == symbol] + nulls_after = symbol_data.isnull().sum().sum() + nulls_filled = symbol_nulls_before[symbol] - nulls_after + fill_rate = (nulls_filled / symbol_nulls_before[symbol] * 100) if symbol_nulls_before[symbol] > 0 else 0 + print(f" {symbol}: {nulls_filled} nulls filled ({fill_rate:.1f}% fill rate)") + + # Quality checks + print(f"\nQUALITY CHECKS:") + infinite_values = 
np.isinf(df.select_dtypes(include=[np.number])).sum().sum() + print(f" Infinite values: {infinite_values}") + print(f" Data types preserved: {len(df.dtypes) == len(create_realistic_test_data().dtypes)}") + + # Test temporal interpolation effectiveness + print(f"\nTEMPORAL INTERPOLATION TEST:") + for symbol in df['symbol'].unique(): + symbol_data = df[df['symbol'] == symbol].sort_values('interval_timestamp') + if 'price' in symbol_data.columns: + price_series = symbol_data['price'] + if len(price_series.dropna()) >= 2: + # Check if we have reasonable price progression + price_diff = price_series.dropna().diff().abs().mean() + print(f" {symbol}: Average price change = {price_diff:.2f} (reasonable interpolation)") + + # Overall success assessment + success = (total_nulls_after == 0 and + infinite_values == 0 and + all(result['nulls_after'] < result['nulls_before'] for result in results.values())) + + if success: + print(f"\n✅ ENHANCED SYMBOL-FIRST STRATEGY TEST PASSED!") + print(f" - All nulls handled successfully") + print(f" - No infinite values introduced") + print(f" - Symbol-specific patterns preserved") + print(f" - Temporal interpolation working") + return True + else: + print(f"\n❌ Test failed - review results above") + return False + +def main(): + """Main test function""" + try: + success = test_symbol_first_strategy() + return 0 if success else 1 + except Exception as e: + print(f"❌ Test failed with error: {str(e)}") + import traceback + traceback.print_exc() + return 1 + +if __name__ == "__main__": + exit_code = main() + sys.exit(exit_code) diff --git a/src/merge/test_null_filling_merge.py b/src/merge/test_null_filling_merge.py new file mode 100644 index 0000000000000000000000000000000000000000..36566a979ba154172fc14e00101d104535f7a28b --- /dev/null +++ b/src/merge/test_null_filling_merge.py @@ -0,0 +1,189 @@ +#!/usr/bin/env python3 +""" +Test script for null filling during merge operations +""" + +import pandas as pd +import numpy as np +import os +import sys +from pathlib import Path + +# Add the merge directory to path +sys.path.append(str(Path(__file__).parent)) + +from merge_temp import fill_nulls_from_temp + +def create_test_data(): + """Create test data with strategic null values""" + + # Create merged data with some null values + merged_data = { + 'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH'], + 'interval_timestamp': [1640995200000, 1640995260000, 1640995200000, 1640995260000, 1640995200000], + 'price': [150.0, np.nan, 50000.0, np.nan, 4000.0], # AAPL and BTC have nulls + 'volume': [1000000, 1200000, np.nan, 800000, np.nan], # BTC and ETH have nulls + 'rsi': [65.0, np.nan, 70.0, 45.0, np.nan], # AAPL and ETH have nulls + 'macd': [1.5, 1.8, np.nan, -0.5, 2.1] # BTC has null + } + df_merged = pd.DataFrame(merged_data) + + # Create temp data that can fill some of the nulls + temp_data = { + 'symbol': ['AAPL', 'AAPL', 'BTC', 'BTC', 'ETH', 'GOOGL'], + 'interval_timestamp': [1640995200000, 1640995260000, 1640995200000, 1640995260000, 1640995200000, 1640995200000], + 'price': [149.5, 152.3, 49950.0, 51200.0, 3980.0, 2850.0], # Can fill AAPL and BTC nulls + 'volume': [950000, 1150000, 2000000, 780000, 500000, 400000], # Can fill BTC and ETH nulls + 'rsi': [64.0, 67.0, 69.5, 44.0, 55.0, 60.0], # Can fill AAPL and ETH nulls + 'macd': [1.4, 1.9, 15.2, -0.6, 2.0, 0.8], # Can fill BTC null + 'new_feature': [100, 200, 300, 400, 500, 600] # New feature not in merged + } + df_temp = pd.DataFrame(temp_data) + + return df_merged, df_temp + +def test_null_filling(): + """Test the null 
filling functionality""" + print("="*60) + print("TESTING NULL FILLING DURING MERGE") + print("="*60) + + # Create test data + df_merged, df_temp = create_test_data() + + print("BEFORE NULL FILLING:") + print(f"Merged data shape: {df_merged.shape}") + print(f"Temp data shape: {df_temp.shape}") + print(f"Nulls in merged data: {df_merged.isnull().sum().sum()}") + print("\nNull values by column in merged data:") + for col in df_merged.columns: + null_count = df_merged[col].isnull().sum() + if null_count > 0: + print(f" {col}: {null_count} nulls") + + print(f"\nMerged data preview:") + print(df_merged.to_string()) + print(f"\nTemp data preview:") + print(df_temp.to_string()) + + # Test the null filling function + df_merged_copy = df_merged.copy() + nulls_filled = fill_nulls_from_temp(df_merged_copy, df_temp) + + print(f"\nAFTER NULL FILLING:") + print(f"Nulls filled: {nulls_filled}") + print(f"Remaining nulls: {df_merged_copy.isnull().sum().sum()}") + print("\nRemaining null values by column:") + for col in df_merged_copy.columns: + null_count = df_merged_copy[col].isnull().sum() + if null_count > 0: + print(f" {col}: {null_count} nulls") + + print(f"\nFilled data preview:") + print(df_merged_copy.to_string()) + + # Verify specific cases + print(f"\nVERIFICATION:") + + # Check AAPL price at timestamp 1640995260000 (should be filled) + aapl_price = df_merged_copy[(df_merged_copy['symbol'] == 'AAPL') & + (df_merged_copy['interval_timestamp'] == 1640995260000)]['price'].iloc[0] + print(f"AAPL price at 1640995260000: {aapl_price} (should be 152.3)") + + # Check BTC volume at timestamp 1640995200000 (should be filled) + btc_volume = df_merged_copy[(df_merged_copy['symbol'] == 'BTC') & + (df_merged_copy['interval_timestamp'] == 1640995200000)]['volume'].iloc[0] + print(f"BTC volume at 1640995200000: {btc_volume} (should be 2000000)") + + # Check if new features are NOT added (function should only fill existing columns) + has_new_feature = 'new_feature' in df_merged_copy.columns + print(f"New feature added: {has_new_feature} (should be False)") + + # Calculate success rate + original_nulls = df_merged.isnull().sum().sum() + remaining_nulls = df_merged_copy.isnull().sum().sum() + filled_nulls = original_nulls - remaining_nulls + + if filled_nulls == nulls_filled: + print(f"✅ Null counting is consistent: {filled_nulls} nulls filled") + else: + print(f"❌ Null counting mismatch: reported {nulls_filled}, actual {filled_nulls}") + + if nulls_filled > 0: + fill_rate = (nulls_filled / original_nulls) * 100 + print(f"✅ Fill rate: {fill_rate:.1f}% ({nulls_filled}/{original_nulls})") + return True + else: + print("❌ No nulls were filled") + return False + +def test_edge_cases(): + """Test edge cases for null filling""" + print(f"\n" + "="*60) + print("TESTING EDGE CASES") + print("="*60) + + # Test with empty dataframes + df_empty = pd.DataFrame() + df_test = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [1]}) + + print("Test 1: Empty merged dataframe") + nulls_filled = fill_nulls_from_temp(df_empty, df_test) + print(f"Nulls filled: {nulls_filled} (should be 0)") + + print("Test 2: Empty temp dataframe") + df_with_nulls = pd.DataFrame({'symbol': ['A'], 'interval_timestamp': [123], 'value': [np.nan]}) + nulls_filled = fill_nulls_from_temp(df_with_nulls, df_empty) + print(f"Nulls filled: {nulls_filled} (should be 0)") + + # Test with no matching keys + print("Test 3: No matching symbol+timestamp combinations") + df_merged_nomatch = pd.DataFrame({ + 'symbol': ['A'], + 'interval_timestamp': 
[111], + 'value': [np.nan] + }) + df_temp_nomatch = pd.DataFrame({ + 'symbol': ['B'], + 'interval_timestamp': [222], + 'value': [100] + }) + nulls_filled = fill_nulls_from_temp(df_merged_nomatch, df_temp_nomatch) + print(f"Nulls filled: {nulls_filled} (should be 0)") + + # Test with no common columns + print("Test 4: No common columns") + df_merged_nocols = pd.DataFrame({ + 'symbol': ['A'], + 'interval_timestamp': [123], + 'col1': [np.nan] + }) + df_temp_nocols = pd.DataFrame({ + 'symbol': ['A'], + 'interval_timestamp': [123], + 'col2': [100] + }) + nulls_filled = fill_nulls_from_temp(df_merged_nocols, df_temp_nocols) + print(f"Nulls filled: {nulls_filled} (should be 0)") + + print("✅ All edge case tests completed") + +def main(): + """Run all tests""" + success = test_null_filling() + test_edge_cases() + + print(f"\n" + "="*60) + print("TEST SUMMARY") + print("="*60) + + if success: + print("🎉 Null filling functionality is working correctly!") + return 0 + else: + print("❌ Null filling functionality has issues") + return 1 + +if __name__ == "__main__": + exit_code = main() + sys.exit(exit_code) diff --git a/src/utils/symbol_normalizer.py b/src/utils/symbol_normalizer.py new file mode 100644 index 0000000000000000000000000000000000000000..2cb2623acc473a121fd7a75190bf5134953e8a5a --- /dev/null +++ b/src/utils/symbol_normalizer.py @@ -0,0 +1,233 @@ +""" +Crypto Symbol Normalizer +======================== + +Provides consistent symbol normalization across all data fetchers and mergers. +This ensures that different representations of the same cryptocurrency (e.g., XRP vs ripple) +are treated consistently throughout the entire pipeline. + +Features: +- Maps various symbol formats to canonical identifiers +- Supports both short symbols (BTC, ETH) and long names (bitcoin, ethereum) +- Case-insensitive matching +- Logging for debugging normalization process + +Author: AI Assistant +Date: August 2025 +""" + +import logging +from typing import Dict, List, Set + +logger = logging.getLogger(__name__) + +class CryptoSymbolNormalizer: + """ + Centralized crypto symbol normalization for consistent asset identification + """ + + def __init__(self): + """Initialize the symbol normalizer with predefined mappings""" + self.symbol_mapping = self._build_symbol_mapping() + logger.info(f"Initialized CryptoSymbolNormalizer with {len(self.symbol_mapping)} mappings") + + def _build_symbol_mapping(self) -> Dict[str, str]: + """ + Build comprehensive symbol mapping dictionary + + Returns: + Dictionary mapping various symbol formats to canonical slugs + """ + # Canonical mapping for major crypto assets + # Maps various symbols/names to the official canonical identifier + symbol_mapping = { + # Bitcoin variants + 'bitcoin': 'bitcoin', + 'btc': 'bitcoin', + 'Bitcoin': 'bitcoin', + 'BTC': 'bitcoin', + + # Ethereum variants + 'ethereum': 'ethereum', + 'eth': 'ethereum', + 'Ethereum': 'ethereum', + 'ETH': 'ethereum', + + # Ripple/XRP variants (canonical: ripple for Santiment) + 'ripple': 'ripple', + 'xrp': 'ripple', + 'Ripple': 'ripple', + 'XRP': 'ripple', + + # Solana variants (canonical: solana for Santiment) + 'solana': 'solana', + 'sol': 'solana', + 'Solana': 'solana', + 'SOL': 'solana', + + # Cardano variants (canonical: cardano for Santiment) + 'cardano': 'cardano', + 'ada': 'cardano', + 'Cardano': 'cardano', + 'ADA': 'cardano', + + # Polkadot variants + 'polkadot': 'polkadot', + 'dot': 'polkadot', + 'Polkadot': 'polkadot', + 'DOT': 'polkadot', + + # Chainlink variants + 'chainlink': 'chainlink', + 'link': 
'chainlink', + 'Chainlink': 'chainlink', + 'LINK': 'chainlink', + + # Litecoin variants + 'litecoin': 'litecoin', + 'ltc': 'litecoin', + 'Litecoin': 'litecoin', + 'LTC': 'litecoin', + + # Bitcoin Cash variants + 'bitcoin-cash': 'bitcoin-cash', + 'bch': 'bitcoin-cash', + 'Bitcoin Cash': 'bitcoin-cash', + 'BCH': 'bitcoin-cash', + + # Stellar variants + 'stellar': 'stellar', + 'xlm': 'stellar', + 'Stellar': 'stellar', + 'XLM': 'stellar', + + # Ethereum Classic variants + 'ethereum-classic': 'ethereum-classic', + 'etc': 'ethereum-classic', + 'Ethereum Classic': 'ethereum-classic', + 'ETC': 'ethereum-classic', + + # EOS variants + 'eos': 'eos', + 'EOS': 'eos', + } + + return symbol_mapping + + def normalize(self, symbol: str) -> str: + """ + Normalize a symbol to its canonical identifier + + Args: + symbol: Symbol to normalize + + Returns: + Canonical identifier + """ + if symbol in self.symbol_mapping: + canonical = self.symbol_mapping[symbol] + if symbol != canonical: + logger.debug(f"Normalized '{symbol}' -> '{canonical}'") + return canonical + + # If not found in mapping, return as-is but log warning + logger.warning(f"Unknown symbol '{symbol}' not found in normalization mapping") + return symbol.lower() + + def normalize_list(self, symbols: List[str]) -> List[str]: + """ + Normalize a list of symbols and remove duplicates + + Args: + symbols: List of symbols to normalize + + Returns: + List of normalized, deduplicated symbols + """ + normalized = [] + seen = set() + + for symbol in symbols: + canonical = self.normalize(symbol) + if canonical not in seen: + normalized.append(canonical) + seen.add(canonical) + else: + logger.debug(f"Removed duplicate symbol: {symbol} (canonical: {canonical})") + + logger.info(f"Normalized {len(symbols)} symbols to {len(normalized)} unique canonical symbols") + return normalized + + def get_all_variants(self, canonical_symbol: str) -> List[str]: + """ + Get all known variants for a canonical symbol + + Args: + canonical_symbol: The canonical symbol to find variants for + + Returns: + List of all variants that map to this canonical symbol + """ + variants = [key for key, value in self.symbol_mapping.items() + if value == canonical_symbol] + return variants + + def get_canonical_symbols(self) -> Set[str]: + """ + Get set of all canonical symbols + + Returns: + Set of canonical symbols + """ + return set(self.symbol_mapping.values()) + + def add_mapping(self, symbol: str, canonical: str): + """ + Add a new symbol mapping + + Args: + symbol: Symbol variant to add + canonical: Canonical symbol it maps to + """ + self.symbol_mapping[symbol] = canonical + logger.info(f"Added new mapping: '{symbol}' -> '{canonical}'") + + +# Global instance for easy access +_normalizer = None + +def get_normalizer() -> CryptoSymbolNormalizer: + """ + Get the global normalizer instance (singleton pattern) + + Returns: + CryptoSymbolNormalizer instance + """ + global _normalizer + if _normalizer is None: + _normalizer = CryptoSymbolNormalizer() + return _normalizer + +def normalize_symbol(symbol: str) -> str: + """ + Convenience function to normalize a single symbol + + Args: + symbol: Symbol to normalize + + Returns: + Canonical symbol + """ + return get_normalizer().normalize(symbol) + +def normalize_symbol_list(symbols: List[str]) -> List[str]: + """ + Convenience function to normalize a list of symbols + + Args: + symbols: List of symbols to normalize + + Returns: + List of normalized symbols + """ + return get_normalizer().normalize_list(symbols) diff --git a/src/vis/stocks.py 
b/src/vis/stocks.py new file mode 100644 index 0000000000000000000000000000000000000000..d3514a7a5dde4c59a053970a0acf1a185817f439 --- /dev/null +++ b/src/vis/stocks.py @@ -0,0 +1,45 @@ +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns + +# Load normalized stock data (relative POSIX-style path so the script also runs inside the Linux container) +csv_path = 'data/merged/norm/stocks_features_improved_normalized.csv' +df = pd.read_csv(csv_path) + +# 1. Show basic info and head +print('Data shape:', df.shape) +print(df.head()) + +# 2. Feature distribution histograms +features = [ + 'price_momentum', 'volume_price_ratio', 'daily_range', 'avg_sentiment', 'technical_strength' +] +existing_features = [f for f in features if f in df.columns] +if existing_features: + df[existing_features].hist(bins=30, figsize=(12, 8)) + plt.suptitle('Feature Distributions') + plt.tight_layout() + plt.show() +else: + print('No engineered features found for distribution plots.') + +# 3. Correlation heatmap +if len(existing_features) > 1: + plt.figure(figsize=(8, 6)) + sns.heatmap(df[existing_features].corr(), annot=True, cmap='coolwarm') + plt.title('Feature Correlation Heatmap') + plt.show() + +# 4. Outlier boxplots for engineered features +for feat in existing_features: + plt.figure(figsize=(6, 2)) + sns.boxplot(x=df[feat]) + plt.title(f'Boxplot: {feat}') + plt.show() + +# 5. Pairplot (if you have a target column, e.g., "target") +# Uncomment and adjust if you have a target/label +# sns.pairplot(df, vars=existing_features, hue='target') +# plt.show() + +print('Visualization complete. You can add more plots as needed!') diff --git a/test_gradio.py b/test_gradio.py new file mode 100644 index 0000000000000000000000000000000000000000..f58e980442b35612cc938870ebc5e70062c80de8 --- /dev/null +++ b/test_gradio.py @@ -0,0 +1,9 @@ +import gradio as gr + +def hello(name): + return f"Hello {name}!" + +demo = gr.Interface(fn=hello, inputs="text", outputs="text") + +if __name__ == "__main__": + demo.launch(server_name="0.0.0.0", server_port=7860)
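
Note on src/merge/test_enhanced_null_handling.py: `FinalNullValueHandler.process_crypto_features` and `process_stock_features` come from `final_null_handler`, which is not part of this diff, so the tests only pin down behaviour (per-symbol processing, temporal interpolation, no infinities introduced). A minimal sketch of what a symbol-first temporal fill could look like under those assumptions; the names and logic here are illustrative, not the shipped handler:

```python
import numpy as np
import pandas as pd

def symbol_first_fill_sketch(df: pd.DataFrame) -> pd.DataFrame:
    """Illustrative only: per-symbol temporal fill, not the shipped FinalNullValueHandler."""
    numeric_cols = df.select_dtypes(include=[np.number]).columns

    def _fill(group: pd.DataFrame) -> pd.DataFrame:
        group = group.sort_values("interval_timestamp").copy()
        group[numeric_cols] = (
            group[numeric_cols]
            .interpolate(method="linear")  # fill gaps between observed values
            .ffill()                       # carry the last observation forward
            .bfill()                       # backfill anything left at the series start
        )
        return group

    # Symbol-first: each asset's history is filled independently, so bitcoin's
    # prices never leak into AAPL's rows.
    return df.groupby("symbol", group_keys=False).apply(_fill)
```

The success criterion in `test_symbol_first_strategy` (zero remaining nulls) additionally implies a cross-column or default-value pass in the real handler, since interpolation alone cannot fill a column that is entirely null for a given symbol.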
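
Similarly, src/merge/test_null_filling_merge.py imports `fill_nulls_from_temp` from `merge_temp`, whose implementation is outside this diff. The tests fix its contract: match rows on (`symbol`, `interval_timestamp`), fill only nulls in columns the merged frame already has, ignore extra temp columns, and return the number of cells filled. A minimal sketch of that contract (assumed behaviour, not the real `merge_temp` code):

```python
import pandas as pd

def fill_nulls_from_temp_sketch(df_merged: pd.DataFrame, df_temp: pd.DataFrame) -> int:
    """Fill nulls in df_merged in place from df_temp; return cells filled. Sketch only."""
    keys = ["symbol", "interval_timestamp"]
    if df_merged.empty or df_temp.empty:
        return 0
    if not set(keys) <= set(df_merged.columns) or not set(keys) <= set(df_temp.columns):
        return 0

    # Only columns both frames share (beyond the join keys) are eligible;
    # extra temp columns such as 'new_feature' are never added.
    shared = [c for c in df_merged.columns if c in df_temp.columns and c not in keys]
    if not shared:
        return 0

    lookup = df_temp.drop_duplicates(subset=keys).set_index(keys)
    filled = 0
    for idx, row in df_merged.iterrows():
        key = (row["symbol"], row["interval_timestamp"])
        if key not in lookup.index:
            continue
        for col in shared:
            if pd.isna(df_merged.at[idx, col]) and not pd.isna(lookup.at[key, col]):
                df_merged.at[idx, col] = lookup.at[key, col]
                filled += 1
    return filled
```

A vectorised variant (e.g. aligning both frames with `set_index(keys)` and using `DataFrame.fillna`) would be faster on real data; the loop form simply mirrors the cell-level expectations the tests check.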
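
For completeness, this is how the new `symbol_normalizer` module is intended to be consumed by fetchers and mergers (the import path assumes `/app/src` is on `PYTHONPATH`, as the Dockerfile configures; under a plain checkout it would be `src.utils.symbol_normalizer`):

```python
from utils.symbol_normalizer import get_normalizer, normalize_symbol, normalize_symbol_list

normalize_symbol("XRP")                       # -> "ripple" (canonical Santiment slug)
normalize_symbol("DOGE")                      # unknown symbol: warning logged, returns "doge"
normalize_symbol_list(["BTC", "btc", "ETH"])  # -> ["bitcoin", "ethereum"] (deduplicated)

# Reverse lookup and runtime extension of the mapping table
get_normalizer().get_all_variants("bitcoin")  # -> ["bitcoin", "btc", "Bitcoin", "BTC"]
get_normalizer().add_mapping("doge", "dogecoin")
```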