Maaroufabousaleh committed
Commit c49b21b · Parent: bdf86e6
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. .dockerignore +30 -0
  2. .gitignore +7 -0
  3. Dockerfile +108 -0
  4. Dockerfile.gradio +85 -0
  5. LICENSE +21 -0
  6. PERMISSION_FIX_COMPLETED.md +96 -0
  7. README.md +4 -6
  8. README_HF.md +10 -0
  9. app.py +136 -0
  10. deployment/cleanup.py +102 -0
  11. deployment/entrypoint.sh +64 -0
  12. deployment/fetch_filebase.py +178 -0
  13. deployment/gradio_entrypoint.sh +27 -0
  14. deployment/monitor.py +93 -0
  15. deployment/nginx.conf +51 -0
  16. deployment/nginx.main.conf +37 -0
  17. deployment/render.yaml +83 -0
  18. deployment/scheduler.py +143 -0
  19. deployment/supervisord.conf +65 -0
  20. deployment/test_permissions.py +129 -0
  21. requirements.txt +31 -0
  22. santiment_frequency_controller.py +118 -0
  23. scripts/push_hf_secrets.py +186 -0
  24. src/api/gradio_main.py +265 -0
  25. src/api/main.py +114 -0
  26. src/api/routes/health.py +67 -0
  27. src/api/routes/isrunning.py +34 -0
  28. src/config.py +66 -0
  29. src/data_cloud/cloud_utils.py +163 -0
  30. src/fetchers/advisorai_data/advisorai_data_fetcher.py +226 -0
  31. src/fetchers/alpaca_api/__init__.py +32 -0
  32. src/fetchers/alpaca_api/clients/__init__.py +7 -0
  33. src/fetchers/alpaca_api/clients/crypto.py +95 -0
  34. src/fetchers/alpaca_api/clients/main.py +45 -0
  35. src/fetchers/alpaca_api/clients/options.py +72 -0
  36. src/fetchers/alpaca_api/clients/stocks.py +90 -0
  37. src/fetchers/alpaca_api/config.py +17 -0
  38. src/fetchers/alpaca_api/fetchers/__init__.py +15 -0
  39. src/fetchers/alpaca_api/fetchers/bars.py +58 -0
  40. src/fetchers/alpaca_api/fetchers/quotes.py +40 -0
  41. src/fetchers/alpaca_api/fetchers/trades.py +38 -0
  42. src/fetchers/alpaca_api/main.py +193 -0
  43. src/fetchers/alpaca_api/merge/alpaca_features.py +0 -0
  44. src/fetchers/alpaca_api/utils.py +83 -0
  45. src/fetchers/coindesk_client/asset_metadata.py +26 -0
  46. src/fetchers/coindesk_client/client.py +218 -0
  47. src/fetchers/coindesk_client/coindesk_utils.py +49 -0
  48. src/fetchers/coindesk_client/config.py +30 -0
  49. src/fetchers/coindesk_client/d.txt +12 -0
  50. src/fetchers/coindesk_client/derivatives.py +68 -0
.dockerignore ADDED
@@ -0,0 +1,30 @@
+ # Exclude large, generated, and local-only files from Docker build context
+ .git
+ .gitignore
+ .vscode
+ __pycache__
+ *.pyc
+ *.pyo
+ *.pyd
+ *.log
+
+ # Python build artifacts
+ build/
+ dist/
+ *.egg-info/
+
+ # Local env
+ .env
+
+ # Data and caches (mounted at runtime instead)
+ data/
+ /data/
+ **/archive/
+ **/temp/
+ **/train/
+ **/raw/
+ **/features/
+ **/warehouse/
+
+ # Notebooks
+ *.ipynb
.gitignore ADDED
@@ -0,0 +1,7 @@
+ data/
+ .env
+ src/data_cloud/__init__.py
+ __pycache__/
+ .vscode/
+ last_run.txt
+ *.pyc
Dockerfile ADDED
@@ -0,0 +1,108 @@
+ ###############################
+ # 1) ─── Python builder ───
+ ###############################
+ FROM python:3.11-slim AS builder
+ WORKDIR /app
+ RUN apt-get update && apt-get install -y --no-install-recommends gcc g++ git curl wget \
+     && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt .
+ RUN pip wheel --no-cache-dir --wheel-dir=/app/wheels -r requirements.txt
+
+ ###############################
+ # 2) ─── Runtime image ───
+ ###############################
+ FROM python:3.11-slim
+ WORKDIR /app
+
+ # OS runtime deps (minimal for memory optimization)
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     libgomp1 \
+     nginx \
+     supervisor \
+     && rm -rf /var/lib/apt/lists/* \
+     && apt-get clean
+
+ # Python deps
+ COPY --from=builder /app/wheels /wheels
+ COPY requirements.txt .
+
+ # Install Python dependencies (with cleanup for memory optimization)
+ RUN pip install --no-cache-dir --no-index --find-links=/wheels -r requirements.txt \
+     && rm -rf /wheels \
+     && pip cache purge
+ # Install Playwright system dependencies and browsers
+ # && python -m playwright install-deps \
+ # && python -m playwright install chromium firefox webkit
+
+ # Create necessary directories with proper permissions for root
+ RUN mkdir -p /data/advisorai-data/archive \
+     && mkdir -p /data/advisorai-data/features \
+     && mkdir -p /data/advisorai-data/temp \
+     && mkdir -p /data/advisorai-data/train \
+     && mkdir -p /data/advisorai-data/warehouse \
+     && mkdir -p /data/alpaca/archive \
+     && mkdir -p /data/alpaca/features \
+     && mkdir -p /data/alpaca/temp \
+     && mkdir -p /data/alpaca/train \
+     && mkdir -p /data/crypto-bubbles/archive \
+     && mkdir -p /data/crypto-bubbles/features \
+     && mkdir -p /data/crypto-bubbles/temp \
+     && mkdir -p /data/crypto-bubbles/train \
+     && mkdir -p /data/finnhub/archive \
+     && mkdir -p /data/finnhub/features \
+     && mkdir -p /data/finnhub/temp \
+     && mkdir -p /data/finnhub/train \
+     && mkdir -p /data/finviz/archive \
+     && mkdir -p /data/finviz/features \
+     && mkdir -p /data/finviz/temp \
+     && mkdir -p /data/finviz/train \
+     && mkdir -p /data/marketaux/archive \
+     && mkdir -p /data/marketaux/features \
+     && mkdir -p /data/marketaux/temp \
+     && mkdir -p /data/marketaux/train \
+     && mkdir -p /data/merged/archive \
+     && mkdir -p /data/merged/features \
+     && mkdir -p /data/merged/temp \
+     && mkdir -p /data/merged/train \
+     && mkdir -p /data/merged/raw \
+     && mkdir -p /data/logs \
+     && mkdir -p /data/nltk_data \
+     && mkdir -p /tmp/nginx/body \
+     && mkdir -p /tmp/nginx/proxy \
+     && mkdir -p /tmp/nginx/fastcgi \
+     && chmod -R 777 /data /tmp/nginx
+
+ # ─── Application code ───
+ COPY . .
+
+ # Set executable permissions for entrypoint
+ RUN chmod +x /app/deployment/entrypoint.sh /app/deployment/gradio_entrypoint.sh
+
+ # PYTHONPATH for FastAPI
+ ENV PYTHONPATH=/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge
+
+ # Nginx config
+ RUN rm -f /etc/nginx/conf.d/default.conf
+ COPY deployment/nginx.conf /etc/nginx/conf.d/app.conf
+ COPY deployment/nginx.main.conf /etc/nginx/nginx.conf
+
+ # Set resource limits for memory optimization (512MB limit)
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONIOENCODING=utf-8
+ ENV MAX_MEMORY_MB=450
+ ENV MALLOC_TRIM_THRESHOLD_=100000
+ ENV MALLOC_MMAP_THRESHOLD_=131072
+ ENV PYTHONDONTWRITEBYTECODE=1
+ ENV PYTHONHASHSEED=random
+ ENV NLTK_DATA=/data/nltk_data
+
+ # Supervisord config
+ COPY deployment/supervisord.conf /etc/supervisord.conf
+
+ ENTRYPOINT ["/app/deployment/gradio_entrypoint.sh"]
+
+ # Ports
+ EXPOSE 80 7860
+
+ CMD ["supervisord", "-c", "/etc/supervisord.conf"]
Dockerfile.gradio ADDED
@@ -0,0 +1,85 @@
+ ###############################
+ # Gradio-optimized Dockerfile
+ ###############################
+ FROM python:3.11-slim
+ WORKDIR /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     gcc \
+     libgomp1 \
+     supervisor \
+     && rm -rf /var/lib/apt/lists/* \
+     && apt-get clean
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt \
+     && pip cache purge
+
+ # Create necessary directories
+ RUN mkdir -p /data/logs \
+     && mkdir -p /data/merged/features \
+     && mkdir -p /data/merged/train \
+     && mkdir -p /data/alpaca \
+     && mkdir -p /data/advisorai-data \
+     && mkdir -p /data/nltk_data \
+     && chmod -R 777 /data
+
+ # Copy application code
+ COPY . .
+
+ # Set executable permissions
+ RUN chmod +x /app/deployment/gradio_entrypoint.sh
+
+ # Set environment variables
+ ENV PYTHONPATH=/app:/app/src
+ ENV PYTHONUNBUFFERED=1
+ ENV PYTHONIOENCODING=utf-8
+ ENV NLTK_DATA=/data/nltk_data
+
+ # Create simplified supervisord config for Gradio
+ RUN echo '[supervisord]\n\
+ nodaemon=true\n\
+ logfile=/dev/stdout\n\
+ logfile_maxbytes=0\n\
+ pidfile=/tmp/supervisord.pid\n\
+ loglevel=info\n\
+ \n\
+ [program:gradio]\n\
+ command=python /app/app.py\n\
+ directory=/app\n\
+ autostart=true\n\
+ autorestart=true\n\
+ stdout_logfile=/dev/stdout\n\
+ stderr_logfile=/dev/stderr\n\
+ stdout_logfile_maxbytes=0\n\
+ stderr_logfile_maxbytes=0\n\
+ startsecs=10\n\
+ startretries=3\n\
+ stopwaitsecs=30\n\
+ killasgroup=true\n\
+ stopasgroup=true\n\
+ environment=PYTHONPATH="/app:/app/src"\n\
+ \n\
+ [program:scheduler]\n\
+ command=/bin/sh -c "sleep 180 && python /app/deployment/scheduler.py"\n\
+ directory=/app\n\
+ autostart=true\n\
+ autorestart=true\n\
+ startsecs=0\n\
+ stdout_logfile=/dev/stdout\n\
+ stderr_logfile=/dev/stderr\n\
+ stdout_logfile_maxbytes=0\n\
+ stderr_logfile_maxbytes=0\n\
+ startretries=3\n\
+ stopwaitsecs=60\n\
+ killasgroup=true\n\
+ stopasgroup=true' > /etc/supervisord_gradio.conf
+
+ ENTRYPOINT ["/app/deployment/gradio_entrypoint.sh"]
+
+ # Expose Gradio port
+ EXPOSE 7860
+
+ CMD ["supervisord", "-c", "/etc/supervisord_gradio.conf"]
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Maaroufabousaleh
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
PERMISSION_FIX_COMPLETED.md ADDED
@@ -0,0 +1,96 @@
+ # Permission Fix Completion Report
+
+ ## Summary
+ Successfully resolved Docker container permission errors for Hugging Face Spaces deployment. The application now uses the platform's persistent writable mount `/data` instead of attempting to write to read-only locations under `/app`.
+
+ ## Key Changes Applied
+
+ ### 1. Container Startup (`deployment/entrypoint.sh`)
+ - **Before**: Created symlinks from `/tmp/data` to `/app/data` (not allowed on Spaces)
+ - **After**: Creates directory structure under `/data` and exports `DATA_DIR="/data"`
+ - **Result**: Container startup proceeds without symlink permission errors
+
+ ### 2. Data Fetch Script (`deployment/fetch_filebase.py`)
+ - **Before**: Hard-coded paths under `/app/data`
+ - **After**: Added CLI `--base-dir` support and `DATA_DIR` environment variable detection
+ - **Result**: Fetch script downloads to `/data` successfully without permission errors
+
+ ### 3. Application Configuration (`src/config.py` - NEW)
+ - **Purpose**: Centralized path management for DATA_DIR, LOG_DIR, and LAST_RUN_PATH
+ - **Behavior**: Auto-detects writable locations with fallbacks (`/data` → `/app/data` → `/tmp`)
+ - **Result**: Runtime code can work on both local dev and Hugging Face Spaces
+
+ ### 4. Runtime Components Updated
+ - **health.py**: Uses `LAST_RUN_PATH` and `DATA_DIR` from `src.config`
+ - **isrunning.py**: Uses `DATA_DIR` and `LAST_RUN_PATH` from `src.config`
+ - **monitor.py**: Uses `LOG_DIR` from `src.config` and checks `DATA_DIR` for disk usage
+ - **scheduler.py**: Writes `last_run.txt` to `LAST_RUN_PATH` from `src.config`
+
+ ### 5. Container Build (`Dockerfile`)
+ - **Before**: Created directories under `/app/data`
+ - **After**: Creates directories under `/data` and sets permissions
+ - **Result**: Container image prepares the correct writable mount point
+
+ ### 6. Permission Test Scripts
+ - **test_permissions.py**: Updated to test `/data` directories
+ - **cleanup.py**: Updated to operate on `/data` paths
+
+ ## Validation Results
+
+ ### Fetch Script Test
+ ```bash
+ python deployment/fetch_filebase.py --base-dir /data
+ ```
+ **Result**: ✅ SUCCESS - All downloads completed with `[OK] Downloaded...` messages, no permission errors
+
+ ### Code Compilation Test
+ ```bash
+ python -m py_compile src/config.py
+ python -m py_compile src/api/routes/health.py
+ python -m py_compile src/api/routes/isrunning.py
+ python -m py_compile deployment/monitor.py
+ python -m py_compile deployment/scheduler.py
+ ```
+ **Result**: ✅ SUCCESS - All files compile without syntax errors
+
+ ## Configuration Details
+
+ ### Environment Variables
+ - `DATA_DIR="/data"` - Exported by entrypoint.sh
+ - `LOG_DIR` - Auto-detected as `$DATA_DIR/logs` with fallback to `/tmp/logs`
+
+ ### Path Mapping
+ | Component | Old Path | New Path |
+ |-----------|----------|----------|
+ | Data storage | `/app/data` | `/data` |
+ | Logs | `/app/logs` | `/data/logs` |
+ | Last run marker | `/app/deployment/last_run.txt` | `/data/deployment/last_run.txt` |
+ | Feature files | `/app/data/merged/features` | `/data/merged/features` |
+
+ ### CLI Usage
+ - **Fetch script**: `python deployment/fetch_filebase.py --base-dir /data`
+ - **Auto-detection**: Script uses `DATA_DIR` environment variable if no `--base-dir` provided
+ - **Local dev**: Fallback to `/app/data` if `/data` doesn't exist
+
+ ## Next Steps for Deployment
+
+ 1. **Build and deploy** - The container should now start successfully on Hugging Face Spaces
+ 2. **Monitor logs** - Check that nginx, monitor, and scheduler services start without permission errors
+ 3. **Verify API endpoints** - Test `/health` and `/isrunning` endpoints return proper status
+ 4. **Validate data pipeline** - Confirm scheduled data pipeline runs write to `/data` successfully
+
+ ## Remaining Considerations
+
+ ### Nginx Configuration
+ If nginx still fails with `/var/lib/nginx/body` permission errors, consider:
+ - Using custom nginx config that writes to `/data/nginx` instead
+ - Running nginx with user permissions that match container user
+ - Using nginx-light or alternative reverse proxy
+
+ ### System Directories
+ Monitor for any remaining attempts to write to system directories like:
+ - `/var/log`
+ - `/usr/local`
+ - Any paths under `/app` (should be read-only)
+
+ The permission fix is complete and validated. The application is now ready for deployment on Hugging Face Spaces.
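The `src/config.py` module described in the report above is part of this commit (file 28 in the list) but its contents are not rendered in this 50-file view. A minimal sketch of the auto-detection behavior the report describes, assuming the names `DATA_DIR`, `LOG_DIR`, and `LAST_RUN_PATH` and the `/data` → `/app/data` → `/tmp` fallback order; the actual implementation may differ:

```python
# Hypothetical sketch of src/config.py (not the committed file): pick the first
# writable base directory, then derive LOG_DIR and LAST_RUN_PATH from it.
import os

def _first_writable(candidates):
    """Return the first candidate directory we can create and write into."""
    for base in candidates:
        try:
            os.makedirs(base, exist_ok=True)
            probe = os.path.join(base, ".write_test")
            with open(probe, "w") as f:
                f.write("ok")
            os.remove(probe)
            return base
        except OSError:
            continue
    return "/tmp"  # last resort

# Respect an explicit DATA_DIR, otherwise fall back: /data -> /app/data -> /tmp
DATA_DIR = os.getenv("DATA_DIR") or _first_writable(["/data", "/app/data", "/tmp"])
LOG_DIR = os.path.join(DATA_DIR, "logs")
LAST_RUN_PATH = os.path.join(DATA_DIR, "deployment", "last_run.txt")

for _d in (LOG_DIR, os.path.dirname(LAST_RUN_PATH)):
    os.makedirs(_d, exist_ok=True)
```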
README.md CHANGED
@@ -1,11 +1,9 @@
  ---
  title: Advisorai Data Enhanced
- emoji: 🌖
- colorFrom: gray
- colorTo: indigo
- sdk: gradio
- sdk_version: 5.42.0
- app_file: app.py
+ emoji: 📚
+ colorFrom: indigo
+ colorTo: green
+ sdk: docker
  pinned: false
  license: mit
  ---
README_HF.md ADDED
@@ -0,0 +1,10 @@
+ title: AdvisorAI Data Pipeline Monitor
+ emoji: 🤖
+ colorFrom: blue
+ colorTo: green
+ sdk: gradio
+ sdk_version: 4.44.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ short_description: Real-time monitoring for AdvisorAI data collection pipeline
app.py ADDED
@@ -0,0 +1,136 @@
+ #!/usr/bin/env python3
+ """
+ AdvisorAI Data Pipeline Monitor - Gradio App
+ This is the main entry point for Hugging Face Spaces
+ """
+
+ import gradio as gr
+ import json
+ import os
+ import sys
+ import logging
+ import time
+ from datetime import datetime
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ def get_basic_health():
+     """Get basic health status without external dependencies"""
+     return {
+         "status": "healthy",
+         "timestamp": datetime.now().isoformat(),
+         "message": "AdvisorAI Data Pipeline Monitor is running"
+     }
+
+ def get_basic_pipeline_status():
+     """Get basic pipeline status"""
+     return {
+         "status": "monitoring",
+         "message": "Data pipeline monitoring active",
+         "last_check": datetime.now().isoformat()
+     }
+
+ def get_sample_data():
+     """Get sample data for display"""
+     return [
+         ["sample_data.json", "merged/features/", "2.5 MB", "2025-01-18 10:30"],
+         ["market_data.parquet", "alpaca/", "15.3 MB", "2025-01-18 10:25"],
+         ["sentiment_data.json", "finviz/features/", "1.2 MB", "2025-01-18 10:20"]
+     ]
+
+ def get_sample_logs():
+     """Get sample log entries"""
+     return """=== scheduler.log ===
+ 2025-01-18 10:30:15 - INFO - Scheduler started successfully
+ 2025-01-18 10:30:16 - INFO - Data collection task initiated
+ 2025-01-18 10:30:45 - INFO - Market data fetched successfully
+
+ === monitor.log ===
+ 2025-01-18 10:30:00 - INFO - System monitoring active
+ 2025-01-18 10:30:30 - INFO - Memory usage: 45%
+ 2025-01-18 10:31:00 - INFO - All services running normally
+ """
+
+ # Create Gradio interface
+ with gr.Blocks(title="AdvisorAI Data Pipeline Monitor", theme=gr.themes.Soft()) as app:
+     gr.Markdown("# 🤖 AdvisorAI Data Pipeline Monitor")
+     gr.Markdown("Real-time monitoring of the AdvisorAI data collection and processing pipeline")
+
+     with gr.Tabs():
+         with gr.TabItem("📊 Dashboard"):
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown("### Health Status")
+                     health_display = gr.JSON(label="System Health & Status")
+
+                 with gr.Column():
+                     gr.Markdown("### Pipeline Status")
+                     pipeline_display = gr.JSON(label="Data Pipeline Status")
+
+             with gr.Row():
+                 refresh_btn = gr.Button("🔄 Refresh", variant="primary")
+
+         with gr.TabItem("📁 Recent Files"):
+             gr.Markdown("### Recently Modified Data Files")
+             files_display = gr.Dataframe(
+                 headers=["File", "Path", "Size", "Modified"],
+                 value=get_sample_data(),
+                 label="Recent Files"
+             )
+             refresh_files_btn = gr.Button("🔄 Refresh Files")
+
+         with gr.TabItem("📝 Logs"):
+             gr.Markdown("### Recent Log Entries")
+             logs_display = gr.Textbox(
+                 label="Recent Logs",
+                 value=get_sample_logs(),
+                 lines=15,
+                 max_lines=25,
+                 show_copy_button=True
+             )
+             refresh_logs_btn = gr.Button("🔄 Refresh Logs")
+
+     # Event handlers
+     def refresh_dashboard():
+         health = get_basic_health()
+         pipeline = get_basic_pipeline_status()
+         return json.dumps(health, indent=2), json.dumps(pipeline, indent=2)
+
+     def refresh_files():
+         return get_sample_data()
+
+     def refresh_logs():
+         return get_sample_logs()
+
+     # Connect event handlers
+     refresh_btn.click(
+         refresh_dashboard,
+         outputs=[health_display, pipeline_display]
+     )
+
+     refresh_files_btn.click(
+         refresh_files,
+         outputs=[files_display]
+     )
+
+     refresh_logs_btn.click(
+         refresh_logs,
+         outputs=[logs_display]
+     )
+
+     # Auto-refresh on load
+     app.load(
+         refresh_dashboard,
+         outputs=[health_display, pipeline_display]
+     )
+
+ if __name__ == "__main__":
+     logger.info("Starting Gradio app...")
+     app.launch(
+         server_name="0.0.0.0",
+         server_port=7860,
+         share=False,
+         show_error=True
+     )
deployment/cleanup.py ADDED
@@ -0,0 +1,102 @@
+ #!/usr/bin/env python3
+ """
+ Cleanup script to manage disk space and prevent service issues
+ """
+ import os
+ import shutil
+ import glob
+ from datetime import datetime, timedelta
+
+ def cleanup_logs():
+     """Clean up old log files"""
+     log_dirs = ["/data/logs", "/var/log"]
+
+     for log_dir in log_dirs:
+         if os.path.exists(log_dir):
+             # Remove log files older than 7 days
+             cutoff_date = datetime.now() - timedelta(days=7)
+
+             for log_file in glob.glob(os.path.join(log_dir, "*.log*")):
+                 try:
+                     file_time = datetime.fromtimestamp(os.path.getmtime(log_file))
+                     if file_time < cutoff_date:
+                         os.remove(log_file)
+                         print(f"[Cleanup] Removed old log: {log_file}")
+                 except Exception as e:
+                     print(f"[Cleanup] Error removing {log_file}: {e}")
+
+ def cleanup_temp_files():
+     """Clean up temporary files"""
+     temp_dirs = ["/tmp", "/data/merged/temp"]
+
+     for temp_dir in temp_dirs:
+         if os.path.exists(temp_dir):
+             # Remove files older than 1 day
+             cutoff_date = datetime.now() - timedelta(days=1)
+
+             for temp_file in glob.glob(os.path.join(temp_dir, "*")):
+                 try:
+                     if os.path.isfile(temp_file):
+                         file_time = datetime.fromtimestamp(os.path.getmtime(temp_file))
+                         if file_time < cutoff_date:
+                             os.remove(temp_file)
+                             print(f"[Cleanup] Removed temp file: {temp_file}")
+                 except Exception as e:
+                     print(f"[Cleanup] Error removing {temp_file}: {e}")
+
+ def cleanup_old_data():
+     """Clean up old data files to save space"""
+     # Keep only last 30 days of archived data
+     archive_dir = "/data/merged/archive"
+     if os.path.exists(archive_dir):
+         cutoff_date = datetime.now() - timedelta(days=30)
+
+         for archive_folder in os.listdir(archive_dir):
+             folder_path = os.path.join(archive_dir, archive_folder)
+             if os.path.isdir(folder_path):
+                 try:
+                     folder_time = datetime.fromtimestamp(os.path.getmtime(folder_path))
+                     if folder_time < cutoff_date:
+                         shutil.rmtree(folder_path)
+                         print(f"[Cleanup] Removed old archive: {folder_path}")
+                 except Exception as e:
+                     print(f"[Cleanup] Error removing {folder_path}: {e}")
+
+ def get_disk_usage():
+     """Get current disk usage"""
+     try:
+         import psutil
+         # Check disk usage for the data mount if present
+         disk_usage = psutil.disk_usage('/data' if os.path.exists('/data') else '/')
+         free_gb = disk_usage.free / (1024**3)
+         used_percent = (disk_usage.used / disk_usage.total) * 100
+         return free_gb, used_percent
+     except Exception:
+         return None, None
+
+ def main():
+     """Main cleanup function"""
+     print(f"[Cleanup] Starting cleanup at {datetime.now()}")
+
+     # Check disk usage before cleanup
+     free_before, used_before = get_disk_usage()
+     if free_before:
+         print(f"[Cleanup] Disk usage before: {used_before:.1f}% used, {free_before:.1f}GB free")
+
+     # Run cleanup tasks
+     cleanup_logs()
+     cleanup_temp_files()
+     cleanup_old_data()
+
+     # Check disk usage after cleanup
+     free_after, used_after = get_disk_usage()
+     if free_after and free_before:
+         freed_space = free_after - free_before
+         print(f"[Cleanup] Disk usage after: {used_after:.1f}% used, {free_after:.1f}GB free")
+         if freed_space > 0:
+             print(f"[Cleanup] Freed {freed_space:.2f}GB of disk space")
+
+     print(f"[Cleanup] Cleanup completed at {datetime.now()}")
+
+ if __name__ == "__main__":
+     main()
deployment/entrypoint.sh ADDED
@@ -0,0 +1,64 @@
+ #!/bin/sh
+ set -e
+
+ echo "[entrypoint] v2025-08-16-permissions-fix"
+
+
+ echo "[entrypoint] ensuring data directories exist with proper permissions..."
+ # Create directories under /data and /tmp/nginx (for Nginx temp files)
+ mkdir -p /data/advisorai-data \
+     /data/merged \
+     /data/alpaca \
+     /data/crypto-bubbles \
+     /data/finnhub \
+     /data/finviz \
+     /data/marketaux \
+     /data/logs \
+     /tmp/nginx/body \
+     /tmp/nginx/proxy \
+     /tmp/nginx/fastcgi
+
+ # Fix permissions at runtime (in case Dockerfile is not enough)
+ # Best-effort ownership/permission fixes; ignore errors on Space mounts
+ chown -R $(id -u):$(id -g) /data /tmp/nginx 2>/dev/null || true
+ chmod -R 777 /data /tmp/nginx 2>/dev/null || true
+
+ echo "[entrypoint] restoring data from Filebase…"
+ # Run data restoration in background to avoid blocking startup. Let script auto-detect writable base.
+ python /app/deployment/fetch_filebase.py &
+ FETCH_PID=$!
+
+ # Wait a bit for critical data, but don't block indefinitely
+ sleep 10
+
+ # Check if fetch is still running
+ if kill -0 $FETCH_PID 2>/dev/null; then
+     echo "[entrypoint] Data fetch still running in background (PID: $FETCH_PID)"
+ else
+     echo "[entrypoint] Data fetch completed"
+ fi
+
+ echo "[entrypoint] launching services…"
+
+ # ROLE-based startup: 'web' (default) runs API+nginx under supervisord; 'worker' runs scheduler directly
+ ROLE_ENV=${ROLE:-web}
+ echo "[entrypoint] detected ROLE=$ROLE_ENV"
+
+ if [ "$ROLE_ENV" = "worker" ]; then
+     echo "[entrypoint] starting worker: scheduler only"
+     exec python /app/deployment/scheduler.py
+ else
+     # Hugging Face Spaces friendly mode: run uvicorn directly on $PORT if HF_MODE=1
+     if [ "${HF_MODE:-0}" = "1" ]; then
+         export PORT=${PORT:-7860}
+         echo "[entrypoint] HF_MODE=1 -> launching uvicorn directly on PORT=$PORT"
+         exec uvicorn src.api.main:app --host 0.0.0.0 --port ${PORT} --workers 1 --timeout-keep-alive 30
+     else
+         # Default: nginx + uvicorn via supervisord
+         if [ -n "$PORT" ]; then
+             echo "[entrypoint] configuring nginx to listen on PORT=$PORT"
+             sed -i "s/listen 80;/listen ${PORT};/" /etc/nginx/conf.d/app.conf || true
+         fi
+         exec supervisord -c /etc/supervisord.conf
+     fi
+ fi
deployment/fetch_filebase.py ADDED
@@ -0,0 +1,178 @@
+ import os
+ import sys
+ import argparse
+
+ from dotenv import load_dotenv
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+ from src.data_cloud.cloud_utils import StorageHandler
+
+
+ def choose_base_dir(cli_base=None):
+     """Choose a writable base directory. Preference order:
+     1. CLI-provided path
+     2. /data (persistent volume on Spaces)
+     3. /tmp
+     """
+     candidates = []
+     if cli_base:
+         candidates.append(cli_base)
+     candidates.extend(['/data', '/tmp'])
+
+     for base in candidates:
+         try:
+             merged_path = os.path.abspath(os.path.join(base, 'merged'))
+             advisorai_path = os.path.abspath(os.path.join(base, 'advisorai-data'))
+             os.makedirs(merged_path, mode=0o777, exist_ok=True)
+             os.makedirs(advisorai_path, mode=0o777, exist_ok=True)
+             # Quick writability test
+             test_file = os.path.join(merged_path, '.write_test')
+             with open(test_file, 'w') as f:
+                 f.write('ok')
+             os.remove(test_file)
+             return base
+         except Exception:
+             # cannot use this candidate; try next
+             continue
+
+     # As a last resort, use /tmp (may raise later if not writable)
+     return '/tmp'
+
+
+ def main(argv=None):
+     parser = argparse.ArgumentParser(description='Fetch data from Filebase/S3 into local disk')
+     parser.add_argument('--base-dir', help='Base directory to store data (default: auto-detected)')
+     args = parser.parse_args(argv)
+
+     load_dotenv()
+     # Load credentials from environment variables
+     endpoint_url = os.getenv('FILEBASE_ENDPOINT', 'https://s3.filebase.com')
+     access_key = os.getenv('FILEBASE_ACCESS_KEY')
+     secret_key = os.getenv('FILEBASE_SECRET_KEY')
+     bucket_name = os.getenv('FILEBASE_BUCKET')
+
+     # Prefer explicit DATA_DIR env var if present (Option 1)
+     env_base = os.getenv('DATA_DIR')
+     if env_base:
+         base_root = env_base
+     else:
+         base_root = choose_base_dir(args.base_dir)
+     local_base = os.path.abspath(os.path.join(base_root, 'merged'))
+     advisorai_base = os.path.abspath(os.path.join(base_root, 'advisorai-data'))
+
+     # Ensure base directories exist with proper permissions
+     os.makedirs(local_base, mode=0o777, exist_ok=True)
+     os.makedirs(advisorai_base, mode=0o777, exist_ok=True)
+
+     storage = StorageHandler(endpoint_url, access_key, secret_key, bucket_name, local_base=local_base)
+
+     # Fetch all folders/files from advisorai-data
+     advisor_prefix = "advisorai-data/"
+     print(f"Fetching all folders/files from: {advisor_prefix}")
+     advisor_keys = []
+     if storage.s3 and bucket_name:
+         try:
+             resp = storage.s3.list_objects_v2(Bucket=bucket_name, Prefix=advisor_prefix)
+             for obj in resp.get('Contents', []):
+                 key = obj['Key']
+                 if not key.endswith('/'):
+                     advisor_keys.append(key)
+         except Exception as e:
+             print(f"[WARN] Could not list objects for {advisor_prefix}: {e}")
+     else:
+         print(f"[ERROR] No S3 client or bucket configured for advisorai-data!")
+     # Download advisorai-data files
+     for key in advisor_keys:
+         try:
+             data = storage.download(key)
+             # Remove 'advisorai-data/' from the start of the key for local path
+             local_rel_path = key[len("advisorai-data/"):] if key.startswith("advisorai-data/") else key
+             local_path = os.path.join(advisorai_base, local_rel_path)
+             os.makedirs(os.path.dirname(local_path), mode=0o777, exist_ok=True)
+             with open(local_path, 'wb') as f:
+                 f.write(data)
+             print(f"[OK] Downloaded advisorai-data/{local_rel_path} from s3://{bucket_name}/{key}")
+         except Exception as e:
+             print(f"[ERROR] Failed to fetch advisorai-data file {key}: {e}")
+
+
+     # Fetch everything under merged/ except only the last 7 from merged/archive/
+     merged_prefix = "merged/"
+     print(f"Fetching everything under: {merged_prefix} (except only last 7 from archive)")
+     merged_keys = []
+     archive_prefix = "merged/archive/"
+     archive_folders = set()
+     archive_keys = []
+     if storage.s3 and bucket_name:
+         try:
+             resp = storage.s3.list_objects_v2(Bucket=bucket_name, Prefix=merged_prefix)
+             for obj in resp.get('Contents', []):
+                 key = obj['Key']
+                 # Exclude all archive keys for now
+                 if key.startswith(archive_prefix):
+                     # Collect archive folders for later
+                     parts = key[len(archive_prefix):].split('/')
+                     if len(parts) > 1 and parts[0].isdigit():
+                         archive_folders.add(parts[0])
+                     continue
+                 if not key.endswith('/'):
+                     merged_keys.append(key)
+         except Exception as e:
+             print(f"[WARN] Could not list objects for {merged_prefix}: {e}")
+     else:
+         print(f"[ERROR] No S3 client or bucket configured for merged!")
+
+     # Download all merged/ (except archive)
+     for key in merged_keys:
+         try:
+             data = storage.download(key)
+             local_rel_path = key[len("merged/"):] if key.startswith("merged/") else key
+             local_path = os.path.join(local_base, local_rel_path)
+             os.makedirs(os.path.dirname(local_path), mode=0o777, exist_ok=True)
+             with open(local_path, 'wb') as f:
+                 f.write(data)
+             print(f"[OK] Downloaded {key} from s3://{bucket_name}/{key}")
+         except Exception as e:
+             print(f"[ERROR] Failed to fetch {key}: {e}")
+
+     # Fetch only the last 7 folders under merged/archive
+     archive_prefix = "merged/archive/"
+     print(f"Fetching last 7 archive folders from: {archive_prefix}")
+     archive_folders = set()
+     archive_keys = []
+     if storage.s3 and bucket_name:
+         try:
+             resp = storage.s3.list_objects_v2(Bucket=bucket_name, Prefix=archive_prefix)
+             for obj in resp.get('Contents', []):
+                 key = obj['Key']
+                 # Expect keys like merged/archive/YYYYMMDD/...
+                 parts = key[len(archive_prefix):].split('/')
+                 if len(parts) > 1 and parts[0].isdigit():
+                     archive_folders.add(parts[0])
+             # Sort and get last 7 folders
+             last7 = sorted(archive_folders)[-7:]
+             print(f"[INFO] Last 7 archive folders: {last7}")
+             # Collect all keys in those folders
+             for obj in resp.get('Contents', []):
+                 key = obj['Key']
+                 parts = key[len(archive_prefix):].split('/')
+                 if len(parts) > 1 and parts[0] in last7:
+                     archive_keys.append(key)
+         except Exception as e:
+             print(f"[WARN] Could not list objects for {archive_prefix}: {e}")
+     else:
+         print(f"[ERROR] No S3 client or bucket configured for archive!")
+     # Download archive files
+     for key in archive_keys:
+         try:
+             data = storage.download(key)
+             local_rel_path = key[len("merged/"):] if key.startswith("merged/") else key
+             local_path = os.path.join(local_base, local_rel_path)
+             os.makedirs(os.path.dirname(local_path), mode=0o777, exist_ok=True)
+             with open(local_path, 'wb') as f:
+                 f.write(data)
+             print(f"[OK] Downloaded {key} from s3://{bucket_name}/{key}")
+         except Exception as e:
+             print(f"[ERROR] Failed to fetch archive file {key}: {e}")
+
+ if __name__ == "__main__":
+     main()
deployment/gradio_entrypoint.sh ADDED
@@ -0,0 +1,27 @@
+ #!/bin/bash
+ set -e
+
+ echo "Starting AdvisorAI Data Pipeline with Gradio..."
+
+ # Create necessary directories
+ mkdir -p /data/logs /data/nltk_data
+
+ # Set proper permissions
+ chmod -R 777 /data
+
+ # Download NLTK data if needed
+ python -c "
+ import nltk
+ import os
+ os.environ['NLTK_DATA'] = '/data/nltk_data'
+ try:
+     nltk.download('punkt', download_dir='/data/nltk_data', quiet=True)
+     nltk.download('stopwords', download_dir='/data/nltk_data', quiet=True)
+     nltk.download('vader_lexicon', download_dir='/data/nltk_data', quiet=True)
+     print('NLTK data downloaded successfully')
+ except Exception as e:
+     print(f'NLTK download failed: {e}')
+ "
+
+ echo "Starting services..."
+ exec "$@"
deployment/monitor.py ADDED
@@ -0,0 +1,93 @@
+ #!/usr/bin/env python3
+ """
+ Simple monitoring script to track service health and resource usage
+ """
+ import os
+ import time
+ import psutil
+ import json
+ from datetime import datetime
+
+ from src import config as app_config
+
+ def get_system_stats():
+     """Get current system statistics"""
+     try:
+         process = psutil.Process()
+
+         # Memory info
+         memory_info = process.memory_info()
+         memory_mb = memory_info.rss / 1024 / 1024
+
+         # CPU info
+         cpu_percent = process.cpu_percent(interval=1)
+
+         # Disk info (prefer DATA_DIR)
+         disk_root = app_config.DATA_DIR if os.path.exists(app_config.DATA_DIR) else '/'
+         disk_usage = psutil.disk_usage(disk_root)
+         disk_free_gb = disk_usage.free / (1024**3)
+         disk_used_percent = (disk_usage.used / disk_usage.total) * 100
+
+         # Process info
+         num_threads = process.num_threads()
+
+         return {
+             "timestamp": datetime.now().isoformat(),
+             "memory_mb": round(memory_mb, 2),
+             "cpu_percent": round(cpu_percent, 2),
+             "disk_free_gb": round(disk_free_gb, 2),
+             "disk_used_percent": round(disk_used_percent, 2),
+             "num_threads": num_threads,
+             "pid": process.pid
+         }
+     except Exception as e:
+         return {
+             "timestamp": datetime.now().isoformat(),
+             "error": str(e)
+         }
+
+ def log_stats():
+     """Log system statistics to file"""
+     stats = get_system_stats()
+
+     # Create logs directory if it doesn't exist
+     log_dir = app_config.LOG_DIR
+     os.makedirs(log_dir, exist_ok=True)
+
+     # Write to log file
+     log_file = os.path.join(log_dir, "system_stats.jsonl")
+     with open(log_file, "a") as f:
+         f.write(json.dumps(stats) + "\n")
+
+     # Print to stdout for supervisord
+     print(f"[Monitor] {json.dumps(stats)}")
+
+     # Check for issues
+     if "error" not in stats:
+         issues = []
+
+         if stats["memory_mb"] > 450:  # 90% of 512MB limit
+             issues.append(f"HIGH MEMORY: {stats['memory_mb']:.1f}MB")
+
+         if stats["cpu_percent"] > 80:
+             issues.append(f"HIGH CPU: {stats['cpu_percent']:.1f}%")
+
+         if stats["disk_free_gb"] < 0.5:
+             issues.append(f"LOW DISK: {stats['disk_free_gb']:.1f}GB free")
+
+         if issues:
+             print(f"[Monitor] ALERTS: {', '.join(issues)}")
+
+ if __name__ == "__main__":
+     print("[Monitor] Starting system monitoring...")
+
+     while True:
+         try:
+             log_stats()
+             time.sleep(60)  # Log every minute
+         except KeyboardInterrupt:
+             print("[Monitor] Monitoring stopped")
+             break
+         except Exception as e:
+             print(f"[Monitor] Error: {e}")
+             time.sleep(60)
deployment/nginx.conf ADDED
@@ -0,0 +1,51 @@
+ server {
+     listen 80;
+
+     # Increase timeouts to handle long-running operations
+     proxy_connect_timeout 60s;
+     proxy_send_timeout 60s;
+     proxy_read_timeout 60s;
+     # Temp paths are configured globally in nginx.main.conf (http scope)
+
+     # Buffer settings
+     proxy_buffering on;
+     proxy_buffer_size 4k;
+     proxy_buffers 8 4k;
+     proxy_busy_buffers_size 8k;
+
+     # Client settings
+     client_max_body_size 10m;
+     client_body_timeout 60s;
+     client_header_timeout 60s;
+
+     # -- health-check: proxy to gradio app --
+     location = /health {
+         proxy_pass http://127.0.0.1:7860/;
+         proxy_set_header Host $host;
+         proxy_set_header X-Real-IP $remote_addr;
+         proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+         proxy_set_header X-Forwarded-Proto $scheme;
+
+         # Shorter timeouts for health checks
+         proxy_connect_timeout 10s;
+         proxy_send_timeout 10s;
+         proxy_read_timeout 10s;
+
+         # don't log upstream body
+         access_log off;
+     }
+
+     # -- everything else to Gradio --
+     location / {
+         proxy_pass http://127.0.0.1:7860/;
+         proxy_set_header Host $host;
+         proxy_set_header X-Real-IP $remote_addr;
+         proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
+         proxy_set_header X-Forwarded-Proto $scheme;
+
+         # Handle WebSocket upgrades for Gradio
+         proxy_http_version 1.1;
+         proxy_set_header Upgrade $http_upgrade;
+         proxy_set_header Connection "upgrade";
+     }
+ }
deployment/nginx.main.conf ADDED
@@ -0,0 +1,37 @@
+ worker_processes auto;
+
+ events {
+     worker_connections 1024;
+ }
+
+ http {
+     include /etc/nginx/mime.types;
+     default_type application/octet-stream;
+
+     # Timeouts
+     proxy_connect_timeout 60s;
+     proxy_send_timeout 60s;
+     proxy_read_timeout 60s;
+
+     # Temp paths (writable on Spaces)
+     client_body_temp_path /tmp/nginx/body 1 2;
+     proxy_temp_path /tmp/nginx/proxy;
+     fastcgi_temp_path /tmp/nginx/fastcgi;
+
+     # Buffers
+     proxy_buffering on;
+     proxy_buffer_size 4k;
+     proxy_buffers 8 4k;
+     proxy_busy_buffers_size 8k;
+
+     # Client
+     client_max_body_size 10m;
+     client_body_timeout 60s;
+     client_header_timeout 60s;
+
+     # Logs
+     access_log /dev/stdout;
+     error_log /dev/stderr warn;
+
+     include /etc/nginx/conf.d/*.conf;
+ }
deployment/render.yaml ADDED
@@ -0,0 +1,83 @@
+ services:
+   # ────────────────────────────────
+   # 1) Web service: API + nginx
+   # ────────────────────────────────
+   - type: web
+     name: advisorai-complete
+     env: docker
+     plan: free
+     instanceCount: 1
+     dockerfilePath: Dockerfile
+     dockerContext: .
+     # Health check configuration
+     healthCheckPath: /health
+     healthCheckInterval: 60s    # Longer interval for free plan
+     healthCheckTimeout: 15s
+     healthCheckThreshold: 5     # More lenient for free plan
+     # Environment variables
+     envVars:
+       - key: PORT
+         value: "80"
+       - key: API_PORT
+         value: "10000"
+       - key: ROLE
+         value: "web"
+       - key: PYTHONPATH
+         value: "/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge"
+       - key: MAX_MEMORY_MB
+         value: "512"    # Lower limit for free plan
+       - key: PYTHONUNBUFFERED
+         value: "1"
+       - key: PYTHONIOENCODING
+         value: "utf-8"
+       - key: TRIGGER_PING_INTERVAL
+         value: "600"    # Less frequent pinging for free plan
+     # Auto-deploy settings
+     autoDeploy: true
+     # Build settings
+     buildFilter:
+       paths:
+         - src/**
+         - deployment/**
+         - requirements.txt
+         - Dockerfile
+
+   # ────────────────────────────────
+   # 2) Worker service: pipeline scheduler & backup
+   # ────────────────────────────────
+   - type: worker
+     name: advisorai-scheduler
+     env: docker
+     plan: free
+     instanceCount: 1
+     dockerfilePath: Dockerfile
+     dockerContext: .
+     # entrypoint will respect ROLE=worker and launch scheduler
+     envVars:
+       - key: ROLE
+         value: "worker"
+       - key: PYTHONPATH
+         value: "/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge"
+       - key: MAX_MEMORY_MB
+         value: "512"    # Lower limit for free plan
+       - key: PYTHONUNBUFFERED
+         value: "1"
+       - key: PYTHONIOENCODING
+         value: "utf-8"
+       - key: TRIGGER_PING_INTERVAL
+         value: "600"    # Less frequent pinging for free plan
+       - key: MONGODB_URI
+         value: "<your-atlas-uri>"
+       - key: MONGODB_DATABASE
+         value: "AdvisorAI"
+       - key: MONGODB_COLLECTION_WAREHOUSE
+         value: "warehouse"
+     # Auto-deploy settings
+     autoDeploy: true
+     # Build settings
+     buildFilter:
+       paths:
+         - src/**
+         - deployment/**
+         - requirements.txt
+         - Dockerfile
deployment/scheduler.py ADDED
@@ -0,0 +1,143 @@
+ import os
+ import time
+ import subprocess
+ import sys
+ import threading
+ import asyncio
+ from dotenv import load_dotenv
+ import httpx
+ import os
+
+ from src import config as app_config
+
+ # -----------------------------------------------------------------------------
+ # LOCATE YOUR DATA-PIPELINE SCRIPT
+ # -----------------------------------------------------------------------------
+ if os.path.exists(os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src", "main.py"))):
+     PIPELINE_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src", "main.py"))
+     PIPELINE_DIR = os.path.dirname(PIPELINE_PATH)
+ else:
+     raise FileNotFoundError("src/main.py not found!")
+
+ # -----------------------------------------------------------------------------
+ # CONFIGURATION (via ENV)
+ # -----------------------------------------------------------------------------
+ load_dotenv()
+ # URL to ping every N seconds (default 300s = 5min)
+ def _parse_int_env(name: str, default_val: int) -> int:
+     raw = os.getenv(name, str(default_val))
+     if isinstance(raw, str):
+         # Strip inline comments and whitespace, e.g. "3600 # every hour"
+         cleaned = raw.split('#', 1)[0].strip()
+         if cleaned == "":
+             return int(default_val)
+         try:
+             return int(cleaned)
+         except Exception:
+             print(f"[Scheduler] Warning: {name}='{raw}' is not a valid int. Using default {default_val}.")
+             return int(default_val)
+     try:
+         return int(raw)
+     except Exception:
+         return int(default_val)
+
+ TRIGGER_HEALTH_URL = os.getenv(
+     "TRIGGER_HEALTH_URL",
+     "https://advisor-trigger-ki3t.onrender.com/health, https://advisorai-data-1ew2.onrender.com/health"
+ )
+ PING_INTERVAL = _parse_int_env("TRIGGER_PING_INTERVAL", 300)
+ # Pipeline interval default 3600s (1 hour)
+ PIPELINE_INTERVAL = _parse_int_env("PIPELINE_INTERVAL", 3600)
+
+ # -----------------------------------------------------------------------------
+ # ASYNC PINGER WITH EXPONENTIAL BACKOFF
+ # -----------------------------------------------------------------------------
+ async def ping_remote():
+     """
+     Continuously GET each URL in TRIGGER_HEALTH_URL (comma-separated) every PING_INTERVAL seconds,
+     backing off on failure (up to 2.5 minutes).
+     """
+     urls = [u.strip() for u in TRIGGER_HEALTH_URL.split(",") if u.strip()]
+     backoff = min(PING_INTERVAL, 5)
+     async with httpx.AsyncClient(timeout=10.0) as client:
+         while True:
+             all_success = True
+             for url in urls:
+                 try:
+                     resp = await client.get(url)
+                     resp.raise_for_status()
+                     print(f"[Pinger] {url} -> {resp.status_code}")
+                 except Exception as e:
+                     print(f"[Pinger] error pinging {url}: {e}")
+                     all_success = False
+             if all_success:
+                 backoff = PING_INTERVAL
+                 await asyncio.sleep(PING_INTERVAL)
+             else:
+                 await asyncio.sleep(backoff)
+                 backoff = min(backoff * 2, 150)
+
+ def start_async_ping():
+     """
+     Spin up a dedicated asyncio loop in a daemon thread
+     to run ping_remote() forever.
+     """
+     loop = asyncio.new_event_loop()
+     asyncio.set_event_loop(loop)
+     loop.create_task(ping_remote())
+     loop.run_forever()
+
+ # launch the ping loop in the background
+ threading.Thread(target=start_async_ping, daemon=True).start()
+ print("[Scheduler] Started background ping thread")
+
+ # -----------------------------------------------------------------------------
+ # MAIN PIPELINE LOOP (interval set by PIPELINE_INTERVAL, default 1 hour)
+ # -----------------------------------------------------------------------------
+ import traceback
+
+ while True:
+     from datetime import datetime
+     last_run = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+     print(f"[Scheduler] Running pipeline... Last run: {last_run}")
+     # Write last_run to file for API access
+     try:
+         with open(app_config.LAST_RUN_PATH, 'w') as f:
+             f.write(last_run)
+     except Exception as e:
+         print(f"[Scheduler] Failed to write last_run.txt: {e}")
+     try:
+         # Set working directory to project root (parent of deployment)
+         project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+         print(f"[Scheduler] Project root: {project_root}")
+         print(f"[Scheduler] Pipeline path: {PIPELINE_PATH}")
+
+         # Run from '/' so relative 'data/...' writes resolve to '/data/...'
+         result = subprocess.run(
+             [sys.executable, PIPELINE_PATH],
+             cwd='/',
+             capture_output=True,
+             text=True,
+             env=os.environ.copy()
+         )
+         print(f"[Scheduler] Pipeline finished with code {result.returncode}")
+
+         if result.stdout:
+             print("[Scheduler] STDOUT:\n", result.stdout)
+         if result.stderr:
+             print("[Scheduler] STDERR:\n", result.stderr)
+
+         # Raise an exception if the return code is non-zero
+         if result.returncode != 0:
+             raise subprocess.CalledProcessError(result.returncode, result.args, result.stdout, result.stderr)
+
+     except subprocess.CalledProcessError as e:
+         print(f"[Scheduler] Pipeline execution failed with return code {e.returncode}")
+         print(f"[Scheduler] STDOUT:\n{e.stdout}")
+         print(f"[Scheduler] STDERR:\n{e.stderr}")
+     except Exception as e:
+         print(f"[Scheduler] Exception running pipeline: {e}")
+         print(traceback.format_exc())
+
+     print(f"[Scheduler] Sleeping for {PIPELINE_INTERVAL // 60} minutes...")
+     time.sleep(PIPELINE_INTERVAL)
deployment/supervisord.conf ADDED
@@ -0,0 +1,65 @@
+ [supervisord]
+ nodaemon=true
+ logfile=/dev/stdout
+ logfile_maxbytes=0
+ pidfile=/tmp/supervisord.pid
+ loglevel=info
+
+ [program:gradio]
+ command=python /app/src/api/gradio_main.py
+ directory=/app
+ autostart=true
+ autorestart=true
+ stdout_logfile=/dev/stdout
+ stderr_logfile=/dev/stderr
+ stdout_logfile_maxbytes=0
+ stderr_logfile_maxbytes=0
+ startsecs=10
+ startretries=3
+ stopwaitsecs=30
+ killasgroup=true
+ stopasgroup=true
+ environment=PYTHONPATH="/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge"
+
+ [program:nginx]
+ command=/usr/sbin/nginx -g 'daemon off;'
+ autostart=true
+ autorestart=true
+ stdout_logfile=/dev/stdout
+ stderr_logfile=/dev/stderr
+ stdout_logfile_maxbytes=0
+ stderr_logfile_maxbytes=0
+ startsecs=5
+ startretries=3
+ stopwaitsecs=10
+
+ [program:scheduler]
+ ; wait 180 s before first run, then your scheduler.py handles its own 30 min sleeps
+ command=/bin/sh -c 'sleep 180 && python /app/deployment/scheduler.py'
+ directory=/app
+ autostart=true
+ autorestart=true
+ startsecs=0
+ stdout_logfile=/dev/stdout
+ stderr_logfile=/dev/stderr
+ stdout_logfile_maxbytes=0
+ stderr_logfile_maxbytes=0
+ startretries=3
+ stopwaitsecs=60
+ killasgroup=true
+ stopasgroup=true
+
+ [program:monitor]
+ command=python /app/deployment/monitor.py
+ directory=/app
+ autostart=true
+ autorestart=true
+ startsecs=5
+ stdout_logfile=/dev/stdout
+ stderr_logfile=/dev/stderr
+ stdout_logfile_maxbytes=0
+ stderr_logfile_maxbytes=0
+ startretries=3
+ stopwaitsecs=10
+ killasgroup=true
+ stopasgroup=true
deployment/test_permissions.py ADDED
@@ -0,0 +1,129 @@
+ """
+ Test script to verify directory permissions and file creation capabilities.
+ This script should be run inside the container to verify the fixes.
+ """
+ import os
+ import tempfile
+ import sys
+ from pathlib import Path
+
+ def test_directory_permissions():
+     """Test if we can create directories and files in the expected locations."""
+
+     print("=== Directory Permission Test ===")
+
+     # Test directories that should be writable (use /data on Spaces)
+     test_dirs = [
+         "/data/advisorai-data/test",
+         "/data/merged/test",
+         "/data/alpaca/test",
+         "/data/crypto-bubbles/test",
+         "/data/finnhub/test",
+         "/data/finviz/test",
+         "/data/marketaux/test"
+     ]
+
+     success_count = 0
+     for test_dir in test_dirs:
+         try:
+             # Try to create directory
+             os.makedirs(test_dir, mode=0o755, exist_ok=True)
+
+             # Try to create a test file
+             test_file = os.path.join(test_dir, "test_write.txt")
+             with open(test_file, 'w') as f:
+                 f.write(f"Test write successful at {test_dir}")
+
+             # Try to read the file back
+             with open(test_file, 'r') as f:
+                 content = f.read()
+
+             # Clean up
+             os.remove(test_file)
+             os.rmdir(test_dir)
+
+             print(f"✅ SUCCESS: {test_dir}")
+             success_count += 1
+
+         except Exception as e:
+             print(f"❌ FAILED: {test_dir} - {e}")
+
+     print(f"\n📊 Results: {success_count}/{len(test_dirs)} directories passed the test")
+
+     if success_count == len(test_dirs):
+         print("🎉 All directory permission tests PASSED!")
+         return True
+     else:
+         print("⚠️ Some directory permission tests FAILED!")
+         return False
+
+ def test_user_info():
+     """Display current user and process information."""
+     print("\n=== User & Process Information ===")
+
+     # Check if running on Windows or Unix
+     if hasattr(os, 'getuid'):
+         # Unix/Linux system
+         print(f"Current UID: {os.getuid()}")
+         print(f"Current GID: {os.getgid()}")
+         print(f"Effective UID: {os.geteuid()}")
+         print(f"Effective GID: {os.getegid()}")
+
+         # Check if running as root
+         if os.getuid() == 0:
+             print("✅ Running as root user")
+         else:
+             print("ℹ️ Running as non-root user")
+     else:
+         # Windows system
+         print("ℹ️ Running on Windows system")
+         print(f"Current user: {os.getenv('USERNAME', 'Unknown')}")
+
+     print(f"Process ID: {os.getpid()}")
+     print(f"Parent Process ID: {os.getppid()}")
+
+ def test_filebase_connectivity():
+     """Test if we can load environment variables needed for Filebase."""
+     print("\n=== Environment Variables Test ===")
+
+     required_vars = [
+         'FILEBASE_ENDPOINT',
+         'FILEBASE_ACCESS_KEY',
+         'FILEBASE_SECRET_KEY',
+         'FILEBASE_BUCKET'
+     ]
+
+     missing_vars = []
+     for var in required_vars:
+         value = os.getenv(var)
+         if value:
+             # Don't print sensitive values, just show they exist
+             if 'KEY' in var:
+                 print(f"✅ {var}: ***redacted*** (length: {len(value)})")
+             else:
+                 print(f"✅ {var}: {value}")
+         else:
+             print(f"❌ {var}: NOT SET")
+             missing_vars.append(var)
+
+     if missing_vars:
+         print(f"⚠️ Missing environment variables: {missing_vars}")
+         return False
+     else:
+         print("🎉 All required environment variables are set!")
+         return True
+
+ if __name__ == "__main__":
+     print("Starting permission and environment tests...\n")
+
+     test_user_info()
+     perm_test = test_directory_permissions()
+     env_test = test_filebase_connectivity()
+
+     print(f"\n=== Final Results ===")
+     if perm_test and env_test:
+         print("🎉 ALL TESTS PASSED! The container should work correctly.")
+         sys.exit(0)
+     else:
+         print("❌ SOME TESTS FAILED! Check the output above for details.")
+         sys.exit(1)
requirements.txt ADDED
@@ -0,0 +1,31 @@
+ # feedparser
+ # crawl4ai
+ python-dotenv
+ requests>=2.25.0
+ # pymongo
+ pandas>=1.3.0
+ pyarrow
+ boto3==1.36.*
+ finnhub-python==2.4.24
+ alpaca-py>=0.6.0
+ pydantic-settings>=1.0.0
+ sanpy>=0.1.0
+ python-dateutil
+ plotly
+ nltk
+ Flask==2.2.2
+ werkzeug==2.2.3
+ fastapi
+ uvicorn[standard]
+ httpx
+ gradio>=4.0.0
+ # trafilatura
+ rich
+ numpy
+ pydantic
+ # playwright
+ psutil
+ beautifulsoup4
+ scikit-learn
+ python-multipart
+ aiofiles
santiment_frequency_controller.py ADDED
@@ -0,0 +1,118 @@
+ """
+ Santiment Frequency Controller
+ =============================
+
+ This module provides frequency control for Santiment API calls to preserve API limits.
+ It tracks execution frequency and limits runs to avoid exceeding API quotas.
+ """
+
+ import json
+ import os
+ from datetime import datetime, timedelta
+ from pathlib import Path
+
+
+ class SantimentFrequencyController:
+     """Controls the frequency of Santiment API calls to preserve API limits"""
+
+     def __init__(self, state_file: str = None):
+         """Initialize the frequency controller
+
+         Args:
+             state_file: Path to the state file. If None, uses default location.
+         """
+         if state_file is None:
+             # Try to find the state file in data/santiment directory
+             try:
+                 from src.config import DATA_DIR
+                 state_file = os.path.join(DATA_DIR, "santiment", "frequency_state.json")
+             except Exception:
+                 # Fallback to local directory
+                 state_file = "data/santiment/frequency_state.json"
+
+         self.state_file = Path(state_file)
+         self.state_file.parent.mkdir(parents=True, exist_ok=True)
+         self._load_state()
+
+     def _load_state(self):
+         """Load the current state from file"""
+         if self.state_file.exists():
+             try:
+                 with open(self.state_file, 'r') as f:
+                     self.state = json.load(f)
+             except Exception:
+                 self.state = {}
+         else:
+             self.state = {}
+
+         # Ensure required fields exist
+         if 'last_run' not in self.state:
+             self.state['last_run'] = None
+         if 'runs_today' not in self.state:
+             self.state['runs_today'] = 0
+         if 'date' not in self.state:
+             self.state['date'] = None
+
+     def _save_state(self):
+         """Save the current state to file"""
+         try:
+             with open(self.state_file, 'w') as f:
+                 json.dump(self.state, f, indent=2)
+         except Exception as e:
+             print(f"[WARN] Failed to save frequency state: {e}")
+
+     def should_run_santiment(self, max_runs_per_day: int = 2) -> bool:
+         """Check if Santiment should be allowed to run
+
+         Args:
+             max_runs_per_day: Maximum number of runs allowed per day
+
+         Returns:
+             True if Santiment should run, False otherwise
+         """
+         today = datetime.now().strftime("%Y-%m-%d")
+
+         # Reset counter if it's a new day
+         if self.state.get('date') != today:
+             self.state['date'] = today
+             self.state['runs_today'] = 0
+             self._save_state()
+
+         # Check if we've exceeded the daily limit
+         return self.state['runs_today'] < max_runs_per_day
+
+     def record_run(self):
+         """Record that Santiment has been run"""
+         today = datetime.now().strftime("%Y-%m-%d")
+         now = datetime.now().isoformat()
+
+         # Update state
+         self.state['last_run'] = now
+         self.state['date'] = today
+         self.state['runs_today'] = self.state.get('runs_today', 0) + 1
+
+         # Save state
+         self._save_state()
+
+         print(f"[SANTIMENT] Recorded run #{self.state['runs_today']} for {today}")
+
+     def get_status(self) -> dict:
+         """Get the current status of the frequency controller
+
+         Returns:
+             Dictionary with current status information
+         """
+         return {
+             'last_run': self.state.get('last_run'),
+             'runs_today': self.state.get('runs_today', 0),
+             'date': self.state.get('date'),
+             'state_file': str(self.state_file)
+         }
+
+     def reset_daily_count(self):
+         """Reset the daily run count (for testing or manual reset)"""
+         today = datetime.now().strftime("%Y-%m-%d")
+         self.state['date'] = today
+         self.state['runs_today'] = 0
117
+ self._save_state()
118
+ print(f"[SANTIMENT] Reset daily count for {today}")
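A minimal usage sketch of the controller above, assuming it is imported from the repo root; fetch_santiment is a hypothetical stand-in for whatever fetcher the scheduler actually invokes:

from santiment_frequency_controller import SantimentFrequencyController

controller = SantimentFrequencyController()  # state file lands under DATA_DIR/santiment when available

def maybe_run_santiment(fetch_santiment):
    # fetch_santiment: hypothetical zero-argument callable supplied by the scheduler
    if controller.should_run_santiment(max_runs_per_day=2):
        fetch_santiment()
        controller.record_run()
    else:
        print(f"[SANTIMENT] Skipping run, daily limit reached: {controller.get_status()}")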
scripts/push_hf_secrets.py ADDED
@@ -0,0 +1,186 @@
1
+ """
2
+ Push all variables from a .env file into a Hugging Face Space as secrets (or variables).
3
+
4
+ Requirements:
5
+ - huggingface_hub (Python SDK)
6
+ Install: pip install -U huggingface_hub
7
+
8
+ Usage examples:
9
+ python scripts/push_hf_secrets.py --repo your-username/your-space
10
+ python scripts/push_hf_secrets.py --repo your-username/your-space --env .env.production
11
+ python scripts/push_hf_secrets.py --repo your-username/your-space --dry-run
12
+ python scripts/push_hf_secrets.py --repo your-username/your-space --as-variables # send as public variables
13
+
14
+ Notes:
15
+ - This script is intentionally simple and cross-platform.
16
+ - It parses common .env formats (KEY=VALUE, supports quoted values and export prefix).
17
+ - It won’t print secret values; only key names are logged.
18
+ - "Secrets" are private; "Variables" are public. See: Settings → Secrets and variables
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import argparse
24
+ import os
25
+ import re
26
+ import sys
27
+ from typing import Dict, Tuple
28
+
29
+
30
+ ENV_LINE_RE = re.compile(r"^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*)\s*$")
31
+
32
+
33
+ def _unquote(value: str) -> str:
34
+ """Strip matching single or double quotes and unescape simple escapes for double quotes.
35
+
36
+ - If value is wrapped in double quotes, unescape common sequences (\\n, \\r, \\t, \\" , \\\\).
37
+ - If wrapped in single quotes, return inner content as-is (no escapes processing).
38
+ - Otherwise, return value trimmed of surrounding whitespace.
39
+ """
40
+ if len(value) >= 2 and value[0] == value[-1] and value[0] in ("'", '"'):
41
+ quote = value[0]
42
+ inner = value[1:-1]
43
+ if quote == '"':
44
+ # Process simple escape sequences
45
+ inner = (
46
+ inner.replace(r"\\n", "\n")
47
+ .replace(r"\\r", "\r")
48
+ .replace(r"\\t", "\t")
49
+ .replace(r"\\\"", '"')
50
+ .replace(r"\\\\", "\\")
51
+ )
52
+ return inner
53
+ return value.strip()
54
+
55
+
56
+ def parse_env_file(path: str) -> Dict[str, str]:
57
+ """Parse a .env-like file into a dict of {KEY: VALUE}.
58
+
59
+ Skips blank lines and comments (lines starting with #, ignoring leading whitespace).
60
+ Supports lines like:
61
+ - KEY=VALUE
62
+ - export KEY=VALUE
63
+ Values can be quoted with single or double quotes.
64
+ """
65
+ if not os.path.isfile(path):
66
+ raise FileNotFoundError(f".env file not found: {path}")
67
+
68
+ env: Dict[str, str] = {}
69
+ with open(path, "r", encoding="utf-8-sig") as f:
70
+ for idx, raw in enumerate(f, start=1):
71
+ line = raw.rstrip("\n\r")
72
+ stripped = line.strip()
73
+ if not stripped or stripped.startswith("#"):
74
+ continue
75
+
76
+ m = ENV_LINE_RE.match(line)
77
+ if not m:
78
+ # Non-fatal: skip lines that don't match KEY=VALUE
79
+ continue
80
+
81
+ key, raw_val = m.group(1), m.group(2).strip()
82
+
83
+ # If value is unquoted, do not strip inline comments aggressively to avoid breaking tokens.
84
+ value = _unquote(raw_val)
85
+ env[key] = value
86
+
87
+ return env
88
+
89
+
90
+ def get_hf_api():
91
+ """Return an authenticated HfApi client or None with a helpful error.
92
+
93
+ Uses locally saved token if you previously ran `huggingface-cli login` or
94
+ set HF_TOKEN environment variable.
95
+ """
96
+ try:
97
+ from huggingface_hub import HfApi
98
+ except Exception:
99
+ sys.stderr.write(
100
+ "huggingface_hub is not installed. Install with: pip install -U huggingface_hub\n"
101
+ )
102
+ return None
103
+ return HfApi()
104
+
105
+ def set_secret(api, repo: str, key: str, value: str, dry_run: bool = False) -> int:
106
+ if dry_run:
107
+ print(f"[DRY RUN] Set secret: {key} -> (hidden) on {repo}")
108
+ return 0
109
+ try:
110
+ api.add_space_secret(repo_id=repo, key=key, value=value)
111
+ print(f"Set secret: {key}")
112
+ return 0
113
+ except Exception as e:
114
+ sys.stderr.write(f"Error setting secret {key!r} for repo {repo!r}: {e}\n")
115
+ return 1
116
+
117
+
118
+ def set_variable(api, repo: str, key: str, value: str, dry_run: bool = False) -> int:
119
+ if dry_run:
120
+ print(f"[DRY RUN] Set variable: {key} -> (hidden) on {repo}")
121
+ return 0
122
+ try:
123
+ api.add_space_variable(repo_id=repo, key=key, value=value)
124
+ print(f"Set variable: {key}")
125
+ return 0
126
+ except Exception as e:
127
+ sys.stderr.write(f"Error setting variable {key!r} for repo {repo!r}: {e}\n")
128
+ return 1
129
+
130
+
131
+ def main(argv: list[str] | None = None) -> int:
132
+ parser = argparse.ArgumentParser(description="Push .env variables to a Hugging Face Space as secrets or variables.")
133
+ parser.add_argument("--repo", required=True, help="Space repo id, e.g. your-username/your-space")
134
+ parser.add_argument("--env", default=".env", help="Path to .env file (default: .env)")
135
+ parser.add_argument("--dry-run", action="store_true", help="Print what would be set without applying changes")
136
+ parser.add_argument(
137
+ "--as-variables",
138
+ action="store_true",
139
+ help="Send entries as public variables instead of private secrets",
140
+ )
141
+ parser.add_argument(
142
+ "--exclude",
143
+ action="append",
144
+ default=[],
145
+ help="Key(s) to exclude (can be repeated)",
146
+ )
147
+ args = parser.parse_args(argv)
148
+
149
+ api = get_hf_api()
150
+ if api is None:
151
+ return 127
152
+
153
+ try:
154
+ env_map = parse_env_file(args.env)
155
+ except Exception as e:
156
+ sys.stderr.write(f"Failed to read env file {args.env}: {e}\n")
157
+ return 2
158
+
159
+ if not env_map:
160
+ print("No variables found in .env; nothing to do.")
161
+ return 0
162
+
163
+ excluded = set(args.exclude or [])
164
+ total = 0
165
+ failures = 0
166
+ for key, value in env_map.items():
167
+ if key in excluded:
168
+ continue
169
+ total += 1
170
+ if args.as_variables:
171
+ rc = set_variable(api, args.repo, key, value, args.dry_run)
172
+ else:
173
+ rc = set_secret(api, args.repo, key, value, args.dry_run)
174
+ if rc != 0:
175
+ failures += 1
176
+
177
+ if failures:
178
+ sys.stderr.write(f"Completed with {failures}/{total} failures.\n")
179
+ return 1
180
+
181
+ print(f"Completed: {total} secrets {'validated' if args.dry_run else 'set'} for {args.repo}.")
182
+ return 0
183
+
184
+
185
+ if __name__ == "__main__":
186
+ raise SystemExit(main())
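An illustrative sketch of how parse_env_file treats quoting, comments, and the export prefix; the key names are placeholders, and the sys.path tweak is an assumption so the script is importable when run from the repo root:

import os, sys, tempfile
sys.path.insert(0, "scripts")          # assumption: makes push_hf_secrets importable from the repo root
from push_hf_secrets import parse_env_file

sample = 'export FILEBASE_BUCKET=my-bucket\n# a comment\nFILEBASE_ACCESS_KEY="abc 123"\nEMPTY=\n'
with tempfile.NamedTemporaryFile("w", suffix=".env", delete=False) as tmp:
    tmp.write(sample)
print(parse_env_file(tmp.name))
# expected: {'FILEBASE_BUCKET': 'my-bucket', 'FILEBASE_ACCESS_KEY': 'abc 123', 'EMPTY': ''}
os.unlink(tmp.name)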
src/api/gradio_main.py ADDED
@@ -0,0 +1,265 @@
1
+ import gradio as gr
2
+ import json
3
+ import os
4
+ import sys
5
+ import logging
6
+ import pandas as pd
7
+ import time
8
+ from datetime import datetime, timedelta
9
+ import psutil
10
+ from pathlib import Path
11
+
12
+ # Add src to Python path for imports
13
+ sys.path.insert(0, '/app/src')
14
+ sys.path.insert(0, '/app')
15
+
16
+ # Configure logging
17
+ logging.basicConfig(
18
+ level=logging.INFO,
19
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
20
+ handlers=[logging.StreamHandler(sys.stdout)]
21
+ )
22
+ logger = logging.getLogger(__name__)
23
+
24
+ def get_health_status():
25
+ """Get basic health status"""
26
+ try:
27
+ # Get process info
28
+ process = psutil.Process()
29
+ memory_mb = process.memory_info().rss / 1024 / 1024
30
+ cpu_percent = process.cpu_percent()
31
+
32
+ # Get system info
33
+ memory = psutil.virtual_memory()
34
+ disk = psutil.disk_usage('/')
35
+
36
+ # Check scheduler status
37
+ scheduler_running = False
38
+ last_run_time = "Unknown"
39
+ try:
40
+ last_run_file = "/app/deployment/last_run.txt"
41
+ if os.path.exists(last_run_file):
42
+ with open(last_run_file, 'r') as f:
43
+ last_run_str = f.read().strip()
44
+ last_run = datetime.strptime(last_run_str, '%Y-%m-%d %H:%M:%S')
45
+ time_since_last_run = (datetime.now() - last_run).total_seconds()
46
+ scheduler_running = time_since_last_run < 2700 # 45 minutes
47
+ last_run_time = last_run_str
48
+ except Exception as e:
49
+ logger.warning(f"Could not check scheduler status: {e}")
50
+
51
+ return {
52
+ "status": "healthy" if memory_mb < 400 else "warning",
53
+ "timestamp": datetime.now().isoformat(),
54
+ "process_memory_mb": round(memory_mb, 2),
55
+ "process_cpu_percent": round(cpu_percent, 2),
56
+ "system_memory_percent": round(memory.percent, 1),
57
+ "system_memory_available_gb": round(memory.available / (1024**3), 2),
58
+ "disk_free_gb": round(disk.free / (1024**3), 2),
59
+ "scheduler_running": scheduler_running,
60
+ "scheduler_last_run": last_run_time
61
+ }
62
+ except Exception as e:
63
+ logger.error(f"Health check failed: {e}")
64
+ return {
65
+ "status": "error",
66
+ "error": str(e),
67
+ "timestamp": datetime.now().isoformat()
68
+ }
69
+
70
+ def get_pipeline_status():
71
+ """Get data pipeline status"""
72
+ try:
73
+ data_dirs = [
74
+ "/data/merged/features",
75
+ "/data/merged/train",
76
+ "/data/alpaca",
77
+ "/data/advisorai-data"
78
+ ]
79
+
80
+ recent_files = 0
81
+ total_size = 0
82
+
83
+ for data_dir in data_dirs:
84
+ if os.path.exists(data_dir):
85
+ for root, dirs, files in os.walk(data_dir):
86
+ for file in files:
87
+ if file.endswith(('.json', '.parquet', '.csv')):
88
+ file_path = os.path.join(root, file)
89
+ try:
90
+ stat = os.stat(file_path)
91
+ # Count files modified in last 24 hours
92
+ if time.time() - stat.st_mtime < 86400:
93
+ recent_files += 1
94
+ total_size += stat.st_size
95
+ except Exception:
96
+ continue
97
+
98
+ return {
99
+ "status": "running" if recent_files > 0 else "stale",
100
+ "recent_files_24h": recent_files,
101
+ "total_data_size_gb": round(total_size / (1024**3), 2),
102
+ "last_check": datetime.now().isoformat()
103
+ }
104
+ except Exception as e:
105
+ logger.error(f"Pipeline status check failed: {e}")
106
+ return {
107
+ "status": "error",
108
+ "error": str(e),
109
+ "last_check": datetime.now().isoformat()
110
+ }
111
+
112
+ def get_recent_files():
113
+ """Get list of recent files in the data directories"""
114
+ try:
115
+ base_paths = [
116
+ "/data/merged/features",
117
+ "/data/merged/train",
118
+ "/data/alpaca",
119
+ "/data/advisorai-data/features"
120
+ ]
121
+
122
+ recent_files = []
123
+ for base_path in base_paths:
124
+ if os.path.exists(base_path):
125
+ for root, dirs, files in os.walk(base_path):
126
+ for file in files[:10]: # Limit to 10 files per directory
127
+ file_path = os.path.join(root, file)
128
+ try:
129
+ stat = os.stat(file_path)
130
+ recent_files.append({
131
+ "File": file,
132
+ "Path": file_path.replace("/data/", ""),
133
+ "Size": f"{stat.st_size / (1024**2):.2f} MB",
134
+ "Modified": datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M")
135
+ })
136
+ except Exception:
137
+ continue
138
+
139
+ # Sort by modification time and take most recent 20
140
+ recent_files.sort(key=lambda x: x["Modified"], reverse=True)
141
+ return recent_files[:20]
142
+
143
+ except Exception as e:
144
+ logger.error(f"Error getting recent files: {e}")
145
+ return [{"Error": str(e)}]
146
+
147
+ def get_logs():
148
+ """Get recent log entries"""
149
+ try:
150
+ log_files = [
151
+ "/data/logs/scheduler.log",
152
+ "/data/logs/data_pipeline.log",
153
+ "/data/logs/monitor.log"
154
+ ]
155
+
156
+ logs = []
157
+ for log_file in log_files:
158
+ if os.path.exists(log_file):
159
+ try:
160
+ with open(log_file, 'r', encoding='utf-8') as f:
161
+ lines = f.readlines()
162
+ # Get last 10 lines
163
+ recent_lines = lines[-10:] if len(lines) > 10 else lines
164
+ logs.append(f"=== {os.path.basename(log_file)} ===\n")
165
+ logs.extend(recent_lines)
166
+ logs.append("\n")
167
+ except Exception as e:
168
+ logs.append(f"Error reading {log_file}: {str(e)}\n")
169
+
170
+ return "".join(logs) if logs else "No log files found"
171
+
172
+ except Exception as e:
173
+ logger.error(f"Error getting logs: {e}")
174
+ return f"Error getting logs: {str(e)}"
175
+
176
+ # Create Gradio interface
177
+ with gr.Blocks(title="AdvisorAI Data Pipeline Monitor", theme=gr.themes.Soft()) as app:
178
+ gr.Markdown("# 🤖 AdvisorAI Data Pipeline Monitor")
179
+ gr.Markdown("Real-time monitoring of the AdvisorAI data collection and processing pipeline")
180
+
181
+ with gr.Tabs():
182
+ with gr.TabItem("📊 Dashboard"):
183
+ with gr.Row():
184
+ with gr.Column():
185
+ gr.Markdown("### Health Status")
186
+ health_display = gr.JSON(label="System Health & Status")
187
+
188
+ with gr.Column():
189
+ gr.Markdown("### Pipeline Status")
190
+ pipeline_display = gr.JSON(label="Data Pipeline Status")
191
+
192
+ with gr.Row():
193
+ refresh_btn = gr.Button("🔄 Refresh", variant="primary")
194
+
195
+ with gr.TabItem("📁 Recent Files"):
196
+ gr.Markdown("### Recently Modified Data Files")
197
+ files_display = gr.Dataframe(
198
+ headers=["File", "Path", "Size", "Modified"],
199
+ datatype=["str", "str", "str", "str"],
200
+ label="Recent Files"
201
+ )
202
+ refresh_files_btn = gr.Button("🔄 Refresh Files")
203
+
204
+ with gr.TabItem("📝 Logs"):
205
+ gr.Markdown("### Recent Log Entries")
206
+ logs_display = gr.Textbox(
207
+ label="Recent Logs",
208
+ lines=20,
209
+ max_lines=30,
210
+ show_copy_button=True
211
+ )
212
+ refresh_logs_btn = gr.Button("🔄 Refresh Logs")
213
+
214
+ # Event handlers
215
+ def refresh_dashboard():
216
+ health = get_health_status()
217
+ pipeline = get_pipeline_status()
218
+ return json.dumps(health, indent=2), json.dumps(pipeline, indent=2)
219
+
220
+ def refresh_files():
221
+ files = get_recent_files()
222
+ if files and isinstance(files[0], dict) and "Error" not in files[0]:
223
+ return [[f["File"], f["Path"], f["Size"], f["Modified"]] for f in files]
224
+ else:
225
+ return [["Error", str(files), "", ""]]
226
+
227
+ def refresh_logs():
228
+ return get_logs()
229
+
230
+ # Connect event handlers
231
+ refresh_btn.click(
232
+ refresh_dashboard,
233
+ outputs=[health_display, pipeline_display]
234
+ )
235
+
236
+ refresh_files_btn.click(
237
+ refresh_files,
238
+ outputs=[files_display]
239
+ )
240
+
241
+ refresh_logs_btn.click(
242
+ refresh_logs,
243
+ outputs=[logs_display]
244
+ )
245
+
246
+ # Auto-refresh on load
247
+ app.load(
248
+ refresh_dashboard,
249
+ outputs=[health_display, pipeline_display]
250
+ )
251
+
252
+ app.load(
253
+ refresh_files,
254
+ outputs=[files_display]
255
+ )
256
+
257
+ if __name__ == "__main__":
258
+ logger.info("Starting Gradio app...")
259
+ app.launch(
260
+ server_name="0.0.0.0",
261
+ server_port=7860,
262
+ share=False,
263
+ show_error=True,
264
+ quiet=False
265
+ )
src/api/main.py ADDED
@@ -0,0 +1,114 @@
1
+ from fastapi import FastAPI, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.responses import JSONResponse, HTMLResponse
4
+ import uvicorn
5
+ import logging
6
+ import sys
7
+ from src.api.routes.health import health_status
8
+ from src.api.routes.isrunning import is_running
9
+
10
+ # Configure logging
11
+ logging.basicConfig(
12
+ level=logging.INFO,
13
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
14
+ handlers=[
15
+ logging.StreamHandler(sys.stdout)
16
+ ]
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ app = FastAPI(
22
+ title="AdvisorAI Data API",
23
+ description="API for AdvisorAI data pipeline and health monitoring",
24
+ version="1.0.0"
25
+ )
26
+
27
+ # Add CORS middleware
28
+ app.add_middleware(
29
+ CORSMiddleware,
30
+ allow_origins=["*"],
31
+ allow_credentials=True,
32
+ allow_methods=["*"],
33
+ allow_headers=["*"],
34
+ )
35
+
36
+ @app.exception_handler(Exception)
37
+ async def global_exception_handler(request, exc):
38
+ logger.error(f"Global exception handler caught: {exc}", exc_info=True)
39
+ return JSONResponse(
40
+ status_code=500,
41
+ content={"detail": "Internal server error", "error": str(exc)}
42
+ )
43
+
44
+ @app.get('/health')
45
+ def health():
46
+ """Enhanced health check endpoint"""
47
+ try:
48
+ return health_status()
49
+ except Exception as e:
50
+ logger.error(f"Health check failed: {e}", exc_info=True)
51
+ raise HTTPException(status_code=500, detail=f"Health check failed: {str(e)}")
52
+
53
+ # Route to check if there are any JSON files under data/merged/features (relative path)
54
+ @app.get('/status')
55
+ def status():
56
+ """Check if the data pipeline is running and has recent data"""
57
+ try:
58
+ return is_running()
59
+ except Exception as e:
60
+ logger.error(f"Status check failed: {e}", exc_info=True)
61
+ raise HTTPException(status_code=500, detail=f"Status check failed: {str(e)}")
62
+
63
+ @app.get('/', response_class=HTMLResponse)
64
+ def root():
65
+ """Root endpoint returns simple HTML so HF Spaces iframe can render it."""
66
+ html = """
67
+ <!doctype html>
68
+ <html lang="en">
69
+ <head>
70
+ <meta charset="utf-8">
71
+ <meta name="viewport" content="width=device-width, initial-scale=1">
72
+ <title>AdvisorAI Data API</title>
73
+ <style>
74
+ body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; padding: 24px; }
75
+ code { background: #f5f5f5; padding: 2px 4px; border-radius: 4px; }
76
+ .links a { margin-right: 12px; }
77
+ </style>
78
+ </head>
79
+ <body>
80
+ <h1>AdvisorAI Data API</h1>
81
+ <p>Service is running.</p>
82
+ <div class="links">
83
+ <a href="/health">/health</a>
84
+ <a href="/status">/status</a>
85
+ <a href="/api">/api (JSON)</a>
86
+ </div>
87
+ </body>
88
+ </html>
89
+ """
90
+ return HTMLResponse(content=html, status_code=200)
91
+
92
+ @app.get('/api')
93
+ def api_root():
94
+ """JSON root for programmatic clients."""
95
+ return {
96
+ "message": "AdvisorAI Data API",
97
+ "version": "1.0.0",
98
+ "endpoints": {
99
+ "/health": "Health check with system metrics",
100
+ "/status": "Data pipeline status",
101
+ "/api": "This JSON endpoint",
102
+ "/": "HTML landing page for Spaces"
103
+ }
104
+ }
105
+
106
+ if __name__ == "__main__":
107
+ uvicorn.run(
108
+ "src.api.main:app",
109
+ host="0.0.0.0",
110
+ port=10000,
111
+ workers=1,
112
+ timeout_keep_alive=30,
113
+ access_log=True
114
+ )
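A quick smoke test of the JSON endpoints, assuming the API is already running locally on port 10000 (as in the __main__ block above); httpx is already listed in requirements.txt:

import httpx

base = "http://localhost:10000"
for path in ("/health", "/status", "/api"):
    resp = httpx.get(base + path, timeout=10)
    print(path, resp.status_code, resp.json())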
src/api/routes/health.py ADDED
@@ -0,0 +1,67 @@
1
+ import os
2
+ import psutil
3
+ import time
4
+ from datetime import datetime
5
+ from src.config import DATA_DIR, LAST_RUN_PATH
6
+
7
+ def health_status():
8
+ """Enhanced health check that monitors actual service health"""
9
+ try:
10
+ # Check memory usage
11
+ process = psutil.Process()
12
+ memory_mb = process.memory_info().rss / 1024 / 1024
13
+ cpu_percent = process.cpu_percent()
14
+
15
+ # Check if scheduler is running
16
+ scheduler_running = False
17
+ try:
18
+ with open(LAST_RUN_PATH, 'r') as f:
19
+ last_run_str = f.read().strip()
20
+ last_run = datetime.strptime(last_run_str, '%Y-%m-%d %H:%M:%S')
21
+ # Consider scheduler healthy if it ran within last 45 minutes
22
+ time_since_last_run = (datetime.now() - last_run).total_seconds()
23
+ scheduler_running = time_since_last_run < 2700 # 45 minutes
24
+ except Exception:
25
+ scheduler_running = False
26
+
27
+ # Check disk space (prefer DATA_DIR)
28
+ disk_usage = psutil.disk_usage(DATA_DIR if os.path.exists(DATA_DIR) else '/')
29
+ disk_free_gb = disk_usage.free / (1024**3)
30
+
31
+ # Determine overall health
32
+ health_issues = []
33
+ # Memory checks
34
+ if memory_mb > 1024: # More than 1GB
35
+ health_issues.append(f"High memory usage: {memory_mb:.1f}MB (over 1GB)")
36
+ elif memory_mb > 512: # More than 512MB for free plan
37
+ health_issues.append(f"High memory usage: {memory_mb:.1f}MB (over 512MB)")
38
+
39
+ if cpu_percent > 80:
40
+ health_issues.append(f"High CPU usage: {cpu_percent:.1f}%")
41
+
42
+ if disk_free_gb < 1: # Less than 1GB free
43
+ health_issues.append(f"Low disk space: {disk_free_gb:.1f}GB free")
44
+
45
+ if not scheduler_running:
46
+ health_issues.append("Scheduler not running or stale")
47
+
48
+ status = "healthy" if not health_issues else "degraded"
49
+
50
+ return {
51
+ "status": status,
52
+ "timestamp": datetime.now().isoformat(),
53
+ "metrics": {
54
+ "memory_mb": round(memory_mb, 1),
55
+ "cpu_percent": round(cpu_percent, 1),
56
+ "disk_free_gb": round(disk_free_gb, 1),
57
+ "scheduler_running": scheduler_running
58
+ },
59
+ "issues": health_issues
60
+ }
61
+
62
+ except Exception as e:
63
+ return {
64
+ "status": "error",
65
+ "timestamp": datetime.now().isoformat(),
66
+ "error": str(e)
67
+ }
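Because health_status() is a plain function, it can be sanity-checked without starting uvicorn, assuming psutil is installed and the repo root is on the import path:

from src.api.routes.health import health_status

report = health_status()
print(report["status"], report.get("issues", []), report.get("metrics"))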
src/api/routes/isrunning.py ADDED
@@ -0,0 +1,34 @@
import os
from datetime import datetime
from fastapi import APIRouter

from ... import config as app_config

router = APIRouter()


@router.get("/status")
def is_running():
    """Return a small status dict: whether pipeline appears to be running and last run time."""
    json_folder = os.path.join(app_config.DATA_DIR, 'merged', 'features')
    has_json = False
    if os.path.exists(json_folder):
        try:
            has_json = any(f.endswith('.json') for f in os.listdir(json_folder))
        except Exception:
            has_json = False

    last_run_file = app_config.LAST_RUN_PATH
    last_run_display = 'Unknown'
    try:
        if os.path.exists(last_run_file):
            with open(last_run_file, 'r') as f:
                last_run_str = f.read().strip()
            last_run_dt = datetime.strptime(last_run_str, '%Y-%m-%d %H:%M:%S')
            minutes_ago = int((datetime.now() - last_run_dt).total_seconds() // 60)
            last_run_display = f"{minutes_ago} minutes ago"
    except Exception:
        last_run_display = 'Unknown'

    # JSON output under merged/features means the pipeline is producing data
    status = "Running" if has_json else "Not Running"
    return {"status": status, "last_run": last_run_display}
src/config.py ADDED
@@ -0,0 +1,66 @@
1
+ import os
2
+ import tempfile
3
+
4
+
5
+ def _is_writable(path: str) -> bool:
6
+ try:
7
+ if not os.path.exists(path):
8
+ os.makedirs(path, exist_ok=True)
9
+ test_fd, test_path = tempfile.mkstemp(prefix='.wtest_', dir=path)
10
+ os.close(test_fd)
11
+ os.unlink(test_path)
12
+ return True
13
+ except Exception:
14
+ return False
15
+
16
+
17
+ def _detect_data_dir() -> str:
18
+ # 1) Respect DATA_DIR env only if writable
19
+ env = os.getenv('DATA_DIR')
20
+ if env and _is_writable(env):
21
+ return env
22
+ # 2) Prefer /data if writable (Spaces)
23
+ if _is_writable('/data'):
24
+ return '/data'
25
+ # 3) Local dev fallback: /app/data if writable
26
+ if _is_writable('/app/data'):
27
+ return '/app/data'
28
+ # 4) Final fallback: /tmp
29
+ return '/tmp'
30
+
31
+
32
+ DATA_DIR = _detect_data_dir()
33
+
34
+ # Logs: prefer DATA_DIR/logs, fallback to /tmp/logs
35
+ _preferred_logs = os.getenv('LOG_DIR') or os.path.join(DATA_DIR, 'logs')
36
+ try:
37
+ os.makedirs(_preferred_logs, exist_ok=True)
38
+ # sanity: try to write
39
+ if not _is_writable(_preferred_logs):
40
+ raise PermissionError("Log dir not writable")
41
+ except Exception:
42
+ _preferred_logs = '/tmp/logs'
43
+ os.makedirs(_preferred_logs, exist_ok=True)
44
+
45
+ LOG_DIR = _preferred_logs
46
+
47
+ # Path for scheduler's last_run marker
48
+ def _compute_last_run_path(base_dir: str) -> str:
49
+ candidates = [
50
+ os.path.join(base_dir, 'deployment', 'last_run.txt'),
51
+ os.path.join(base_dir, 'last_run.txt'),
52
+ '/tmp/last_run.txt',
53
+ ]
54
+ for p in candidates:
55
+ try:
56
+ os.makedirs(os.path.dirname(p), exist_ok=True)
57
+ # test write
58
+ with open(p, 'a'):
59
+ pass
60
+ return p
61
+ except Exception:
62
+ continue
63
+ return '/tmp/last_run.txt'
64
+
65
+
66
+ LAST_RUN_PATH = _compute_last_run_path(DATA_DIR)
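A small sketch of the fallback order, assuming the repo root is on the import path; the env value has to be set before the first import because the paths are resolved at import time, and a non-writable DATA_DIR should fall through to /data, then /app/data, then /tmp:

import os
os.environ["DATA_DIR"] = "/definitely/not/writable"   # hypothetical path, used only to force the fallback
from src import config

print(config.DATA_DIR, config.LOG_DIR, config.LAST_RUN_PATH)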
src/data_cloud/cloud_utils.py ADDED
@@ -0,0 +1,163 @@
1
+ """
2
+ cloud_utils.py – Unified utilities for HTTP fetch and cloud/local storage operations.
3
+
4
+ Provides:
5
+ • fetch_content / fetch_json for HTTP GET
6
+ • StorageHandler class with upload/download and fallback to local filesystem
7
+ - Methods set self.last_mode to 'cloud' or 'local'
8
+ - Local files are stored under a base directory
9
+
10
+ Usage:
11
+ from cloud_utils import StorageHandler, fetch_json
12
+
13
+ Requirements:
14
+ • boto3 and botocore
15
+ • requests
16
+ • ENV vars for cloud credentials (e.g. FILEBASE_*)
17
+ """
18
+ import os
19
+ import errno
20
+ import requests
21
+ import boto3
22
+ from botocore.config import Config
23
+ from botocore.exceptions import BotoCoreError, ClientError
24
+
25
+ # HTTP Fetch utilities ---------------------------------------------------------
26
+ def fetch_content(url, headers=None, timeout=15):
27
+ """Fetch binary content via HTTP GET."""
28
+ resp = requests.get(url, headers=headers, timeout=timeout, stream=False)
29
+ resp.raise_for_status()
30
+ return resp.content
31
+
32
+ def fetch_json(url, headers=None, timeout=15):
33
+ """Fetch JSON data via HTTP GET."""
34
+ resp = requests.get(url, headers=headers, timeout=timeout)
35
+ resp.raise_for_status()
36
+ data = resp.json()
37
+ return data.get("data", data) if isinstance(data, dict) else data
38
+
39
+ def fetch_text(url, headers=None, timeout=15, encoding='utf-8'):
40
+ """Fetch text content via HTTP GET."""
41
+ resp = requests.get(url, headers=headers, timeout=timeout)
42
+ resp.raise_for_status()
43
+ resp.encoding = encoding
44
+ return resp.text
45
+
46
+ # Storage Handler ---------------------------------------------------------------
47
+ class StorageHandler:
48
+ def list_prefix(self, prefix):
49
+ """List all object keys in the given S3 prefix. Returns a list of keys. Local fallback returns empty list."""
50
+ if self.s3 and self.bucket:
51
+ paginator = self.s3.get_paginator('list_objects_v2')
52
+ keys = []
53
+ for page in paginator.paginate(Bucket=self.bucket, Prefix=prefix):
54
+ for obj in page.get('Contents', []):
55
+ keys.append(obj['Key'])
56
+ return keys
57
+ # Local fallback: not implemented (could walk local filesystem if needed)
58
+ return []
59
+ def __init__(self, endpoint_url, access_key, secret_key, bucket_name, local_base="data"):
60
+ """
61
+ Initialize cloud storage client and local base path.
62
+ endpoint_url: S3-compatible endpoint URL
63
+ bucket_name: target bucket name (if None/empty, operate in local-only mode)
64
+ local_base: directory prefix for local fallback files
65
+ """
66
+ self.bucket = bucket_name
67
+ self.local_base = local_base.rstrip(os.sep)
68
+ self.last_mode = None # 'cloud' or 'local'
69
+ if bucket_name:
70
+ # boto3 client config
71
+ cfg = Config(signature_version="s3v4", s3={"addressing_style": "path"})
72
+ self.s3 = boto3.client(
73
+ "s3",
74
+ endpoint_url=endpoint_url,
75
+ aws_access_key_id=access_key,
76
+ aws_secret_access_key=secret_key,
77
+ config=cfg,
78
+ region_name='us-east-1'
79
+ )
80
+ else:
81
+ self.s3 = None
82
+
83
+ def _ensure_local_dir(self, key):
84
+ path = os.path.join(self.local_base, key)
85
+ os.makedirs(os.path.dirname(path), exist_ok=True)
86
+ return path
87
+
88
+ def download(self, key):
89
+ """Download object by key. Returns bytes, sets last_mode. Raises FileNotFoundError if not found."""
90
+ if self.s3 and self.bucket:
91
+ try:
92
+ resp = self.s3.get_object(Bucket=self.bucket, Key=key)
93
+ data = resp['Body'].read()
94
+ self.last_mode = 'cloud'
95
+ print(f"[OK] Downloaded {key} from s3://{self.bucket}/{key}")
96
+ return data
97
+ except (ClientError, BotoCoreError) as e:
98
+ print(f"[WARN] Could not download {key} from S3: {e}")
99
+ # Always fallback to local if S3 is not configured or download fails
100
+ local_path = self._ensure_local_dir(key)
101
+ try:
102
+ with open(local_path, 'rb') as f:
103
+ data = f.read()
104
+ self.last_mode = 'local'
105
+ print(f"[FALLBACK] Loaded {key} from local {local_path}")
106
+ return data
107
+ except FileNotFoundError:
108
+ print(f"[ERROR] {key} not found in S3 or locally at {local_path}")
109
+ raise
110
+
111
+ def upload(self, key, data, content_type='application/octet-stream'):
112
+ """Upload bytes to cloud, fallback to local. Sets last_mode. Returns True if cloud, False if local."""
113
+ if self.s3 and self.bucket:
114
+ try:
115
+ self.s3.put_object(Bucket=self.bucket, Key=key, Body=data, ContentType=content_type)
116
+ self.last_mode = 'cloud'
117
+ print(f"[OK] Uploaded {key} -> s3://{self.bucket}/{key}")
118
+ return True
119
+ except (ClientError, BotoCoreError) as e:
120
+ print(f"[ERROR] Failed uploading {key}: {e}")
121
+ # Always fallback to local if S3 is not configured or upload fails
122
+ local_path = self._ensure_local_dir(key)
123
+ with open(local_path, 'wb') as f:
124
+ f.write(data)
125
+ self.last_mode = 'local'
126
+ print(f"[FALLBACK] Saved {key} locally -> {local_path}")
127
+ return False
128
+
129
+ def exists(self, key):
130
+ """Check for existence of object. Returns True if found in cloud or local."""
131
+ if self.s3 and self.bucket:
132
+ try:
133
+ self.s3.head_object(Bucket=self.bucket, Key=key)
134
+ return True
135
+ except (ClientError, BotoCoreError):
136
+ pass
137
+ local_path = os.path.join(self.local_base, key)
138
+ return os.path.exists(local_path)
139
+
140
+ def delete(self, key):
141
+ """Delete object in cloud or local fallback."""
142
+ if self.s3 and self.bucket:
143
+ try:
144
+ self.s3.delete_object(Bucket=self.bucket, Key=key)
145
+ self.last_mode = 'cloud'
146
+ print(f"[OK] Deleted {key} from s3://{self.bucket}/{key}")
147
+ return
148
+ except Exception:
149
+ pass
150
+ local_path = os.path.join(self.local_base, key)
151
+ try:
152
+ os.remove(local_path)
153
+ self.last_mode = 'local'
154
+ print(f"[FALLBACK] Deleted {key} locally -> {local_path}")
155
+ except OSError as e:
156
+ if e.errno != errno.ENOENT:
157
+ raise
158
+
159
+ def get_last_mode(self):
160
+ """Return 'cloud' or 'local' depending on last operation."""
161
+ return self.last_mode
162
+
163
+ # End of cloud_utils.py
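A minimal round-trip sketch of the handler the fetchers below rely on; credentials come from the FILEBASE_* variables, and with no bucket configured it stays in local mode under data/. The import path is an assumption, mirroring how the fetcher below puts src on sys.path:

import os
from src.data_cloud.cloud_utils import StorageHandler

storage = StorageHandler(
    endpoint_url=os.getenv("FILEBASE_ENDPOINT"),
    access_key=os.getenv("FILEBASE_ACCESS_KEY"),
    secret_key=os.getenv("FILEBASE_SECRET_KEY"),
    bucket_name=os.getenv("FILEBASE_BUCKET"),   # None/empty -> local-only mode
    local_base="data",
)
storage.upload("advisorai-data/tmp/hello.txt", b"hello", content_type="text/plain")
print(storage.get_last_mode())                  # 'cloud' or 'local'
print(storage.download("advisorai-data/tmp/hello.txt"))
storage.delete("advisorai-data/tmp/hello.txt")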
src/fetchers/advisorai_data/advisorai_data_fetcher.py ADDED
@@ -0,0 +1,226 @@
1
+ """
2
+ advisorai_data_fetcher.py – Fetches feature files from AdvisorAI Data API and MongoDB,
3
+ then uploads them to Filebase S3 instead of local storage.
4
+
5
+ ✱ 2025-07-11 – switched backend from local filesystem to Filebase S3
6
+ • Uses boto3 against FILEBASE_ENDPOINT
7
+ • No local disk writes; everything streams directly to S3
8
+
9
+ Requirements:
10
+ • FILEBASE_ENDPOINT env var, e.g. https://s3.filebase.com
11
+ • FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY env vars
12
+ • FILEBASE_BUCKET env var (your bucket name)
13
+ • ADVISORAI_data_API_URL and ADVISORAI_data_API_KEY env vars for the Data API
14
+ • MONGODB_URI, MONGODB_DATABASE, MONGODB_COLLECTION_FEATURES env vars for archive fetch
15
+ """
16
+
17
+ import os
18
+ import sys
19
+ import requests
20
+ import asyncio
21
+ from io import BytesIO
22
+
23
+ from dotenv import load_dotenv
24
+ import pandas as pd
25
+ # from pymongo import MongoClient
26
+
27
+
28
+ # Ensure src is in sys.path for direct script execution
29
+ import sys
30
+ import os
31
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
32
+ from data_cloud.cloud_utils import StorageHandler
33
+
34
+ # ─── Configuration ────────────────────────────────────────────────────────────
35
+ load_dotenv()
36
+
37
+ # AdvisorAI Data API
38
+ API_BASE_URL = os.getenv("ADVISORAI_data_API_URL", "http://localhost:8000")
39
+ API_KEY = os.getenv("ADVISORAI_data_API_KEY")
40
+ if not API_KEY:
41
+ print("[ERROR] ADVISORAI_data_API_KEY must be set")
42
+ sys.exit(1)
43
+ HEADERS = {"Authorization": f"Bearer {API_KEY}"}
44
+
45
+ # MongoDB for archive features
46
+ MONGODB_URI = os.getenv("MONGODB_URI", "mongodb://localhost:27017")
47
+ MONGODB_DATABASE = os.getenv("MONGODB_DATABASE", "AdvisorAI")
48
+ MONGODB_COLLECTION_FEATURES = os.getenv("MONGODB_COLLECTION_FEATURES", "arch_features")
49
+
50
+ # Filebase S3 credentials
51
+ FILEBASE_ENDPOINT = os.getenv("FILEBASE_ENDPOINT")
52
+ FILEBASE_ACCESS_KEY = os.getenv("FILEBASE_ACCESS_KEY")
53
+ FILEBASE_SECRET_KEY = os.getenv("FILEBASE_SECRET_KEY")
54
+ FILEBASE_BUCKET = os.getenv("FILEBASE_BUCKET")
55
+ if not all([FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, FILEBASE_BUCKET]):
56
+ print("[ERROR] FILEBASE_ENDPOINT, FILEBASE_ACCESS_KEY, FILEBASE_SECRET_KEY, and FILEBASE_BUCKET must be set")
57
+ sys.exit(1)
58
+
59
+
60
+
61
+ # ─── Fetch and upload functions ───────────────────────────────────────────────
62
+
63
+ def fetch_and_upload_latest_parquet(storage):
64
+ """Fetch latest Parquet from API and upload to S3 bucket at features/latest_features.parquet"""
65
+ url = f"{API_BASE_URL}/features/latest"
66
+ resp = requests.get(url, headers=HEADERS, stream=True)
67
+ resp.raise_for_status()
68
+ data = resp.content
69
+ key = "advisorai-data/features/latest_features.parquet"
70
+ try:
71
+ storage.upload(key, data, content_type="application/octet-stream")
72
+ print(f"[OK] Uploaded latest_features.parquet -> {storage.get_last_mode()}:{key}")
73
+ # Also save locally
74
+ local_path = os.path.join("data", key)
75
+ os.makedirs(os.path.dirname(local_path), exist_ok=True)
76
+ with open(local_path, "wb") as f:
77
+ f.write(data)
78
+ print(f"[OK] Saved locally: {local_path}")
79
+ except Exception as e:
80
+ print(f"[ERROR] Failed uploading latest_features.parquet: {e}", file=sys.stderr)
81
+
82
+ async def fetch_and_upload_jsons(storage):
83
+ """List JSON feature files, fetch them, and upload to S3 under features/"""
84
+ url = f"{API_BASE_URL}/features"
85
+ resp = requests.get(url, headers=HEADERS)
86
+ resp.raise_for_status()
87
+ files = resp.json().get("files", [])
88
+ json_files = [f["filename"] for f in files if f.get("file_type") == "json"]
89
+ if not json_files:
90
+ print("[INFO] No JSON feature files to upload.")
91
+ return
92
+ # Delete all old feature_report_*.json files before saving any new ones (both locally and on S3)
93
+ import glob
94
+ import os
95
+ # Local delete (as before)
96
+ features_dir = os.path.join("data", "advisorai-data", "features")
97
+ report_files = glob.glob(os.path.join(features_dir, "feature_report_*.json"))
98
+ for old_report in report_files:
99
+ try:
100
+ os.remove(old_report)
101
+ print(f"[INFO] Deleted old local report: {old_report}")
102
+ except Exception as e:
103
+ print(f"[WARN] Could not delete local {old_report}: {e}", file=sys.stderr)
104
+
105
+ # S3 delete (list all files in the prefix and filter manually)
106
+ try:
107
+ s3_files = storage.list_prefix("advisorai-data/features/")
108
+ s3_report_files = [f for f in s3_files if f.startswith("advisorai-data/features/feature_report_") and f.endswith(".json")]
109
+ for s3_report in s3_report_files:
110
+ try:
111
+ storage.delete(s3_report)
112
+ print(f"[INFO] Deleted old S3 report: {s3_report}")
113
+ except Exception as e:
114
+ print(f"[WARN] Could not delete S3 {s3_report}: {e}", file=sys.stderr)
115
+ except Exception as e:
116
+ print(f"[WARN] Could not list/delete S3 feature_report_*.json: {e}", file=sys.stderr)
117
+
118
+ for fname in json_files:
119
+ dl_url = f"{API_BASE_URL}/features/{fname}"
120
+ r = requests.get(dl_url, headers=HEADERS, stream=True)
121
+ r.raise_for_status()
122
+ data = r.content
123
+ key = f"advisorai-data/features/{fname}"
124
+ try:
125
+ storage.upload(key, data, content_type="application/json")
126
+ print(f"[OK] Uploaded {fname} -> {storage.get_last_mode()}:{key}")
127
+ # Also save locally
128
+ local_path = os.path.join("data", key)
129
+ os.makedirs(os.path.dirname(local_path), exist_ok=True)
130
+ with open(local_path, "wb") as f:
131
+ f.write(data)
132
+ print(f"[OK] Saved locally: {local_path}")
133
+ except Exception as e:
134
+ print(f"[ERROR] Failed uploading {fname}: {e}", file=sys.stderr)
135
+
136
+ # async def fetch_and_upload_archive_parquet(storage):
137
+ # """Fetch archive from MongoDB, convert to Parquet, and upload to S3 at archive/merged_features.parquet"""
138
+ # client = MongoClient(MONGODB_URI)
139
+ # db = client[MONGODB_DATABASE]
140
+ # coll = db[MONGODB_COLLECTION_FEATURES]
141
+ # docs = list(coll.find())
142
+ # if not docs:
143
+ # print("[INFO] No documents in archive collection.")
144
+ # return
145
+ # for d in docs:
146
+ # d.pop("_id", None)
147
+ # df = pd.DataFrame(docs)
148
+ # buf = BytesIO()
149
+ # df.to_parquet(buf, index=False)
150
+ # data = buf.getvalue()
151
+ # key = "advisorai-data/archive/merged_features.parquet"
152
+ # try:
153
+ # storage.upload(key, data, content_type="application/octet-stream")
154
+ # print(f"[OK] Uploaded archive Parquet -> {storage.get_last_mode()}:{key}")
155
+ # # Also save locally
156
+ # local_path = os.path.join("data", key)
157
+ # os.makedirs(os.path.dirname(local_path), exist_ok=True)
158
+ # with open(local_path, "wb") as f:
159
+ # f.write(data)
160
+ # print(f"[OK] Saved locally: {local_path}")
161
+ # except Exception as e:
162
+ # print(f"[ERROR] Failed uploading archive Parquet: {e}", file=sys.stderr)
163
+
164
+ def create_train_merged_parquet(storage):
165
+ """Create advisorai-data/train/merged_features.parquet by merging archive and latest features, deduping by (symbol, interval_timestamp)."""
166
+ # Download archive/merged_features.parquet
167
+ from io import BytesIO
168
+ import pandas as pd
169
+ archive_key = "advisorai-data/archive/merged_features.parquet"
170
+ latest_key = "advisorai-data/features/latest_features.parquet"
171
+ train_key = "advisorai-data/train/merged_features.parquet"
172
+ try:
173
+ archive_buf = BytesIO(storage.download(archive_key))
174
+ df_archive = pd.read_parquet(archive_buf)
175
+ except Exception as e:
176
+ print(f"[WARN] Could not load archive parquet: {e}", file=sys.stderr)
177
+ df_archive = pd.DataFrame()
178
+ try:
179
+ latest_buf = BytesIO(storage.download(latest_key))
180
+ df_latest = pd.read_parquet(latest_buf)
181
+ except Exception as e:
182
+ print(f"[WARN] Could not load latest features parquet: {e}", file=sys.stderr)
183
+ df_latest = pd.DataFrame()
184
+ if df_archive.empty and df_latest.empty:
185
+ print("[INFO] No data to merge for train/merged_features.parquet.")
186
+ return
187
+ # Concatenate and deduplicate by (symbol, interval_timestamp)
188
+ df_all = pd.concat([df_archive, df_latest], ignore_index=True)
189
+ if 'symbol' in df_all.columns and 'interval_timestamp' in df_all.columns:
190
+ df_all = df_all.drop_duplicates(subset=["symbol", "interval_timestamp"], keep="last")
191
+ else:
192
+ print("[WARN] 'symbol' or 'interval_timestamp' column missing, skipping deduplication.")
193
+ # Save to train/merged_features.parquet
194
+ buf = BytesIO()
195
+ df_all.to_parquet(buf, index=False)
196
+ data = buf.getvalue()
197
+ try:
198
+ storage.upload(train_key, data, content_type="application/octet-stream")
199
+ print(f"[OK] Uploaded train merged features -> {storage.get_last_mode()}:{train_key}")
200
+ # Also save locally
201
+ local_path = os.path.join("data", train_key)
202
+ os.makedirs(os.path.dirname(local_path), exist_ok=True)
203
+ with open(local_path, "wb") as f:
204
+ f.write(data)
205
+ print(f"[OK] Saved locally: {local_path}")
206
+ except Exception as e:
207
+ print(f"[ERROR] Failed uploading train merged features: {e}", file=sys.stderr)
208
+
209
+ # ─── Main entrypoint ─────────────────────────────────────────────────────────
210
+
211
+ def main():
212
+ # Use StorageHandler with both S3 and local enabled
213
+ storage = StorageHandler(
214
+ endpoint_url=FILEBASE_ENDPOINT,
215
+ access_key=FILEBASE_ACCESS_KEY,
216
+ secret_key=FILEBASE_SECRET_KEY,
217
+ bucket_name=FILEBASE_BUCKET,
218
+ local_base="data"
219
+ )
220
+ fetch_and_upload_latest_parquet(storage)
221
+ asyncio.run(fetch_and_upload_jsons(storage))
222
+ # asyncio.run(fetch_and_upload_archive_parquet(storage))
223
+ create_train_merged_parquet(storage)
224
+
225
+ if __name__ == "__main__":
226
+ main()
src/fetchers/alpaca_api/__init__.py ADDED
@@ -0,0 +1,32 @@
# alpaca/__init__.py

from .config import settings
from .clients import StocksClient, CryptoClient, OptionsClient
from .fetchers import (
    fetch_stock_bars,
    fetch_crypto_bars,
    fetch_option_bars,
    fetch_stock_trades,
    fetch_crypto_trades,
    fetch_stock_quotes,
    fetch_crypto_quotes,
)
from .utils import logger, backoff, to_rfc3339, parse_rfc3339

__all__ = [
    "settings",
    "StocksClient",
    "CryptoClient",
    "OptionsClient",
    "fetch_stock_bars",
    "fetch_crypto_bars",
    "fetch_option_bars",
    "fetch_stock_trades",
    "fetch_crypto_trades",
    "fetch_stock_quotes",
    "fetch_crypto_quotes",
    "logger",
    "backoff",
    "to_rfc3339",
    "parse_rfc3339",
]
src/fetchers/alpaca_api/clients/__init__.py ADDED
@@ -0,0 +1,7 @@
# alpaca/clients/__init__.py

from .stocks import StocksClient
from .crypto import CryptoClient
from .options import OptionsClient

__all__ = ["StocksClient", "CryptoClient", "OptionsClient"]
src/fetchers/alpaca_api/clients/crypto.py ADDED
@@ -0,0 +1,95 @@
1
+ # alpaca/clients/crypto.py
2
+
3
+ from datetime import datetime
4
+ from typing import Optional
5
+ import re
6
+ from alpaca.data.historical import CryptoHistoricalDataClient
7
+ from alpaca.data.requests import (
8
+ CryptoBarsRequest,
9
+ CryptoTradesRequest,
10
+ CryptoQuoteRequest,
11
+ )
12
+ from alpaca.data.timeframe import TimeFrame, TimeFrameUnit
13
+ from ..config import settings
14
+
15
+ class CryptoClient:
16
+ def __init__(self):
17
+ # You can omit api_key/secret for crypto, but providing them raises rate limits
18
+ self.client = CryptoHistoricalDataClient(
19
+ api_key=settings.ALPACA_API_KEY,
20
+ secret_key=settings.ALPACA_API_SECRET,
21
+ )
22
+
23
+ def get_bars(
24
+ self,
25
+ symbol: str,
26
+ timeframe: str | TimeFrame,
27
+ start: datetime,
28
+ end: datetime,
29
+ limit: int = 1000,
30
+ feed: Optional[str] = None,
31
+ ):
32
+ """
33
+ Fetch historical OHLCV bars for a given crypto symbol.
34
+ Accepts either a TimeFrame enum or a string like "1Day", "5Minute", etc.
35
+ """
36
+ if isinstance(timeframe, str):
37
+ m = re.fullmatch(r"(\d+)([A-Za-z]+)", timeframe)
38
+ if not m:
39
+ raise ValueError(f"Invalid timeframe format: {timeframe!r}")
40
+ amt, unit_str = m.groups()
41
+ unit_key = unit_str.capitalize().rstrip("s")
42
+ unit = TimeFrameUnit[unit_key]
43
+ timeframe = TimeFrame(int(amt), unit)
44
+ req = CryptoBarsRequest(
45
+ symbol_or_symbols=symbol,
46
+ timeframe=timeframe,
47
+ start=start,
48
+ end=end,
49
+ limit=limit,
50
+ feed=feed,
51
+ )
52
+ return self.client.get_crypto_bars(req)
53
+ # ↳ uses CryptoBarsRequest(symbol_or_symbols, timeframe, start, end, limit, feed) :contentReference[oaicite:0]{index=0}
54
+
55
+ def get_trades(
56
+ self,
57
+ symbol: str,
58
+ start: datetime,
59
+ end: datetime,
60
+ limit: int = 1000,
61
+ sort: Optional[str] = None,
62
+ ):
63
+ """
64
+ Fetch historical trade ticks for a given crypto symbol.
65
+ """
66
+ req = CryptoTradesRequest(
67
+ symbol_or_symbols=symbol,
68
+ start=start,
69
+ end=end,
70
+ limit=limit,
71
+ sort=sort,
72
+ )
73
+ return self.client.get_crypto_trades(req)
74
+ # ↳ uses CryptoTradesRequest(symbol_or_symbols, start, end, limit, sort) :contentReference[oaicite:1]{index=1}
75
+
76
+ def get_quotes(
77
+ self,
78
+ symbol: str,
79
+ start: datetime,
80
+ end: datetime,
81
+ limit: int = 1000,
82
+ sort: Optional[str] = None,
83
+ ):
84
+ """
85
+ Fetch historical Level-1 quotes for a given crypto symbol.
86
+ """
87
+ req = CryptoQuoteRequest(
88
+ symbol_or_symbols=symbol,
89
+ start=start,
90
+ end=end,
91
+ limit=limit,
92
+ sort=sort,
93
+ )
94
+ return self.client.get_crypto_quotes(req)
95
+ # ↳ uses CryptoQuoteRequest(symbol_or_symbols, start, end, limit, sort) :contentReference[oaicite:2]{index=2}
src/fetchers/alpaca_api/clients/main.py ADDED
@@ -0,0 +1,45 @@
1
+ # from datetime import datetime, timedelta
2
+ # import sys
3
+ # import os
4
+ # import pandas as pd
5
+ # sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
6
+ # from alpaca_api.clients.stocks import StocksClient
7
+
8
+ # def normalize_records(records):
9
+ # dicts = [rec.model_dump() for rec in records]
10
+ # for rec in dicts:
11
+ # for k, v in rec.items():
12
+ # if hasattr(v, 'isoformat'):
13
+ # rec[k] = v.isoformat()
14
+ # return dicts
15
+
16
+ # if __name__ == "__main__":
17
+ # client = StocksClient()
18
+ # symbol = "AAPL"
19
+ # timeframe = "1Day"
20
+ # end = datetime.utcnow()
21
+ # start = end - timedelta(days=7)
22
+
23
+ # output_dir = os.path.join("..", "..", "..", "data", "alpaca")
24
+ # os.makedirs(output_dir, exist_ok=True)
25
+
26
+ # print(f"Testing get_bars for {symbol} from {start} to {end}")
27
+ # bars = client.get_bars(symbol, timeframe, start, end, limit=10)
28
+ # # print("Bars:", bars)
29
+ # bars_records = normalize_records(bars.data[symbol])
30
+ # bars_df = pd.DataFrame(bars_records)
31
+ # bars_df.to_parquet(os.path.join(output_dir, f"{symbol}_bars.parquet"), index=False)
32
+
33
+ # print(f"Testing get_trades for {symbol} from {start} to {end}")
34
+ # trades = client.get_trades(symbol, start, end, limit=10)
35
+ # # print("Trades:", trades)
36
+ # trades_records = normalize_records(trades.data[symbol])
37
+ # trades_df = pd.DataFrame(trades_records)
38
+ # trades_df.to_parquet(os.path.join(output_dir, f"{symbol}_trades.parquet"), index=False)
39
+
40
+ # print(f"Testing get_quotes for {symbol} from {start} to {end}")
41
+ # quotes = client.get_quotes(symbol, start, end, limit=10)
42
+ # # print("Quotes:", quotes)
43
+ # quotes_records = normalize_records(quotes.data[symbol])
44
+ # quotes_df = pd.DataFrame(quotes_records)
45
+ # quotes_df.to_parquet(os.path.join(output_dir, f"{symbol}_quotes.parquet"), index=False)
src/fetchers/alpaca_api/clients/options.py ADDED
@@ -0,0 +1,72 @@
1
+ # alpaca/clients/options.py
2
+
3
+ from datetime import datetime
4
+ from typing import Optional, Union
5
+ import re
6
+ from alpaca.data.historical import OptionHistoricalDataClient
7
+ from alpaca.data.requests import (
8
+ OptionBarsRequest,
9
+ OptionTradesRequest,
10
+ )
11
+ from alpaca.data.timeframe import TimeFrame, TimeFrameUnit
12
+ from ..config import settings
13
+
14
+ class OptionsClient:
15
+ def __init__(self):
16
+ self.client = OptionHistoricalDataClient(
17
+ api_key=settings.ALPACA_API_KEY,
18
+ secret_key=settings.ALPACA_API_SECRET,
19
+ )
20
+
21
+ def get_bars(
22
+ self,
23
+ symbol: str,
24
+ timeframe: Union[str, TimeFrame],
25
+ start: datetime,
26
+ end: datetime,
27
+ limit: int = 1000,
28
+ sort: Optional[str] = None,
29
+ ):
30
+ """
31
+ Fetch historical OHLCV bars for a given option contract.
32
+ Accepts either a TimeFrame enum or a string like "1Day", "5Minute", etc.
33
+ """
34
+ if isinstance(timeframe, str):
35
+ m = re.fullmatch(r"(\d+)([A-Za-z]+)", timeframe)
36
+ if not m:
37
+ raise ValueError(f"Invalid timeframe format: {timeframe!r}")
38
+ amount, unit_str = m.groups()
39
+ unit_key = unit_str.capitalize().rstrip("s")
40
+ unit = TimeFrameUnit[unit_key]
41
+ timeframe = TimeFrame(int(amount), unit)
42
+ req = OptionBarsRequest(
43
+ symbol_or_symbols=symbol,
44
+ timeframe=timeframe,
45
+ start=start,
46
+ end=end,
47
+ limit=limit,
48
+ sort=sort,
49
+ )
50
+ return self.client.get_option_bars(req)
51
+ # ↳ uses OptionBarsRequest(symbol_or_symbols, timeframe, start, end, limit, sort) :contentReference[oaicite:0]{index=0}
52
+
53
+ def get_trades(
54
+ self,
55
+ symbol: str,
56
+ start: datetime,
57
+ end: datetime,
58
+ limit: int = 1000,
59
+ sort: Optional[str] = None,
60
+ ):
61
+ """
62
+ Fetch historical trade ticks for a given option contract.
63
+ """
64
+ req = OptionTradesRequest(
65
+ symbol_or_symbols=symbol,
66
+ start=start,
67
+ end=end,
68
+ limit=limit,
69
+ sort=sort,
70
+ )
71
+ return self.client.get_option_trades(req)
72
+ # ↳ uses OptionTradesRequest(symbol_or_symbols, start, end, limit, sort) :contentReference[oaicite:1]{index=1}
src/fetchers/alpaca_api/clients/stocks.py ADDED
@@ -0,0 +1,90 @@
1
+ # alpaca_api/clients/stocks.py
2
+
3
+ from datetime import datetime
4
+ import re
5
+ from alpaca.data.historical import StockHistoricalDataClient
6
+ from alpaca.data.timeframe import TimeFrame, TimeFrameUnit
7
+ from alpaca.data.requests import StockBarsRequest, StockTradesRequest, StockQuotesRequest, DataFeed
8
+ import sys, os
9
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
10
+ from alpaca_api.config import settings
11
+
12
+ class StocksClient:
13
+ def __init__(self):
14
+ self.client = StockHistoricalDataClient(
15
+ api_key=settings.ALPACA_API_KEY,
16
+ secret_key=settings.ALPACA_API_SECRET,
17
+ )
18
+
19
+ def get_bars(
20
+ self,
21
+ symbol: str,
22
+ timeframe: str | TimeFrame,
23
+ start: datetime,
24
+ end: datetime,
25
+ limit: int = 1000,
26
+ ):
27
+ """
28
+ Fetch historical OHLCV bars for a given stock.
29
+ Accepts either a TimeFrame enum or a string like "1Day", "5Minute", etc.
30
+ """
31
+ if isinstance(timeframe, str):
32
+ m = re.fullmatch(r"(\d+)([A-Za-z]+)", timeframe)
33
+ if not m:
34
+ raise ValueError(f"Invalid timeframe format: {timeframe!r}")
35
+ amount_str, unit_str = m.groups()
36
+ # Normalize unit name to match TimeFrameUnit keys (Minute, Hour, Day, Week, Month)
37
+ unit_key = unit_str.capitalize().rstrip("s")
38
+ unit = TimeFrameUnit[unit_key]
39
+ timeframe = TimeFrame(int(amount_str), unit)
40
+ # Now we have a proper TimeFrame instance
41
+ req = StockBarsRequest(
42
+ symbol_or_symbols=symbol,
43
+ timeframe=timeframe,
44
+ start=start,
45
+ end=end,
46
+ limit=limit,
47
+ feed=DataFeed.IEX, # use IEX for free delayed data
48
+ )
49
+ return self.client.get_stock_bars(req)
50
+ # ↳ requires StockBarsRequest(symbol_or_symbols, timeframe, start, end, limit) :contentReference[oaicite:0]{index=0}
51
+
52
+ def get_trades(
53
+ self,
54
+ symbol: str,
55
+ start: datetime,
56
+ end: datetime,
57
+ limit: int = 1000,
58
+ ):
59
+ """
60
+ Fetch historical trade ticks for a given stock.
61
+ """
62
+ req = StockTradesRequest(
63
+ symbol_or_symbols=symbol,
64
+ start=start,
65
+ end=end,
66
+ limit=limit,
67
+ feed=DataFeed.IEX, # use IEX for free delayed trade data
68
+ )
69
+ return self.client.get_stock_trades(req)
70
+ # ↳ takes symbol_or_symbols, start, end, limit :contentReference[oaicite:1]{index=1}
71
+
72
+ def get_quotes(
73
+ self,
74
+ symbol: str,
75
+ start: datetime,
76
+ end: datetime,
77
+ limit: int = 1000,
78
+ ):
79
+ """
80
+ Fetch historical Level-1 quotes (bid/ask) for a given stock.
81
+ """
82
+ req = StockQuotesRequest(
83
+ symbol_or_symbols=symbol,
84
+ start=start,
85
+ end=end,
86
+ limit=limit,
87
+ feed=DataFeed.IEX, # use IEX for free delayed quote data
88
+ )
89
+ return self.client.get_stock_quotes(req)
90
+ # ↳ takes symbol_or_symbols, start, end, limit
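A minimal usage sketch for StocksClient.get_bars, assuming src/fetchers is on sys.path and ALPACA_API_KEY / ALPACA_API_SECRET are configured via the environment or .env; the symbol, date range and limit are illustrative only.

from datetime import datetime, timedelta
from alpaca_api.clients.stocks import StocksClient

client = StocksClient()
end = datetime.utcnow()
start = end - timedelta(days=5)
# "1Day" is parsed inside get_bars into TimeFrame(1, TimeFrameUnit.Day)
bars = client.get_bars("AAPL", "1Day", start, end, limit=100)
print(bars.df.head())  # alpaca-py bar responses expose a pandas DataFrame via .df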
src/fetchers/alpaca_api/config.py ADDED
@@ -0,0 +1,17 @@
1
+ # alpaca_api/config.py
2
+
3
+ from pydantic_settings import BaseSettings, SettingsConfigDict
4
+
5
+ class Settings(BaseSettings):
6
+ ALPACA_API_KEY: str
7
+ ALPACA_API_SECRET: str
8
+ ALPACA_BASE_URL: str = "https://paper-api.alpaca.markets/v2"
9
+ PAPER: bool = True
10
+
11
+ model_config = SettingsConfigDict(
12
+ env_file=".env",
13
+ env_file_encoding="utf-8",
14
+ extra="ignore", # allow all other .env keys without error
15
+ )
16
+
17
+ settings = Settings()
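A quick sketch of how the Settings object is consumed, assuming ALPACA_API_KEY and ALPACA_API_SECRET exist in the environment or a local .env; every other key falls back to the defaults above.

from alpaca_api.config import settings

# Extra keys in .env are ignored thanks to extra="ignore".
print(settings.ALPACA_BASE_URL)  # https://paper-api.alpaca.markets/v2 by default
print(settings.PAPER)            # True unless PAPER=false is set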
src/fetchers/alpaca_api/fetchers/__init__.py ADDED
@@ -0,0 +1,15 @@
1
+ # alpaca_api/fetchers/__init__.py
2
+
3
+ from .bars import fetch_stock_bars, fetch_crypto_bars, fetch_option_bars
4
+ from .trades import fetch_stock_trades, fetch_crypto_trades
5
+ from .quotes import fetch_stock_quotes, fetch_crypto_quotes
6
+
7
+ __all__ = [
8
+ "fetch_stock_bars",
9
+ "fetch_crypto_bars",
10
+ "fetch_option_bars",
11
+ "fetch_stock_trades",
12
+ "fetch_crypto_trades",
13
+ "fetch_stock_quotes",
14
+ "fetch_crypto_quotes",
15
+ ]
src/fetchers/alpaca_api/fetchers/bars.py ADDED
@@ -0,0 +1,58 @@
1
+ # alpaca_api/fetchers/bars.py
2
+
3
+ from datetime import datetime
4
+ from ..clients.stocks import StocksClient
5
+ from ..clients.crypto import CryptoClient
6
+ from ..clients.options import OptionsClient
7
+ from ..utils import backoff, logger
8
+
9
+ # instantiate once
10
+ stocks_client = StocksClient()
11
+ crypto_client = CryptoClient()
12
+ options_client = OptionsClient()
13
+
14
+ @backoff(max_retries=5, base_delay=1, factor=2)
15
+ def fetch_stock_bars(
16
+ symbol: str,
17
+ start: datetime,
18
+ end: datetime,
19
+ timeframe: str,
20
+ limit: int = 1000,
21
+ ):
22
+ """
23
+ Fetch OHLCV bars for a stock, with retry/back-off and logging.
24
+ """
25
+ logger.info(f"Fetching stock bars: {symbol=} {timeframe=} {start=} to {end=} limit={limit}")
26
+ return stocks_client.get_bars(symbol, timeframe, start, end, limit)
27
+
28
+
29
+ @backoff(max_retries=5, base_delay=1, factor=2)
30
+ def fetch_crypto_bars(
31
+ symbol: str,
32
+ start: datetime,
33
+ end: datetime,
34
+ timeframe: str,
35
+ limit: int = 1000,
36
+ feed: str | None = None,
37
+ ):
38
+ """
39
+ Fetch OHLCV bars for a crypto, with retry/back-off and logging.
40
+ """
41
+ logger.info(f"Fetching crypto bars: {symbol=} {timeframe=} {start=} to {end=} limit={limit} feed={feed}")
42
+ return crypto_client.get_bars(symbol, timeframe, start, end, limit, feed)
43
+
44
+
45
+ @backoff(max_retries=5, base_delay=1, factor=2)
46
+ def fetch_option_bars(
47
+ symbol: str,
48
+ start: datetime,
49
+ end: datetime,
50
+ timeframe: str,
51
+ limit: int = 1000,
52
+ sort: str | None = None,
53
+ ):
54
+ """
55
+ Fetch OHLCV bars for an option contract, with retry/back-off and logging.
56
+ """
57
+ logger.info(f"Fetching option bars: {symbol=} {timeframe=} {start=} to {end=} limit={limit} sort={sort}")
58
+ return options_client.get_bars(symbol, timeframe, start, end, limit, sort)
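A usage sketch for the bar fetchers, assuming src/fetchers is importable and Alpaca credentials are configured; symbols, the date range and the limit are illustrative.

from datetime import datetime, timedelta
from alpaca_api.fetchers import fetch_stock_bars, fetch_crypto_bars

end = datetime.utcnow()
start = end - timedelta(days=7)

# Each call is wrapped by @backoff, so transient API errors are retried
# up to five times with exponentially growing delays before re-raising.
stock_bars = fetch_stock_bars("AAPL", start, end, timeframe="1Day", limit=500)
crypto_bars = fetch_crypto_bars("BTC/USD", start, end, timeframe="1Day", limit=500)
print(len(stock_bars.data.get("AAPL", [])), len(crypto_bars.data.get("BTC/USD", [])))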
src/fetchers/alpaca_api/fetchers/quotes.py ADDED
@@ -0,0 +1,40 @@
1
+ # alpaca_api/fetchers/quotes.py
2
+
3
+ from datetime import datetime
4
+ from ..clients.stocks import StocksClient
5
+ from ..clients.crypto import CryptoClient
6
+ from ..utils import backoff, logger
7
+
8
+ # instantiate clients once
9
+ stocks_client = StocksClient()
10
+ crypto_client = CryptoClient()
11
+
12
+ @backoff(max_retries=5, base_delay=1, factor=2)
13
+ def fetch_stock_quotes(
14
+ symbol: str,
15
+ start: datetime,
16
+ end: datetime,
17
+ limit: int = 1000,
18
+ sort: str | None = None,
19
+ ):
20
+ """
21
+ Fetch historical Level-1 quotes (bid/ask) for a stock, with retry/back-off and logging.
22
+ """
23
+ logger.info(f"Fetching stock quotes: symbol={symbol}, start={start}, end={end}, limit={limit}, sort={sort}")
24
+ return stocks_client.get_quotes(symbol, start, end, limit)
25
+ # ↳ uses StockQuotesRequest(symbol_or_symbols, start, end, limit)
26
+
27
+ @backoff(max_retries=5, base_delay=1, factor=2)
28
+ def fetch_crypto_quotes(
29
+ symbol: str,
30
+ start: datetime,
31
+ end: datetime,
32
+ limit: int = 1000,
33
+ sort: str | None = None,
34
+ ):
35
+ """
36
+ Fetch historical Level-1 quotes for a crypto symbol, with retry/back-off and logging.
37
+ """
38
+ logger.info(f"Fetching crypto quotes: symbol={symbol}, start={start}, end={end}, limit={limit}, sort={sort}")
39
+ return crypto_client.get_quotes(symbol, start, end, limit)
40
+ # ↳ uses CryptoQuoteRequest(symbol_or_symbols, start, end, limit, sort)
src/fetchers/alpaca_api/fetchers/trades.py ADDED
@@ -0,0 +1,38 @@
1
+ # alpaca_api/fetchers/trades.py
2
+
3
+ from datetime import datetime
4
+ from ..clients.stocks import StocksClient
5
+ from ..clients.crypto import CryptoClient
6
+ from ..utils import backoff, logger
7
+
8
+ # instantiate clients once
9
+ stocks_client = StocksClient()
10
+ crypto_client = CryptoClient()
11
+
12
+ @backoff(max_retries=5, base_delay=1, factor=2)
13
+ def fetch_stock_trades(
14
+ symbol: str,
15
+ start: datetime,
16
+ end: datetime,
17
+ limit: int = 1000,
18
+ sort: str | None = None,
19
+ ):
20
+ """
21
+ Fetch historical trade ticks for a stock, with retry/back-off and logging.
22
+ """
23
+ logger.info(f"Fetching stock trades: symbol={symbol}, start={start}, end={end}, limit={limit}, sort={sort}")
24
+ return stocks_client.get_trades(symbol, start, end, limit)
25
+
26
+ @backoff(max_retries=5, base_delay=1, factor=2)
27
+ def fetch_crypto_trades(
28
+ symbol: str,
29
+ start: datetime,
30
+ end: datetime,
31
+ limit: int = 1000,
32
+ sort: str | None = None,
33
+ ):
34
+ """
35
+ Fetch historical trade ticks for a crypto symbol, with retry/back-off and logging.
36
+ """
37
+ logger.info(f"Fetching crypto trades: symbol={symbol}, start={start}, end={end}, limit={limit}, sort={sort}")
38
+ return crypto_client.get_trades(symbol, start, end, limit)
src/fetchers/alpaca_api/main.py ADDED
@@ -0,0 +1,193 @@
1
+ def normalize_crypto_symbol(sym: str) -> str:
2
+ return sym if "/" in sym else f"{sym}/USD"
3
+ import os
4
+ import sys
5
+ from datetime import datetime, timedelta
6
+
7
+ import pandas as pd
8
+
9
+
10
+ # Add src/fetchers to sys.path for direct execution
11
+ base = os.path.dirname(__file__)
12
+ src_fetchers = os.path.abspath(os.path.join(base, ".."))
13
+ sys.path.insert(0, src_fetchers)
14
+
15
+ from alpaca_api.fetchers import (
16
+ fetch_stock_bars,
17
+ fetch_stock_trades,
18
+ fetch_stock_quotes,
19
+ fetch_crypto_bars,
20
+ fetch_crypto_trades,
21
+ fetch_option_bars,
22
+ )
23
+ from alpaca_api.config import settings
24
+
25
+ def normalize_records(records):
26
+ """Convert Pydantic models to ISO-format dicts."""
27
+ dicts = [rec.model_dump() for rec in records]
28
+ for rec in dicts:
29
+ for k, v in rec.items():
30
+ if hasattr(v, "isoformat"):
31
+ rec[k] = v.isoformat()
32
+ return dicts
33
+
34
+ def save_df(df: pd.DataFrame, fname: str):
35
+ out = os.path.join("data", "alpaca", fname)
36
+ os.makedirs(os.path.dirname(out), exist_ok=True)
37
+
38
+ # Check if file exists and implement incremental loading
39
+ if os.path.exists(out):
40
+ try:
41
+ existing_df = pd.read_parquet(out)
42
+ print(f"-> existing data has {len(existing_df)} records")
43
+
44
+ # Combine and remove duplicates based on timestamp and symbol
45
+ combined_df = pd.concat([existing_df, df], ignore_index=True)
46
+
47
+ # Remove duplicates keeping the latest record
48
+ if 'timestamp' in combined_df.columns and 'symbol' in combined_df.columns:
49
+ combined_df = combined_df.drop_duplicates(subset=['timestamp', 'symbol'], keep='last')
50
+ elif 'timestamp' in combined_df.columns:
51
+ combined_df = combined_df.drop_duplicates(subset=['timestamp'], keep='last')
52
+
53
+ # Sort by timestamp for consistency
54
+ if 'timestamp' in combined_df.columns:
55
+ combined_df = combined_df.sort_values('timestamp')
56
+
57
+ combined_df.to_parquet(out, index=False)
58
+ print(f"-> updated {out} with {len(combined_df)} total records ({len(df)} new)")
59
+ except Exception as e:
60
+ print(f"-> error merging with existing data: {e}, overwriting")
61
+ df.to_parquet(out, index=False)
62
+ print(f"-> wrote {out} with {len(df)} records")
63
+ else:
64
+ df.to_parquet(out, index=False)
65
+ print(f"-> wrote {out} with {len(df)} records")
66
+
67
+ def main():
68
+ # you can also read these from os.getenv or settings if you prefer
69
+ stock_symbols = ["AAPL", "TSLA", "GOOGL", "MSFT", "NVDA", "COIN"] # Added COIN
70
+ crypto_symbols = ["BTC", "ETH", "SOL", "ADA", "XRP"]
71
+ # option symbols use the Alpaca format: "<UNDERLYING>_<YYYYMMDD>_<STRIKE>_<C/P>"
72
+ # option_symbols = ["AAPL_20250718_150_C", "TSLA_20250718_700_P"]
73
+
74
+ def normalize_option_symbol(sym: str) -> str:
75
+ # expects “UNDERLYING_YYYYMMDD_STRIKE_C” or “P”
76
+ underlying, ymd, strike, cp = sym.split("_")
77
+ yymmdd = ymd[2:] # “20250718” → “250718”
78
+ amt = int(float(strike) * 1000)
79
+ strike_str = f"{amt:08d}"
80
+ return f"{underlying}{yymmdd}{cp}{strike_str}"
81
+ days = "1Day"
82
+
83
+ end = datetime.utcnow()
84
+
85
+ # Check for existing data to determine start date
86
+ def get_start_date_for_symbol(symbol, data_type="bars"):
87
+ fname = f"{symbol}_{data_type}.parquet"
88
+ out = os.path.join("data", "alpaca", fname)
89
+
90
+ if os.path.exists(out):
91
+ try:
92
+ existing_df = pd.read_parquet(out)
93
+ if not existing_df.empty and 'timestamp' in existing_df.columns:
94
+ # Get the latest timestamp and add 1 day to avoid duplicates
95
+ latest_timestamp = pd.to_datetime(existing_df['timestamp'].max())
96
+ start_from_latest = latest_timestamp + timedelta(days=1)
97
+
98
+ # Don't go back more than 30 days from now to limit data size
99
+ max_lookback = end - timedelta(days=30)
100
+ start_date = max(start_from_latest, max_lookback)
101
+
102
+ print(f"-> {symbol} {data_type}: continuing from {start_date}")
103
+ return start_date
104
+ except Exception as e:
105
+ print(f"-> error reading existing {fname}: {e}")
106
+
107
+ # Default: get last 30 days for new symbols
108
+ default_start = end - timedelta(days=30)
109
+ print(f"-> {symbol} {data_type}: starting fresh from {default_start}")
110
+ return default_start
111
+
112
+ # STOCKS: bars, trades, quotes
113
+ for sym in stock_symbols:
114
+ print(f"\nFetching stock data for {sym}:")
115
+
116
+ # Get appropriate start dates for each data type
117
+ start_bars = get_start_date_for_symbol(sym, "bars")
118
+ start_trades = get_start_date_for_symbol(sym, "trades")
119
+ start_quotes = get_start_date_for_symbol(sym, "quotes")
120
+
121
+ # Only fetch if there's a meaningful time range
122
+ if start_bars < end:
123
+ bars = fetch_stock_bars(sym, start_bars, end, days, limit=1000) # Increased limit
124
+ save_df(pd.DataFrame(normalize_records(bars.data[sym])), f"{sym}_bars.parquet")
125
+ else:
126
+ print(f"-> {sym} bars: no new data to fetch")
127
+
128
+ if start_trades < end:
129
+ trades = fetch_stock_trades(sym, start_trades, end, limit=1000) # Increased limit
130
+ save_df(pd.DataFrame(normalize_records(trades.data[sym])), f"{sym}_trades.parquet")
131
+ else:
132
+ print(f"-> {sym} trades: no new data to fetch")
133
+
134
+ if start_quotes < end:
135
+ quotes = fetch_stock_quotes(sym, start_quotes, end, limit=1000) # Increased limit
136
+ save_df(pd.DataFrame(normalize_records(quotes.data[sym])), f"{sym}_quotes.parquet")
137
+ else:
138
+ print(f"-> {sym} quotes: no new data to fetch")
139
+
140
+ # CRYPTO: bars, trades
141
+ for sym in crypto_symbols:
142
+ pair = normalize_crypto_symbol(sym)
143
+ print(f"\nFetching crypto data for {pair}:")
144
+ try:
145
+ # Get appropriate start dates for crypto data
146
+ start_bars = get_start_date_for_symbol(pair.replace('/', '_'), "bars")
147
+ start_trades = get_start_date_for_symbol(pair.replace('/', '_'), "trades")
148
+
149
+ # Only fetch if there's a meaningful time range
150
+ bar_records = []
151
+ trade_records = []
152
+
153
+ if start_bars < end:
154
+ bars = fetch_crypto_bars(pair, start_bars, end, days, limit=1000) # Increased limit
155
+ bar_records = bars.data.get(pair, [])
156
+ else:
157
+ print(f"-> {pair} bars: no new data to fetch")
158
+
159
+ if start_trades < end:
160
+ trades = fetch_crypto_trades(pair, start_trades, end, limit=1000) # Increased limit
161
+ trade_records = trades.data.get(pair, [])
162
+ else:
163
+ print(f"-> {pair} trades: no new data to fetch")
164
+
165
+ if bar_records:
166
+ save_df(
167
+ pd.DataFrame(normalize_records(bar_records)),
168
+ f"{pair.replace('/', '_')}_bars.parquet",
169
+ )
170
+ else:
171
+ print(f"-> no bar data for {pair}, skipping")
172
+
173
+ if trade_records:
174
+ save_df(
175
+ pd.DataFrame(normalize_records(trade_records)),
176
+ f"{pair.replace('/', '_')}_trades.parquet",
177
+ )
178
+ else:
179
+ print(f"-> no trade data for {pair}, skipping")
180
+
181
+ except Exception as e:
182
+ print(f"⚠️ error fetching {pair}: {e!r}, skipping")
183
+ continue
184
+
185
+ # # OPTIONS: bars only
186
+ # for sym in option_symbols:
187
+ # occ = normalize_option_symbol(sym)
188
+ # print(f"\nFetching option bars for {occ}:")
189
+ # bars = fetch_option_bars(occ, start, end, days, limit=10)
190
+ # save_df(pd.DataFrame(normalize_records(bars.data[occ])), f"{occ}_bars.parquet")
191
+
192
+ if __name__ == "__main__":
193
+ main()
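The incremental logic in save_df above boils down to concat, drop_duplicates(keep="last") and a sort; a small self-contained sketch with made-up prices shows how a re-fetched bar overwrites the previously stored one.

import pandas as pd

existing = pd.DataFrame({
    "timestamp": ["2025-07-01", "2025-07-02"],
    "symbol": ["AAPL", "AAPL"],
    "close": [210.0, 211.5],
})
fresh = pd.DataFrame({
    "timestamp": ["2025-07-02", "2025-07-03"],
    "symbol": ["AAPL", "AAPL"],
    "close": [212.0, 214.0],
})

combined = pd.concat([existing, fresh], ignore_index=True)
# keep="last" means the newly fetched row wins on a timestamp/symbol collision
combined = combined.drop_duplicates(subset=["timestamp", "symbol"], keep="last")
combined = combined.sort_values("timestamp")
print(combined)  # three rows; 2025-07-02 keeps the newer close of 212.0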
src/fetchers/alpaca_api/merge/alpaca_features.py ADDED
File without changes
src/fetchers/alpaca_api/utils.py ADDED
@@ -0,0 +1,83 @@
1
+ # alpaca_api/utils.py
2
+
3
+ import time
4
+ import functools
5
+ import logging
6
+ from datetime import datetime, timezone
7
+ from typing import Callable, Type, Tuple, Any
8
+
9
+ # -----------------------------
10
+ # Structured logger
11
+ # -----------------------------
12
+ logger = logging.getLogger("alpaca")
13
+ logger.setLevel(logging.INFO)
14
+ handler = logging.StreamHandler()
15
+ formatter = logging.Formatter(
16
+ "%(asctime)s [%(levelname)s] %(name)s: %(message)s",
17
+ datefmt="%Y-%m-%dT%H:%M:%S%z",
18
+ )
19
+ handler.setFormatter(formatter)
20
+ if not logger.handlers:
21
+ logger.addHandler(handler)
22
+
23
+
24
+ # -----------------------------
25
+ # Exponential back-off decorator
26
+ # -----------------------------
27
+ def backoff(
28
+ max_retries: int = 5,
29
+ base_delay: float = 1.0,
30
+ factor: float = 2.0,
31
+ exceptions: Tuple[Type[BaseException], ...] = (Exception,),
32
+ ) -> Callable:
33
+ """
34
+ Decorator to retry a function with exponential back-off upon specified exceptions.
35
+
36
+ :param max_retries: maximum number of retries before giving up
37
+ :param base_delay: initial delay between retries (in seconds)
38
+ :param factor: multiplier for delay on each retry
39
+ :param exceptions: tuple of exception classes that should trigger a retry
40
+ """
41
+ def decorator(func: Callable) -> Callable:
42
+ @functools.wraps(func)
43
+ def wrapper(*args: Any, **kwargs: Any) -> Any:
44
+ retries = 0
45
+ delay = base_delay
46
+ while True:
47
+ try:
48
+ return func(*args, **kwargs)
49
+ except exceptions as e:
50
+ if retries >= max_retries:
51
+ logger.error(
52
+ f"{func.__name__}: exceeded {max_retries} retries – giving up: {e}"
53
+ )
54
+ raise
55
+ logger.warning(
56
+ f"{func.__name__}: error {e!r}, retrying in {delay:.1f}s "
57
+ f"(retry {retries + 1}/{max_retries})"
58
+ )
59
+ time.sleep(delay)
60
+ retries += 1
61
+ delay *= factor
62
+ return wrapper
63
+ return decorator
64
+
65
+
66
+ # -----------------------------
67
+ # Time helpers
68
+ # -----------------------------
69
+ def to_rfc3339(dt: datetime) -> str:
70
+ """
71
+ Convert a datetime to an RFC 3339–formatted string.
72
+ If no tzinfo is present, UTC is assumed.
73
+ """
74
+ if dt.tzinfo is None:
75
+ dt = dt.replace(tzinfo=timezone.utc)
76
+ return dt.isoformat()
77
+
78
+
79
+ def parse_rfc3339(timestamp: str) -> datetime:
80
+ """
81
+ Parse an RFC 3339–formatted string into a datetime.
82
+ """
83
+ return datetime.fromisoformat(timestamp)
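A small sketch of the backoff decorator applied to a deliberately flaky function; RuntimeError stands in for the transient API errors it is meant to absorb.

import random
from alpaca_api.utils import backoff, logger

@backoff(max_retries=3, base_delay=0.5, factor=2, exceptions=(RuntimeError,))
def flaky_call() -> str:
    # Fails most of the time so the retry warnings show up in the log output.
    if random.random() < 0.7:
        raise RuntimeError("transient upstream error")
    return "ok"

logger.info(flaky_call())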
src/fetchers/coindesk_client/asset_metadata.py ADDED
@@ -0,0 +1,26 @@
1
+ """
2
+ asset_metadata.py – Asset metadata endpoints for CoinDesk API client.
3
+
4
+ - list_assets(): List all supported assets with basic metadata.
5
+ - get_asset_details(symbol): Fetch detailed metadata for a specific asset.
6
+ """
7
+
8
+ from client import BaseClient
9
+
10
+ class AssetMetadataClient(BaseClient):
11
+ def list_assets(self):
12
+ """
13
+ Get a list of all supported assets and their basic metadata.
14
+
15
+ :return: JSON response containing assets list.
16
+ """
17
+ return self._get("assets")
18
+
19
+ def get_asset_details(self, symbol):
20
+ """
21
+ Get detailed metadata for a specific asset.
22
+
23
+ :param symbol: Asset symbol, e.g., "BTC" or "ETH".
24
+ :return: JSON response with asset details.
25
+ """
26
+ return self._get(f"assets/{symbol}")
src/fetchers/coindesk_client/client.py ADDED
@@ -0,0 +1,218 @@
1
+ """
2
+ client.py – Base HTTP client for CoinDesk API.
3
+
4
+ This module provides the BaseClient class that handles HTTP requests
5
+ to the CoinDesk API with proper authentication and error handling.
6
+ """
7
+
8
+ import requests
9
+ import json
10
+ from typing import Dict, Any, Optional
11
+ from urllib.parse import urljoin, urlencode
12
+ import config
13
+
14
+
15
+ class APIError(Exception):
16
+ """Custom exception for API errors."""
17
+ def __init__(self, message: str, status_code: int = None, response: Any = None):
18
+ self.message = message
19
+ self.status_code = status_code
20
+ self.response = response
21
+ super().__init__(self.message)
22
+
23
+
24
+ class BaseClient:
25
+ """
26
+ Base HTTP client for CoinDesk API requests.
27
+
28
+ Handles authentication, request formatting, and error handling.
29
+ """
30
+
31
+ def __init__(self, base_url: str = None, headers: Dict[str, str] = None):
32
+ """
33
+ Initialize the base client.
34
+
35
+ Args:
36
+ base_url: Base URL for the API (defaults to config.BASE_URL)
37
+ headers: Default headers (defaults to config.HEADERS)
38
+ """
39
+ self.base_url = base_url or config.BASE_URL
40
+ self.headers = headers or config.HEADERS.copy()
41
+ self.session = requests.Session()
42
+ self.session.headers.update(self.headers)
43
+
44
+ def _make_request(self, method: str, endpoint: str, params: Dict[str, Any] = None,
45
+ data: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]:
46
+ """
47
+ Make an HTTP request to the API.
48
+
49
+ Args:
50
+ method: HTTP method (GET, POST, PUT, DELETE)
51
+ endpoint: API endpoint path
52
+ params: URL parameters
53
+ data: Request body data
54
+ **kwargs: Additional arguments for requests
55
+
56
+ Returns:
57
+ dict: JSON response from the API
58
+
59
+ Raises:
60
+ APIError: If the request fails or returns an error status
61
+ """
62
+ # Construct full URL
63
+ url = urljoin(self.base_url, endpoint.lstrip('/'))
64
+
65
+ # Clean up parameters (remove None values)
66
+ if params:
67
+ params = {k: v for k, v in params.items() if v is not None}
68
+
69
+ try:
70
+ # Make the request
71
+ response = self.session.request(
72
+ method=method,
73
+ url=url,
74
+ params=params,
75
+ json=data,
76
+ **kwargs
77
+ )
78
+
79
+ # Log the request for debugging
80
+ print(f"[DEBUG] {method} {url}")
81
+ if params:
82
+ print(f"[DEBUG] Params: {params}")
83
+ print(f"[DEBUG] Status: {response.status_code}")
84
+
85
+ # Check if request was successful
86
+ if response.status_code == 200:
87
+ try:
88
+ return response.json()
89
+ except json.JSONDecodeError:
90
+ # If response is not JSON, return the text
91
+ return {"data": response.text, "status": "success"}
92
+ else:
93
+ # Handle different error status codes
94
+ error_message = f"API request failed with status {response.status_code}"
95
+
96
+ try:
97
+ error_data = response.json()
98
+ if 'error' in error_data:
99
+ error_message = error_data['error']
100
+ elif 'message' in error_data:
101
+ error_message = error_data['message']
102
+ except json.JSONDecodeError:
103
+ error_message = f"{error_message}: {response.text}"
104
+
105
+ raise APIError(
106
+ message=error_message,
107
+ status_code=response.status_code,
108
+ response=response
109
+ )
110
+
111
+ except requests.exceptions.RequestException as e:
112
+ raise APIError(f"Request failed: {str(e)}")
113
+
114
+ def get(self, endpoint: str, params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]:
115
+ """
116
+ Make a GET request.
117
+
118
+ Args:
119
+ endpoint: API endpoint path
120
+ params: URL parameters
121
+ **kwargs: Additional arguments for requests
122
+
123
+ Returns:
124
+ dict: JSON response from the API
125
+ """
126
+ return self._make_request('GET', endpoint, params=params, **kwargs)
127
+
128
+ def post(self, endpoint: str, data: Dict[str, Any] = None,
129
+ params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]:
130
+ """
131
+ Make a POST request.
132
+
133
+ Args:
134
+ endpoint: API endpoint path
135
+ data: Request body data
136
+ params: URL parameters
137
+ **kwargs: Additional arguments for requests
138
+
139
+ Returns:
140
+ dict: JSON response from the API
141
+ """
142
+ return self._make_request('POST', endpoint, params=params, data=data, **kwargs)
143
+
144
+ def put(self, endpoint: str, data: Dict[str, Any] = None,
145
+ params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]:
146
+ """
147
+ Make a PUT request.
148
+
149
+ Args:
150
+ endpoint: API endpoint path
151
+ data: Request body data
152
+ params: URL parameters
153
+ **kwargs: Additional arguments for requests
154
+
155
+ Returns:
156
+ dict: JSON response from the API
157
+ """
158
+ return self._make_request('PUT', endpoint, params=params, data=data, **kwargs)
159
+
160
+ def delete(self, endpoint: str, params: Dict[str, Any] = None, **kwargs) -> Dict[str, Any]:
161
+ """
162
+ Make a DELETE request.
163
+
164
+ Args:
165
+ endpoint: API endpoint path
166
+ params: URL parameters
167
+ **kwargs: Additional arguments for requests
168
+
169
+ Returns:
170
+ dict: JSON response from the API
171
+ """
172
+ return self._make_request('DELETE', endpoint, params=params, **kwargs)
173
+
174
+ def close(self):
175
+ """Close the HTTP session."""
176
+ self.session.close()
177
+
178
+ def __enter__(self):
179
+ """Context manager entry."""
180
+ return self
181
+
182
+ def __exit__(self, exc_type, exc_val, exc_tb):
183
+ """Context manager exit."""
184
+ self.close()
185
+
186
+
187
+ # Convenience function to create a client instance
188
+ def create_client(base_url: str = None, headers: Dict[str, str] = None) -> BaseClient:
189
+ """
190
+ Create a new BaseClient instance.
191
+
192
+ Args:
193
+ base_url: Base URL for the API
194
+ headers: Default headers
195
+
196
+ Returns:
197
+ BaseClient: Configured client instance
198
+ """
199
+ return BaseClient(base_url=base_url, headers=headers)
200
+
201
+
202
+ # Test function to verify the client works
203
+ def test_client():
204
+ """Test the base client functionality."""
205
+ try:
206
+ with create_client() as client:
207
+ # Test a simple endpoint (you might need to adjust this based on your API)
208
+ response = client.get("/index/cc/v1/markets")
209
+ print("Client test successful!")
210
+ print(f"Response keys: {list(response.keys()) if isinstance(response, dict) else 'Not a dict'}")
211
+ return True
212
+ except Exception as e:
213
+ print(f"Client test failed: {e}")
214
+ return False
215
+
216
+
217
+ if __name__ == "__main__":
218
+ test_client()
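A usage sketch for BaseClient, assuming COINDESK_API_KEY is configured and the script is run from the coindesk_client directory (the module uses flat "import config"); the endpoint and parameters are borrowed from the examples in d.txt later in this commit.

from client import BaseClient, APIError

try:
    with BaseClient() as client:
        tick = client.get(
            "/index/cc/v1/latest/tick",
            params={"market": "cadli", "instruments": "BTC-USD,ETH-USD"},
        )
        print(list(tick.keys()))
except APIError as exc:
    print(f"CoinDesk request failed: {exc.message} (status={exc.status_code})")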
src/fetchers/coindesk_client/coindesk_utils.py ADDED
@@ -0,0 +1,49 @@
1
+ """
2
+ coindesk_utils.py – Utilities for saving, merging, and managing CoinDesk data as Parquet using StorageHandler.
3
+
4
+ Features:
5
+ - save_and_merge_parquet: Save new data, merge with existing Parquet, dedupe by date, keep N days.
6
+ """
7
+ from io import BytesIO
8
+ import pandas as pd
9
+ from datetime import datetime, timedelta
10
+ from src.data_cloud.cloud_utils import StorageHandler
11
+
12
+
13
+ def save_and_merge_parquet(
14
+ storage: StorageHandler,
15
+ key: str,
16
+ new_data: pd.DataFrame,
17
+ date_col: str = "timestamp",
18
+ days: int = 7,
19
+ content_type: str = "application/octet-stream",
20
+ ):
21
+ """
22
+ Save new_data as Parquet, merging with existing file by date_col, keeping only the last N days.
23
+ - storage: StorageHandler instance
24
+ - key: storage key (e.g., 'coindesk/spot_markets.parquet')
25
+ - new_data: DataFrame to save
26
+ - date_col: column to use for date filtering (must be datetime-like)
27
+ - days: keep only this many days of data
28
+ - content_type: MIME type for Parquet
29
+ """
30
+ # Try to load existing data
31
+ try:
32
+ existing_bytes = storage.download(key)
33
+ df_old = pd.read_parquet(BytesIO(existing_bytes))
34
+ except Exception:
35
+ df_old = pd.DataFrame()
36
+
37
+ # Combine and dedupe
38
+ df_all = pd.concat([df_old, new_data], ignore_index=True)
39
+ if date_col in df_all.columns:
40
+ df_all[date_col] = pd.to_datetime(df_all[date_col], errors="coerce")
41
+ cutoff = datetime.utcnow() - timedelta(days=days)
42
+ df_all = df_all[df_all[date_col] >= cutoff]
43
+ df_all = df_all.sort_values(date_col).drop_duplicates()
44
+
45
+ # Save merged Parquet
46
+ buf = BytesIO()
47
+ df_all.to_parquet(buf, index=False)
48
+ storage.upload(key, buf.getvalue(), content_type=content_type)
49
+ return df_all
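A sketch of save_and_merge_parquet, run from the repository root and assuming a StorageHandler that can be constructed without arguments (its definition lives in src/data_cloud/cloud_utils.py elsewhere in this commit); the row values are made up.

import pandas as pd
from datetime import datetime, timezone
from src.data_cloud.cloud_utils import StorageHandler
from src.fetchers.coindesk_client.coindesk_utils import save_and_merge_parquet

storage = StorageHandler()  # assumed: picks up bucket credentials from the environment
new_rows = pd.DataFrame({
    "timestamp": [datetime.now(timezone.utc)],
    "instrument": ["BTC-USD"],
    "close": [65000.0],
})
merged = save_and_merge_parquet(
    storage,
    key="coindesk/spot_markets.parquet",
    new_data=new_rows,
    date_col="timestamp",
    days=7,
)
print(len(merged))  # merged frame keeps only the last 7 days, deduplicated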
src/fetchers/coindesk_client/config.py ADDED
@@ -0,0 +1,30 @@
1
+ """
2
+ config.py – Configuration and secrets for CoinDesk API client.
3
+
4
+ - Defines API_KEY, BASE_URL, and optional TIMEZONE constants
5
+ - Loads environment variables securely (e.g., via python-dotenv)
6
+ - Configures default headers (Authorization, Content-Type)
7
+ """
8
+
9
+ import os
10
+ from dotenv import load_dotenv
11
+
12
+ load_dotenv()
13
+
14
+ API_KEY = os.getenv("COINDESK_API_KEY")
15
+ BASE_URL = os.getenv("COINDESK_BASE_URL", "https://data-api.coindesk.com/").rstrip('/')
16
+ TIMEZONE = os.getenv("COINDESK_TIMEZONE", "UTC")
17
+
18
+ # Flexible parameters for data collection
19
+ MARKET = os.getenv("COINDESK_MARKET", "binance")
20
+ SYMBOL = os.getenv("COINDESK_SYMBOL", "BTC-USD")
21
+ INSTRUMENTS = os.getenv("COINDESK_INSTRUMENTS", "BTC-USD").split(",")
22
+ DAYS = int(os.getenv("COINDESK_DAYS_OLD", 7))
23
+ FUTURES_LIMIT = int(os.getenv("COINDESK_FUTURES_LIMIT", 50))
24
+ SENTIMENT_LIMIT = int(os.getenv("COINDESK_SENTIMENT_LIMIT", 50))
25
+ BLOCK_NUMBER = int(os.getenv("COINDESK_BLOCK_NUMBER", 100000))
26
+
27
+ HEADERS = {
28
+ "Authorization": f"Bearer {API_KEY}",
29
+ "Content-Type": "application/json"
30
+ }
src/fetchers/coindesk_client/d.txt ADDED
@@ -0,0 +1,12 @@
1
+ Latest Tick:/index/cc/v1/latest/tick?market=cadli&instruments=BTC-USD,ETH-USD&apply_mapping=true
2
+ Historical OHLCV+:/index/cc/v1/historical/days?market=cadli&instrument=BTC-USD&limit=30&aggregate=1&fill=true&apply_mapping=true&response_format=JSON
3
+ DA Fixings:/index/cc/v1/historical/days/ccda?instrument=BTC-USD&timezone=Europe/London&date=2023-10-30&close_time=16:00&limit=5&response_format=JSON
4
+ Index Updates:/index/cc/v2/historical/messages/hour?market=cadli&instrument=BTC-USD&hour_ts=1701176400&apply_mapping=true&response_format=JSON
5
+ Index Composition:/index/cc/v1/historical/days/composition?market=cd_mc&instrument=CD20-USD&timezone=Europe/London&date=2025-05-09&close_time=16:00&limit=5&response_format=JSON
6
+ Instrument Metadata:/index/cc/v1/latest/instrument/metadata?market=cadli&instruments=BTC-USD,ETH-USD&apply_mapping=true
7
+ Markets:/index/cc/v1/markets?market=cadli
8
+ Markets + Instruments:/index/cc/v1/markets/instruments?market=cadli&instruments=BTC-USD,ETH-USD&instrument_status=ACTIVE
9
+ Forex Rates: /index/cc/v1/latest/tick/forex?instruments=GBP-USD,MYR-USD
10
+ EOD Markets + Instruments: /index/cc/v1/markets/instruments/unmapped/eod?market=cdifti&instruments=BTIUSF-USD&instrument_status=ACTIVE
11
+ EOD Historical OHLCV+ Day:/index/cc/v1/historical/days/eod?market=cdifti&instrument=BTIUSF-USD&limit=5&response_format=JSON
12
+ Index Reconstitution: /index/cc/v1/reconstitution?market=cd_mc&instrument=CD20-USD
src/fetchers/coindesk_client/derivatives.py ADDED
@@ -0,0 +1,68 @@
1
+ """
2
+ derivatives.py – Derivatives endpoints for CoinDesk API client.
3
+
4
+ - list_markets(): List all available derivatives markets.
5
+ - get_latest_futures(symbol=None): Fetch the latest futures data, optionally for a symbol.
6
+ - get_futures_historical(days, limit=None): Retrieve futures historical data over N days.
7
+ - list_options(symbol=None): List available options or option chain for a given asset.
8
+ - get_options_historical(symbol, start, end=None, limit=None): Fetch options historical data over a timeframe.
9
+ """
10
+
11
+ from client import BaseClient
12
+
13
+ class DerivativesClient(BaseClient):
14
+ def list_markets(self):
15
+ """
16
+ List all available derivatives markets.
17
+ """
18
+ return self._get("derivatives/markets")
19
+
20
+ def get_latest_futures(self, symbol=None):
21
+ """
22
+ Get the most recent futures data. If `symbol` is provided, returns data for that symbol.
23
+
24
+ :param symbol: Futures symbol, e.g., "BTC-USD" (optional).
25
+ """
26
+ path = "derivatives/futures"
27
+ if symbol:
28
+ path += f"/{symbol}"
29
+ return self.get(path)
30
+
31
+ def get_futures_historical(self, days, limit=None):
32
+ """
33
+ Fetch historical futures data for the past `days` days.
34
+
35
+ :param days: Number of days of history to retrieve.
36
+ :param limit: Maximum number of records to return (optional).
37
+ """
38
+ params = {"days": days}
39
+ if limit is not None:
40
+ params["limit"] = limit
41
+ return self._get("derivatives/futures/historical", params=params)
42
+
43
+ def list_options(self, symbol=None):
44
+ """
45
+ List all available options or get the option chain for a symbol.
46
+
47
+ :param symbol: Asset symbol for option chain, e.g., "BTC-USD" (optional).
48
+ """
49
+ path = "derivatives/options"
50
+ if symbol:
51
+ path += f"/{symbol}"
52
+ return self.get(path)
53
+
54
+ def get_options_historical(self, symbol, start, end=None, limit=None):
55
+ """
56
+ Fetch historical options data for a symbol over a timeframe.
57
+
58
+ :param symbol: Asset symbol, e.g., "BTC-USD".
59
+ :param start: ISO8601 start datetime string.
60
+ :param end: ISO8601 end datetime string (optional).
61
+ :param limit: Maximum number of records to return (optional).
62
+ """
63
+ params = {"start": start}
64
+ if end:
65
+ params["end"] = end
66
+ if limit is not None:
67
+ params["limit"] = limit
68
+ return self._get(f"derivatives/options/{symbol}/historical", params=params)