File size: 5,603 Bytes
c49b21b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
import os
import time
import subprocess
import sys
import threading
import asyncio
from dotenv import load_dotenv
import httpx
import os
from src import config as app_config
# -----------------------------------------------------------------------------
# LOCATE YOUR DATA-PIPELINE SCRIPT
# -----------------------------------------------------------------------------
# Resolve the pipeline entry point (../src/main.py relative to this file) once,
# instead of building the identical abspath expression twice.
_candidate = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "src", "main.py"))
if os.path.exists(_candidate):
    PIPELINE_PATH = _candidate                      # absolute path to src/main.py
    PIPELINE_DIR = os.path.dirname(PIPELINE_PATH)   # its containing directory
else:
    # Fail fast at import time: the scheduler is useless without the pipeline.
    raise FileNotFoundError("src/main.py not found!")
# -----------------------------------------------------------------------------
# CONFIGURATION (via ENV)
# -----------------------------------------------------------------------------
load_dotenv()
# URL to ping every N seconds (default 300s = 5min)
def _parse_int_env(name: str, default_val: int) -> int:
raw = os.getenv(name, str(default_val))
if isinstance(raw, str):
# Strip inline comments and whitespace, e.g. "3600 # every hour"
cleaned = raw.split('#', 1)[0].strip()
if cleaned == "":
return int(default_val)
try:
return int(cleaned)
except Exception:
print(f"[Scheduler] Warning: {name}='{raw}' is not a valid int. Using default {default_val}.")
return int(default_val)
try:
return int(raw)
except Exception:
return int(default_val)
# Comma-separated list of health endpoints to keep alive (split in ping_remote).
TRIGGER_HEALTH_URL = os.getenv(
    "TRIGGER_HEALTH_URL",
    "https://advisor-trigger-ki3t.onrender.com/health, https://advisorai-data-1ew2.onrender.com/health"
)
# Seconds between successful health pings (default 300 s = 5 min).
PING_INTERVAL = _parse_int_env("TRIGGER_PING_INTERVAL", 300)
# Seconds between pipeline runs (default 3600 s = 1 hour).
PIPELINE_INTERVAL = _parse_int_env("PIPELINE_INTERVAL", 3600)
# -----------------------------------------------------------------------------
# ASYNC PINGER WITH EXPONENTIAL BACKOFF
# -----------------------------------------------------------------------------
async def ping_remote():
    """
    Continuously GET each URL in TRIGGER_HEALTH_URL (comma-separated) every
    PING_INTERVAL seconds, backing off exponentially on failure (doubling,
    capped at 150 s = 2.5 minutes).  Runs forever; never returns.
    """
    urls = [u.strip() for u in TRIGGER_HEALTH_URL.split(",") if u.strip()]
    # First-retry delay: small and quick, but never longer than the normal
    # ping interval in case PING_INTERVAL itself is tiny.
    initial_backoff = min(PING_INTERVAL, 5)
    backoff = initial_backoff
    async with httpx.AsyncClient(timeout=10.0) as client:
        while True:
            all_success = True
            for url in urls:
                try:
                    resp = await client.get(url)
                    resp.raise_for_status()  # count HTTP 4xx/5xx as failures too
                    print(f"[Pinger] {url} -> {resp.status_code}")
                except Exception as e:
                    print(f"[Pinger] error pinging {url}: {e}")
                    all_success = False
            if all_success:
                # BUG FIX: the original reset backoff to PING_INTERVAL here,
                # so the first sleep after a fresh failure could be a full
                # PING_INTERVAL (e.g. 300 s) — beyond the documented 2.5-min
                # cap.  Reset to the small initial delay instead.
                backoff = initial_backoff
                await asyncio.sleep(PING_INTERVAL)
            else:
                await asyncio.sleep(backoff)
                backoff = min(backoff * 2, 150)
def start_async_ping():
    """
    Run ping_remote() forever on a fresh asyncio event loop.

    Intended to be the target of a daemon thread: creates its own loop,
    installs it as the thread's current loop, schedules the pinger, and
    blocks in run_forever().
    """
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    event_loop.create_task(ping_remote())
    event_loop.run_forever()
# launch the ping loop in the background
# Daemon thread: dies automatically with the main process, so the infinite
# run_forever() loop never blocks interpreter shutdown.
threading.Thread(target=start_async_ping, daemon=True).start()
print("[Scheduler] Started background ping thread")
# -----------------------------------------------------------------------------
# MAIN PIPELINE LOOP (runs every PIPELINE_INTERVAL seconds; default 1 hour)
# -----------------------------------------------------------------------------
# Hoisted out of the loop body: re-importing on every iteration was harmless
# (imports are cached) but pointless noise.
import traceback
from datetime import datetime

while True:
    last_run = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"[Scheduler] Running pipeline... Last run: {last_run}")
    # Write last_run to file for API access.  Best-effort: a failed timestamp
    # write must never stop the scheduling loop.
    try:
        with open(app_config.LAST_RUN_PATH, 'w') as f:
            f.write(last_run)
    except Exception as e:
        print(f"[Scheduler] Failed to write last_run.txt: {e}")
    try:
        # Set working directory to project root (parent of deployment)
        project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
        print(f"[Scheduler] Project root: {project_root}")
        print(f"[Scheduler] Pipeline path: {PIPELINE_PATH}")
        # Run from '/' so relative 'data/...' writes resolve to '/data/...'
        result = subprocess.run(
            [sys.executable, PIPELINE_PATH],
            cwd='/',
            capture_output=True,
            text=True,
            env=os.environ.copy()
        )
        print(f"[Scheduler] Pipeline finished with code {result.returncode}")
        if result.stdout:
            print("[Scheduler] STDOUT:\n", result.stdout)
        if result.stderr:
            print("[Scheduler] STDERR:\n", result.stderr)
        # Raise an exception if the return code is non-zero
        if result.returncode != 0:
            raise subprocess.CalledProcessError(result.returncode, result.args, result.stdout, result.stderr)
    except subprocess.CalledProcessError as e:
        # Pipeline ran but exited non-zero: report and keep scheduling.
        print(f"[Scheduler] Pipeline execution failed with return code {e.returncode}")
        print(f"[Scheduler] STDOUT:\n{e.stdout}")
        print(f"[Scheduler] STDERR:\n{e.stderr}")
    except Exception as e:
        # Anything else (e.g. subprocess launch failure): log with traceback
        # and keep the loop alive.
        print(f"[Scheduler] Exception running pipeline: {e}")
        print(traceback.format_exc())
    print(f"[Scheduler] Sleeping for {PIPELINE_INTERVAL // 60} minutes...")
    time.sleep(PIPELINE_INTERVAL)
|