|
|
|
""" |
|
Simple monitoring script to track service health and resource usage |
|
""" |
|
import os |
|
import time |
|
import psutil |
|
import json |
|
from datetime import datetime |
|
|
|
from src import config as app_config |
|
|
|
def get_system_stats():
    """Collect a resource-usage snapshot for this process and its data disk.

    ``psutil.Process()`` is created without a PID, so the memory, CPU and
    thread figures describe the process running this script.  Disk figures
    come from ``app_config.DATA_DIR`` when that path exists, otherwise
    from ``/``.

    Returns:
        dict: On success, the keys ``timestamp`` (ISO-8601 string from
        ``datetime.now()``), ``memory_mb``, ``cpu_percent``,
        ``disk_free_gb``, ``disk_used_percent`` (each rounded to two
        decimals), ``num_threads`` and ``pid``.  If any step raises an
        ``Exception``, a dict holding only ``timestamp`` and ``error``
        (the exception text) is returned instead.
    """
    try:
        proc = psutil.Process()

        # Resident set size, converted from bytes to megabytes.
        rss_bytes = proc.memory_info().rss
        rss_mb = rss_bytes / 1024 / 1024

        # A positive interval makes psutil sample over that many seconds,
        # so this call blocks for about one second.
        cpu = proc.cpu_percent(interval=1)

        # Measure the data directory's filesystem when it exists; fall
        # back to the root filesystem otherwise.
        if os.path.exists(app_config.DATA_DIR):
            mount_point = app_config.DATA_DIR
        else:
            mount_point = '/'
        disk = psutil.disk_usage(mount_point)
        free_gb = disk.free / (1024**3)
        used_pct = (disk.used / disk.total) * 100

        thread_count = proc.num_threads()

        snapshot = {
            "timestamp": datetime.now().isoformat(),
            "memory_mb": round(rss_mb, 2),
            "cpu_percent": round(cpu, 2),
            "disk_free_gb": round(free_gb, 2),
            "disk_used_percent": round(used_pct, 2),
            "num_threads": thread_count,
            "pid": proc.pid
        }
        return snapshot
    except Exception as exc:
        # Best effort: report the failure as data so the caller can still
        # log a line for this sampling cycle.
        failure = {
            "timestamp": datetime.now().isoformat(),
            "error": str(exc)
        }
        return failure
|
|
|
def log_stats():
    """Record one stats snapshot and print alerts for exceeded thresholds.

    The snapshot from ``get_system_stats()`` is appended as a single JSON
    line to ``system_stats.jsonl`` inside ``app_config.LOG_DIR`` (created
    if missing) and echoed to stdout.  When the snapshot has no ``error``
    key, memory above 450 MB, CPU above 80 % and free disk below 0.5 GB
    each contribute an entry to one ``ALERTS`` line printed afterwards.
    """
    snapshot = get_system_stats()

    # Make sure the log directory exists before appending to the file.
    target_dir = app_config.LOG_DIR
    os.makedirs(target_dir, exist_ok=True)

    # One JSON object per line (JSON Lines), opened in append mode.
    with open(os.path.join(target_dir, "system_stats.jsonl"), "a") as sink:
        sink.write(json.dumps(snapshot) + "\n")

    # Echo the same snapshot to the console.
    print(f"[Monitor] {json.dumps(snapshot)}")

    # An error snapshot carries no metrics, so there is nothing to check.
    if "error" in snapshot:
        return

    mem_mb = snapshot["memory_mb"]
    cpu_pct = snapshot["cpu_percent"]
    free_gb = snapshot["disk_free_gb"]

    # (threshold exceeded?, alert text) pairs, in reporting order.
    checks = (
        (mem_mb > 450, f"HIGH MEMORY: {mem_mb:.1f}MB"),
        (cpu_pct > 80, f"HIGH CPU: {cpu_pct:.1f}%"),
        (free_gb < 0.5, f"LOW DISK: {free_gb:.1f}GB free"),
    )
    alerts = [text for triggered, text in checks if triggered]

    if alerts:
        print(f"[Monitor] ALERTS: {', '.join(alerts)}")
|
|
|
if __name__ == "__main__":
    # Script entry point: sample and log stats once a minute until the
    # user interrupts with Ctrl-C.
    print("[Monitor] Starting system monitoring...")

    # The KeyboardInterrupt handler wraps the whole loop.  Previously the
    # back-off sleep ran inside the ``except Exception`` handler, where a
    # Ctrl-C is not seen by the sibling ``except KeyboardInterrupt`` clause
    # and therefore escaped as an uncaught traceback.
    try:
        while True:
            try:
                log_stats()
            except Exception as e:
                # Keep the monitor alive on any sampling/logging failure.
                print(f"[Monitor] Error: {e}")

            # Same 60-second cadence after both success and failure.
            time.sleep(60)
    except KeyboardInterrupt:
        print("[Monitor] Monitoring stopped")