Maaroufabousaleh committed
Commit 4b5719e · Parent(s): c49b21b

Refactor Docker and entrypoint configurations; update NLTK data handling and logging paths

Files changed:
- Dockerfile +1 -1
- deployment/gradio_entrypoint.sh +39 -15
- deployment/nginx.conf +5 -14
- deployment/supervisord.conf +14 -14
- src/api/gradio_main.py +34 -16
Dockerfile
CHANGED

@@ -103,6 +103,6 @@ COPY deployment/supervisord.conf /etc/supervisord.conf
 ENTRYPOINT ["/app/deployment/gradio_entrypoint.sh"]
 
 # Ports
-EXPOSE
+EXPOSE 7860
 
 CMD ["supervisord", "-c", "/etc/supervisord.conf"]
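A note on the EXPOSE change: EXPOSE is metadata only and does not open or bind anything; the line just documents that the container now serves on 7860. A minimal in-container check that something actually listens there (a sketch; the port is the only detail carried over from the diff):

    import socket

    # Succeeds only if a server inside the container accepts connections on 7860.
    with socket.create_connection(("127.0.0.1", 7860), timeout=5):
        print("port 7860 is accepting connections")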
deployment/gradio_entrypoint.sh
CHANGED

@@ -1,27 +1,51 @@
 #!/bin/bash
-set -
+set -u
 
 echo "Starting AdvisorAI Data Pipeline with Gradio..."
 
-#
-
+# Determine writable data dir via existing Python config logic
+NLTK_DIR=$(python - <<'PY'
+import os
+try:
+    from src.config import DATA_DIR
+except Exception:
+    # fallback order
+    for p in ['/data', '/app/data', '/tmp']:
+        try:
+            os.makedirs(p, exist_ok=True)
+            test = os.path.join(p, '.wtest')
+            open(test, 'w').close(); os.remove(test)
+            DATA_DIR = p
+            break
+        except Exception:
+            continue
+    else:
+        DATA_DIR = '/tmp'
+
+nl = os.path.join(DATA_DIR, 'nltk_data')
+os.makedirs(nl, exist_ok=True)
+print(nl)
+PY
+)
 
-
-
+export NLTK_DATA="$NLTK_DIR"
+echo "NLTK_DATA set to: $NLTK_DATA"
 
-#
-python -
-import nltk
+# Best-effort NLTK downloads (do not fail on errors)
+python - <<'PY'
 import os
-os.environ
+print('Preparing NLTK into', os.environ.get('NLTK_DATA'))
 try:
-    nltk
-
-
-
+    import nltk
+    for pkg in ['punkt', 'stopwords', 'vader_lexicon']:
+        try:
+            nltk.download(pkg, download_dir=os.environ.get('NLTK_DATA'), quiet=True)
+            print('Downloaded', pkg)
+        except Exception as e:
+            print('NLTK download failed for', pkg, e)
 except Exception as e:
-    print(
-
+    print('NLTK import failed:', e)
+PY
 
 echo "Starting services..."
 exec "$@"
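Because the downloads above are best-effort, a missing package surfaces only later at first use. A quick way to verify the entrypoint's work from inside the container is to resolve the resources through the same NLTK_DATA path (a sketch; the fallback directory is illustrative, and the resource names are the standard ones for the packages listed above):

    import os
    import nltk

    # The entrypoint exports NLTK_DATA; NLTK includes it in its data search path.
    nltk_dir = os.environ.get("NLTK_DATA", "/tmp/nltk_data")  # fallback is illustrative
    if nltk_dir not in nltk.data.path:
        nltk.data.path.append(nltk_dir)

    # Probe each downloaded package; find() also checks zipped variants.
    for resource in ["tokenizers/punkt", "corpora/stopwords", "sentiment/vader_lexicon"]:
        try:
            nltk.data.find(resource)
            print("found:", resource)
        except LookupError:
            print("missing:", resource)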
deployment/nginx.conf
CHANGED

@@ -1,5 +1,6 @@
 server {
-    listen
+    # On Spaces Docker, the platform expects the app to listen on 7860
+    listen 7860;
 
     # Increase timeouts to handle long-running operations
     proxy_connect_timeout 60s;

@@ -18,20 +19,10 @@ server {
     client_body_timeout 60s;
     client_header_timeout 60s;
 
-    # -- health-check:
+    # -- health-check: serve OK directly --
     location = /health {
-
-
-        proxy_set_header X-Real-IP $remote_addr;
-        proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
-        proxy_set_header X-Forwarded-Proto $scheme;
-
-        # Shorter timeouts for health checks
-        proxy_connect_timeout 10s;
-        proxy_send_timeout 10s;
-        proxy_read_timeout 10s;
-
-        # don't log upstream body
+        default_type text/plain;
+        return 200 'OK\n';
         access_log off;
     }
 
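With /health answered directly by nginx, the endpoint now reports only that nginx itself is up, not that the Gradio upstream is healthy; the trade-off is that platform health checks stop failing while the pipeline warms up. A quick probe (assumes the server is reachable on localhost:7860, per the listen directive above):

    from urllib.request import urlopen

    # Expect HTTP 200 with body "OK" regardless of upstream state.
    with urlopen("http://localhost:7860/health", timeout=10) as resp:
        print(resp.status, resp.read().decode().strip())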
deployment/supervisord.conf
CHANGED

@@ -6,7 +6,7 @@ pidfile=/tmp/supervisord.pid
 loglevel=info
 
 [program:gradio]
-command
+command=/bin/sh -c 'PORT=${PORT:-7860} python /app/src/api/gradio_main.py'
 directory=/app
 autostart=true
 autorestart=true

@@ -14,24 +14,24 @@ stdout_logfile=/dev/stdout
 stderr_logfile=/dev/stderr
 stdout_logfile_maxbytes=0
 stderr_logfile_maxbytes=0
-startsecs=
+startsecs=5
 startretries=3
-stopwaitsecs=
+stopwaitsecs=20
 killasgroup=true
 stopasgroup=true
 environment=PYTHONPATH="/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge"
 
-[program:nginx]
-command=/usr/sbin/nginx -g 'daemon off;'
-autostart=true
-autorestart=true
-stdout_logfile=/dev/stdout
-stderr_logfile=/dev/stderr
-stdout_logfile_maxbytes=0
-stderr_logfile_maxbytes=0
-startsecs=5
-startretries=3
-stopwaitsecs=10
+; [program:nginx]
+; command=/usr/sbin/nginx -g 'daemon off;'
+; autostart=true
+; autorestart=true
+; stdout_logfile=/dev/stdout
+; stderr_logfile=/dev/stderr
+; stdout_logfile_maxbytes=0
+; stderr_logfile_maxbytes=0
+; startsecs=5
+; startretries=3
+; stopwaitsecs=10
 
 [program:scheduler]
 ; wait 180 s before first run, then your scheduler.py handles its own 30 min sleeps
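The /bin/sh -c wrapper matters here because supervisord does not pass command= through a shell, so ${PORT:-7860} would otherwise reach Python unexpanded. The same default-then-launch behavior, sketched in Python (the script path comes from the config above):

    import os
    import subprocess

    env = dict(os.environ)
    env.setdefault("PORT", "7860")  # mirrors the shell default ${PORT:-7860}
    # Launch the app as the wrapper would, inheriting the defaulted environment.
    subprocess.run(["python", "/app/src/api/gradio_main.py"], env=env, check=False)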
src/api/gradio_main.py
CHANGED

@@ -9,6 +9,18 @@ from datetime import datetime, timedelta
 import psutil
 from pathlib import Path
 
+# Internal config for paths and markers
+try:
+    from src.config import DATA_DIR, LOG_DIR, LAST_RUN_PATH
+except Exception:
+    # Fallbacks if import path differs in Spaces
+    try:
+        from config import DATA_DIR, LOG_DIR, LAST_RUN_PATH  # type: ignore
+    except Exception:
+        DATA_DIR = os.environ.get('DATA_DIR', '/data')
+        LOG_DIR = os.environ.get('LOG_DIR', os.path.join(DATA_DIR, 'logs'))
+        LAST_RUN_PATH = os.environ.get('LAST_RUN_PATH', '/tmp/last_run.txt')
+
 # Add src to Python path for imports
 sys.path.insert(0, '/app/src')
 sys.path.insert(0, '/app')

@@ -37,7 +49,7 @@ def get_health_status():
     scheduler_running = False
     last_run_time = "Unknown"
     try:
-        last_run_file =
+        last_run_file = LAST_RUN_PATH
         if os.path.exists(last_run_file):
             with open(last_run_file, 'r') as f:
                 last_run_str = f.read().strip()

@@ -71,10 +83,10 @@ def get_pipeline_status():
     """Get data pipeline status"""
     try:
         data_dirs = [
-
-
-
-
+            os.path.join(DATA_DIR, 'merged', 'features'),
+            os.path.join(DATA_DIR, 'merged', 'train'),
+            os.path.join(DATA_DIR, 'alpaca'),
+            os.path.join(DATA_DIR, 'advisorai-data'),
         ]
 
         recent_files = 0

@@ -113,10 +125,10 @@ def get_recent_files():
     """Get list of recent files in the data directories"""
     try:
         base_paths = [
-
-
-
-
+            os.path.join(DATA_DIR, 'merged', 'features'),
+            os.path.join(DATA_DIR, 'merged', 'train'),
+            os.path.join(DATA_DIR, 'alpaca'),
+            os.path.join(DATA_DIR, 'advisorai-data', 'features'),
         ]
 
         recent_files = []

@@ -129,7 +141,7 @@ def get_recent_files():
                     stat = os.stat(file_path)
                     recent_files.append({
                         "File": file,
-                        "Path": file_path.replace(
+                        "Path": file_path.replace(DATA_DIR.rstrip('/') + '/', ""),
                         "Size": f"{stat.st_size / (1024**2):.2f} MB",
                         "Modified": datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M")
                     })

@@ -215,14 +227,19 @@ with gr.Blocks(title="AdvisorAI Data Pipeline Monitor", theme=gr.themes.Soft())
     def refresh_dashboard():
         health = get_health_status()
         pipeline = get_pipeline_status()
-
+        # JSON components accept dicts directly
+        return health, pipeline
 
     def refresh_files():
         files = get_recent_files()
-        if
-        return [
-
-
+        if not files:
+            return []
+        if isinstance(files, list) and isinstance(files[0], dict) and "Error" not in files[0]:
+            rows = []
+            for f in files:
+                rows.append([f.get("File",""), f.get("Path",""), f.get("Size",""), f.get("Modified","")])
+            return rows
+        return [["Error", str(files), "", ""]]
 
     def refresh_logs():
         return get_logs()

@@ -256,9 +273,10 @@ with gr.Blocks(title="AdvisorAI Data Pipeline Monitor", theme=gr.themes.Soft())
 
 if __name__ == "__main__":
     logger.info("Starting Gradio app...")
+    port = int(os.environ.get("PORT", "7860"))
     app.launch(
         server_name="0.0.0.0",
-        server_port=
+        server_port=port,
         share=False,
         show_error=True,
         quiet=False
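refresh_files() now returns plain rows instead of a list of dicts, which is the shape a gr.Dataframe output expects. A minimal sketch of that wiring (component names and the sample row are illustrative, not taken from the commit):

    import os
    import gradio as gr

    def refresh_files():
        # Stand-in for the real implementation; returns the same row shape.
        return [["sample.parquet", "merged/features/sample.parquet", "1.23 MB", "2025-01-01 00:00"]]

    with gr.Blocks() as demo:
        table = gr.Dataframe(headers=["File", "Path", "Size", "Modified"])
        gr.Button("Refresh").click(fn=refresh_files, outputs=table)

    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", "7860")))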