Maaroufabousaleh committed on
Commit
4b5719e
·
1 Parent(s): c49b21b

Refactor Docker and entrypoint configurations; update NLTK data handling and logging paths

Browse files
Dockerfile CHANGED
@@ -103,6 +103,6 @@ COPY deployment/supervisord.conf /etc/supervisord.conf
103
  ENTRYPOINT ["/app/deployment/gradio_entrypoint.sh"]
104
 
105
  # Ports
106
- EXPOSE 80 7860
107
 
108
  CMD ["supervisord", "-c", "/etc/supervisord.conf"]
 
103
  ENTRYPOINT ["/app/deployment/gradio_entrypoint.sh"]
104
 
105
  # Ports
106
+ EXPOSE 7860
107
 
108
  CMD ["supervisord", "-c", "/etc/supervisord.conf"]
deployment/gradio_entrypoint.sh CHANGED
@@ -1,27 +1,51 @@
1
  #!/bin/bash
2
- set -e
3
 
4
  echo "Starting AdvisorAI Data Pipeline with Gradio..."
5
 
6
- # Create necessary directories
7
- mkdir -p /data/logs /data/nltk_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- # Set proper permissions
10
- chmod -R 777 /data
11
 
12
- # Download NLTK data if needed
13
- python -c "
14
- import nltk
15
  import os
16
- os.environ['NLTK_DATA'] = '/data/nltk_data'
17
  try:
18
- nltk.download('punkt', download_dir='/data/nltk_data', quiet=True)
19
- nltk.download('stopwords', download_dir='/data/nltk_data', quiet=True)
20
- nltk.download('vader_lexicon', download_dir='/data/nltk_data', quiet=True)
21
- print('NLTK data downloaded successfully')
 
 
 
22
  except Exception as e:
23
- print(f'NLTK download failed: {e}')
24
- "
25
 
26
  echo "Starting services..."
27
  exec "$@"
 
1
  #!/bin/bash
2
+ set -u
3
 
4
  echo "Starting AdvisorAI Data Pipeline with Gradio..."
5
 
6
+ # Determine writable data dir via existing Python config logic
7
+ NLTK_DIR=$(python - <<'PY'
8
+ import os
9
+ try:
10
+ from src.config import DATA_DIR
11
+ except Exception:
12
+ # fallback order
13
+ for p in ['/data', '/app/data', '/tmp']:
14
+ try:
15
+ os.makedirs(p, exist_ok=True)
16
+ test = os.path.join(p, '.wtest')
17
+ open(test,'w').close(); os.remove(test)
18
+ DATA_DIR = p
19
+ break
20
+ except Exception:
21
+ continue
22
+ else:
23
+ DATA_DIR = '/tmp'
24
+
25
+ nl = os.path.join(DATA_DIR, 'nltk_data')
26
+ os.makedirs(nl, exist_ok=True)
27
+ print(nl)
28
+ PY
29
+ )
30
 
31
+ export NLTK_DATA="$NLTK_DIR"
32
+ echo "NLTK_DATA set to: $NLTK_DATA"
33
 
34
+ # Best-effort NLTK downloads (do not fail on errors)
35
+ python - <<'PY'
 
36
  import os
37
+ print('Preparing NLTK into', os.environ.get('NLTK_DATA'))
38
  try:
39
+ import nltk
40
+ for pkg in ['punkt', 'stopwords', 'vader_lexicon']:
41
+ try:
42
+ nltk.download(pkg, download_dir=os.environ.get('NLTK_DATA'), quiet=True)
43
+ print('Downloaded', pkg)
44
+ except Exception as e:
45
+ print('NLTK download failed for', pkg, e)
46
  except Exception as e:
47
+ print('NLTK import failed:', e)
48
+ PY
49
 
50
  echo "Starting services..."
51
  exec "$@"
deployment/nginx.conf CHANGED
@@ -1,5 +1,6 @@
1
  server {
2
- listen 80;
 
3
 
4
  # Increase timeouts to handle long-running operations
5
  proxy_connect_timeout 60s;
@@ -18,20 +19,10 @@ server {
18
  client_body_timeout 60s;
19
  client_header_timeout 60s;
20
 
21
- # -- health-check: proxy to gradio app --
22
  location = /health {
23
- proxy_pass http://127.0.0.1:7860/;
24
- proxy_set_header Host $host;
25
- proxy_set_header X-Real-IP $remote_addr;
26
- proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
27
- proxy_set_header X-Forwarded-Proto $scheme;
28
-
29
- # Shorter timeouts for health checks
30
- proxy_connect_timeout 10s;
31
- proxy_send_timeout 10s;
32
- proxy_read_timeout 10s;
33
-
34
- # don't log upstream body
35
  access_log off;
36
  }
37
 
 
1
  server {
2
+ # On Spaces Docker, the platform expects the app to listen on 7860
3
+ listen 7860;
4
 
5
  # Increase timeouts to handle long-running operations
6
  proxy_connect_timeout 60s;
 
19
  client_body_timeout 60s;
20
  client_header_timeout 60s;
21
 
22
+ # -- health-check: serve OK directly --
23
  location = /health {
24
+ default_type text/plain;
25
+ return 200 'OK\n';
 
 
 
 
 
 
 
 
 
 
26
  access_log off;
27
  }
28
 
deployment/supervisord.conf CHANGED
@@ -6,7 +6,7 @@ pidfile=/tmp/supervisord.pid
6
  loglevel=info
7
 
8
  [program:gradio]
9
- command=python /app/src/api/gradio_main.py
10
  directory=/app
11
  autostart=true
12
  autorestart=true
@@ -14,24 +14,24 @@ stdout_logfile=/dev/stdout
14
  stderr_logfile=/dev/stderr
15
  stdout_logfile_maxbytes=0
16
  stderr_logfile_maxbytes=0
17
- startsecs=10
18
  startretries=3
19
- stopwaitsecs=30
20
  killasgroup=true
21
  stopasgroup=true
22
  environment=PYTHONPATH="/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge"
23
 
24
- [program:nginx]
25
- command=/usr/sbin/nginx -g 'daemon off;'
26
- autostart=true
27
- autorestart=true
28
- stdout_logfile=/dev/stdout
29
- stderr_logfile=/dev/stderr
30
- stdout_logfile_maxbytes=0
31
- stderr_logfile_maxbytes=0
32
- startsecs=5
33
- startretries=3
34
- stopwaitsecs=10
35
 
36
  [program:scheduler]
37
  ; wait 180 s before first run, then your scheduler.py handles its own 30 min sleeps
 
6
  loglevel=info
7
 
8
  [program:gradio]
9
+ command=/bin/sh -c 'PORT=${PORT:-7860} python /app/src/api/gradio_main.py'
10
  directory=/app
11
  autostart=true
12
  autorestart=true
 
14
  stderr_logfile=/dev/stderr
15
  stdout_logfile_maxbytes=0
16
  stderr_logfile_maxbytes=0
17
+ startsecs=5
18
  startretries=3
19
+ stopwaitsecs=20
20
  killasgroup=true
21
  stopasgroup=true
22
  environment=PYTHONPATH="/app:/app/src:/app/src/api:/app/src/data_cloud:/app/src/fetchers:/app/src/merge"
23
 
24
+ ; [program:nginx]
25
+ ; command=/usr/sbin/nginx -g 'daemon off;'
26
+ ; autostart=true
27
+ ; autorestart=true
28
+ ; stdout_logfile=/dev/stdout
29
+ ; stderr_logfile=/dev/stderr
30
+ ; stdout_logfile_maxbytes=0
31
+ ; stderr_logfile_maxbytes=0
32
+ ; startsecs=5
33
+ ; startretries=3
34
+ ; stopwaitsecs=10
35
 
36
  [program:scheduler]
37
  ; wait 180 s before first run, then your scheduler.py handles its own 30 min sleeps
src/api/gradio_main.py CHANGED
@@ -9,6 +9,18 @@ from datetime import datetime, timedelta
9
  import psutil
10
  from pathlib import Path
11
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  # Add src to Python path for imports
13
  sys.path.insert(0, '/app/src')
14
  sys.path.insert(0, '/app')
@@ -37,7 +49,7 @@ def get_health_status():
37
  scheduler_running = False
38
  last_run_time = "Unknown"
39
  try:
40
- last_run_file = "/app/deployment/last_run.txt"
41
  if os.path.exists(last_run_file):
42
  with open(last_run_file, 'r') as f:
43
  last_run_str = f.read().strip()
@@ -71,10 +83,10 @@ def get_pipeline_status():
71
  """Get data pipeline status"""
72
  try:
73
  data_dirs = [
74
- "/data/merged/features",
75
- "/data/merged/train",
76
- "/data/alpaca",
77
- "/data/advisorai-data"
78
  ]
79
 
80
  recent_files = 0
@@ -113,10 +125,10 @@ def get_recent_files():
113
  """Get list of recent files in the data directories"""
114
  try:
115
  base_paths = [
116
- "/data/merged/features",
117
- "/data/merged/train",
118
- "/data/alpaca",
119
- "/data/advisorai-data/features"
120
  ]
121
 
122
  recent_files = []
@@ -129,7 +141,7 @@ def get_recent_files():
129
  stat = os.stat(file_path)
130
  recent_files.append({
131
  "File": file,
132
- "Path": file_path.replace("/data/", ""),
133
  "Size": f"{stat.st_size / (1024**2):.2f} MB",
134
  "Modified": datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M")
135
  })
@@ -215,14 +227,19 @@ with gr.Blocks(title="AdvisorAI Data Pipeline Monitor", theme=gr.themes.Soft())
215
  def refresh_dashboard():
216
  health = get_health_status()
217
  pipeline = get_pipeline_status()
218
- return json.dumps(health, indent=2), json.dumps(pipeline, indent=2)
 
219
 
220
  def refresh_files():
221
  files = get_recent_files()
222
- if files and isinstance(files[0], dict) and "Error" not in files[0]:
223
- return [[f["File"], f["Path"], f["Size"], f["Modified"]] for f in files]
224
- else:
225
- return [["Error", str(files), "", ""]]
 
 
 
 
226
 
227
  def refresh_logs():
228
  return get_logs()
@@ -256,9 +273,10 @@ with gr.Blocks(title="AdvisorAI Data Pipeline Monitor", theme=gr.themes.Soft())
256
 
257
  if __name__ == "__main__":
258
  logger.info("Starting Gradio app...")
 
259
  app.launch(
260
  server_name="0.0.0.0",
261
- server_port=7860,
262
  share=False,
263
  show_error=True,
264
  quiet=False
 
9
  import psutil
10
  from pathlib import Path
11
 
12
+ # Internal config for paths and markers
13
+ try:
14
+ from src.config import DATA_DIR, LOG_DIR, LAST_RUN_PATH
15
+ except Exception:
16
+ # Fallbacks if import path differs in Spaces
17
+ try:
18
+ from config import DATA_DIR, LOG_DIR, LAST_RUN_PATH # type: ignore
19
+ except Exception:
20
+ DATA_DIR = os.environ.get('DATA_DIR', '/data')
21
+ LOG_DIR = os.environ.get('LOG_DIR', os.path.join(DATA_DIR, 'logs'))
22
+ LAST_RUN_PATH = os.environ.get('LAST_RUN_PATH', '/tmp/last_run.txt')
23
+
24
  # Add src to Python path for imports
25
  sys.path.insert(0, '/app/src')
26
  sys.path.insert(0, '/app')
 
49
  scheduler_running = False
50
  last_run_time = "Unknown"
51
  try:
52
+ last_run_file = LAST_RUN_PATH
53
  if os.path.exists(last_run_file):
54
  with open(last_run_file, 'r') as f:
55
  last_run_str = f.read().strip()
 
83
  """Get data pipeline status"""
84
  try:
85
  data_dirs = [
86
+ os.path.join(DATA_DIR, 'merged', 'features'),
87
+ os.path.join(DATA_DIR, 'merged', 'train'),
88
+ os.path.join(DATA_DIR, 'alpaca'),
89
+ os.path.join(DATA_DIR, 'advisorai-data'),
90
  ]
91
 
92
  recent_files = 0
 
125
  """Get list of recent files in the data directories"""
126
  try:
127
  base_paths = [
128
+ os.path.join(DATA_DIR, 'merged', 'features'),
129
+ os.path.join(DATA_DIR, 'merged', 'train'),
130
+ os.path.join(DATA_DIR, 'alpaca'),
131
+ os.path.join(DATA_DIR, 'advisorai-data', 'features'),
132
  ]
133
 
134
  recent_files = []
 
141
  stat = os.stat(file_path)
142
  recent_files.append({
143
  "File": file,
144
+ "Path": file_path.replace(DATA_DIR.rstrip('/') + '/', ""),
145
  "Size": f"{stat.st_size / (1024**2):.2f} MB",
146
  "Modified": datetime.fromtimestamp(stat.st_mtime).strftime("%Y-%m-%d %H:%M")
147
  })
 
227
  def refresh_dashboard():
228
  health = get_health_status()
229
  pipeline = get_pipeline_status()
230
+ # JSON components accept dicts directly
231
+ return health, pipeline
232
 
233
  def refresh_files():
234
  files = get_recent_files()
235
+ if not files:
236
+ return []
237
+ if isinstance(files, list) and isinstance(files[0], dict) and "Error" not in files[0]:
238
+ rows = []
239
+ for f in files:
240
+ rows.append([f.get("File",""), f.get("Path",""), f.get("Size",""), f.get("Modified","")])
241
+ return rows
242
+ return [["Error", str(files), "", ""]]
243
 
244
  def refresh_logs():
245
  return get_logs()
 
273
 
274
  if __name__ == "__main__":
275
  logger.info("Starting Gradio app...")
276
+ port = int(os.environ.get("PORT", "7860"))
277
  app.launch(
278
  server_name="0.0.0.0",
279
+ server_port=port,
280
  share=False,
281
  show_error=True,
282
  quiet=False