Spaces:
Running
on
L40S
Running
on
L40S
Update start.sh
Browse files
start.sh
CHANGED
|
@@ -1,36 +1,68 @@
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
-
echo "Starting
|
| 4 |
-
|
|
|
|
|
|
|
| 5 |
python3 -m vllm.entrypoints.openai.api_server \
|
| 6 |
--model numind/NuMarkdown-8B-Thinking \
|
| 7 |
--port 8000 \
|
| 8 |
--host 0.0.0.0 \
|
| 9 |
-
--max-model-len
|
| 10 |
-
--gpu-memory-utilization 0.
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
VLLM_PID=$!
|
| 13 |
echo "vLLM started with PID: $VLLM_PID"
|
| 14 |
|
| 15 |
-
#
|
| 16 |
-
echo "Waiting for vLLM server to start..."
|
| 17 |
-
for i in {1..
|
| 18 |
-
if curl -s http://localhost:8000/
|
| 19 |
-
echo "vLLM
|
|
|
|
|
|
|
|
|
|
| 20 |
break
|
| 21 |
fi
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
sleep 2
|
| 24 |
done
|
| 25 |
|
| 26 |
-
#
|
| 27 |
if ! curl -s http://localhost:8000/v1/models > /dev/null; then
|
| 28 |
-
echo "
|
| 29 |
-
echo "vLLM logs:"
|
| 30 |
-
|
| 31 |
exit 1
|
| 32 |
fi
|
| 33 |
|
| 34 |
-
echo "
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/bin/bash
|
| 2 |
|
| 3 |
+
echo "=== Starting NuMarkdown-8B-Thinking Space ==="
|
| 4 |
+
echo "Starting vLLM server with optimized settings..."
|
| 5 |
+
|
| 6 |
+
# Start vLLM with HF Spaces optimizations
|
| 7 |
python3 -m vllm.entrypoints.openai.api_server \
|
| 8 |
--model numind/NuMarkdown-8B-Thinking \
|
| 9 |
--port 8000 \
|
| 10 |
--host 0.0.0.0 \
|
| 11 |
+
--max-model-len 8048 \
|
| 12 |
+
--gpu-memory-utilization 0.9 \
|
| 13 |
+
--disable-log-requests \
|
| 14 |
+
--tensor-parallel-size 1 \
|
| 15 |
+
--trust-remote-code > $HOME/app/vllm.log 2>&1 &
|
| 16 |
|
| 17 |
VLLM_PID=$!
|
| 18 |
echo "vLLM started with PID: $VLLM_PID"
|
| 19 |
|
| 20 |
+
# More aggressive waiting with health checks
|
| 21 |
+
echo "Waiting for vLLM server to start (this may take 5-10 minutes)..."
|
| 22 |
+
for i in {1..180}; do # Wait up to 6 minutes
|
| 23 |
+
if curl -s --connect-timeout 5 http://localhost:8000/health > /dev/null 2>&1; then
|
| 24 |
+
echo "✓ vLLM health check passed!"
|
| 25 |
+
break
|
| 26 |
+
elif curl -s --connect-timeout 5 http://localhost:8000/v1/models > /dev/null 2>&1; then
|
| 27 |
+
echo "✓ vLLM server is ready!"
|
| 28 |
break
|
| 29 |
fi
|
| 30 |
+
|
| 31 |
+
# Show progress every 10 iterations (roughly every 20 seconds, given the 2-second sleep per iteration)
|
| 32 |
+
if [ $((i % 10)) -eq 0 ]; then
|
| 33 |
+
echo "Still waiting... ($i/180) - checking vLLM process"
|
| 34 |
+
if ! ps -p $VLLM_PID > /dev/null; then
|
| 35 |
+
echo "❌ vLLM process died! Checking logs:"
|
| 36 |
+
tail -20 $HOME/app/vllm.log
|
| 37 |
+
exit 1
|
| 38 |
+
fi
|
| 39 |
+
fi
|
| 40 |
sleep 2
|
| 41 |
done
|
| 42 |
|
| 43 |
+
# Final check
|
| 44 |
if ! curl -s http://localhost:8000/v1/models > /dev/null; then
|
| 45 |
+
echo "❌ vLLM server failed to start after 6 minutes!"
|
| 46 |
+
echo "Last 50 lines of vLLM logs:"
|
| 47 |
+
tail -50 $HOME/app/vllm.log
|
| 48 |
exit 1
|
| 49 |
fi
|
| 50 |
|
| 51 |
+
echo "✅ vLLM server is ready!"
|
| 52 |
+
echo "=== Starting Gradio App ==="
|
| 53 |
+
echo "Port 7860 status before launching Gradio:"
|
| 54 |
+
netstat -tuln | grep :7860 || echo "Port 7860 is free"
|
| 55 |
+
|
| 56 |
+
echo "Environment check:"
|
| 57 |
+
echo "PORT=${PORT:-7860}"
|
| 58 |
+
echo "PWD=$(pwd)"
|
| 59 |
+
echo "USER=$(whoami)"
|
| 60 |
+
|
| 61 |
+
# Launch Gradio with explicit error handling
|
| 62 |
+
echo "Launching Gradio..."
|
| 63 |
+
python3 $HOME/app/app.py || {
|
| 64 |
+
echo "❌ Gradio failed to start!"
|
| 65 |
+
echo "Checking if port is in use:"
|
| 66 |
+
netstat -tuln | grep :7860
|
| 67 |
+
exit 1
|
| 68 |
+
}
|