# Extrapolis-v1-4B-SFT / config.yaml
# Source: qingy2024 (Hugging Face), commit d4df3b6 (verified)
# llama-swap configuration: each model entry declares the vLLM launch command
# and the upstream proxy address llama-swap forwards requests to.
logLevel: info
# NOTE(review): healthCheckTimeout looks like seconds (long startup for large
# models) — confirm against llama-swap docs before lowering.
healthCheckTimeout: 1000

models:
  "cognitivecomputations/DeepSeek-R1-0528-AWQ":
    cmd: >
      vllm serve cognitivecomputations/DeepSeek-R1-0528-AWQ
      --tensor-parallel-size 8
      --trust-remote-code
      --max-model-len 65536
      --gpu-memory-utilization 0.95
      --enable-chunked-prefill
      --enable-prefix-caching
      --host 0.0.0.0
      --port 8181
      --reasoning-parser deepseek_r1
      --tool-call-parser deepseek_v3
      --chat-template tool_r1_chat_template.jinja
    proxy: http://0.0.0.0:8181

  "QuantTrio/Qwen3-Coder-480B-A35B-Instruct-AWQ":
    cmd: >
      vllm serve QuantTrio/Qwen3-Coder-480B-A35B-Instruct-AWQ
      --enable-expert-parallel
      --max-model-len 128000
      --gpu-memory-utilization 0.95
      --tensor-parallel-size 8
      --trust-remote-code
      --host 0.0.0.0
      --port 8181
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --enable-prefix-caching
    proxy: http://0.0.0.0:8181

  "Qwen/Qwen3-32B":
    cmd: >
      vllm serve Qwen/Qwen3-32B
      --tensor-parallel-size 8
      --trust-remote-code
      --max-model-len 32768
      --gpu-memory-utilization 0.95
      --enable-prefix-caching
      --host 0.0.0.0
      --port 8181
      --reasoning-parser deepseek_r1
      --enable-auto-tool-choice
      --tool-call-parser hermes
    proxy: http://0.0.0.0:8181

  # The two 235B FP8 variants are pinned to disjoint GPU halves so they can
  # run concurrently (see the "Qwen-235B-FP8-Pair" group below).
  "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8":
    cmd: >-
      bash -c 'export CUDA_VISIBLE_DEVICES=0,1,2,3; vllm serve Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 --tensor-parallel-size 4 --trust-remote-code --max-model-len 64000 --gpu-memory-utilization 0.95 --host 0.0.0.0 --port 8181 --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1'
    proxy: http://0.0.0.0:8181

  "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8":
    cmd: >-
      bash -c 'export CUDA_VISIBLE_DEVICES=4,5,6,7; vllm serve Qwen/Qwen3-235B-A22B-Instruct-2507-FP8 --tensor-parallel-size 4 --trust-remote-code --max-model-len 64000 --gpu-memory-utilization 0.95 --host 0.0.0.0 --port 8182 --enable-auto-tool-choice --tool-call-parser hermes'
    proxy: http://0.0.0.0:8182

  "Kwaipilot/KAT-V1-40B":
    cmd: >
      vllm serve Kwaipilot/KAT-V1-40B
      --max-model-len 32768
      --tensor-parallel-size 4
      --host 0.0.0.0
      --port 8181
      --chat-template kat.jinja
      --gpu-memory-utilization 0.95
    proxy: http://0.0.0.0:8181

  "unsloth/Devstral-Small-2507":
    cmd: >
      vllm serve unsloth/Devstral-Small-2507
      --max-model-len 65536
      --tensor-parallel-size 4
      --host 0.0.0.0
      --port 8181
      --gpu-memory-utilization 0.95
      --tool-call-parser mistral
      --enable-auto-tool-choice
    proxy: http://0.0.0.0:8181

  # Same weights as "zai-org/GLM-4.5-FP8" below, but served under an
  # "-Instruct" alias with a no-think chat template and no reasoning parser.
  "zai-org/GLM-4.5-FP8-Instruct":
    cmd: >
      vllm serve zai-org/GLM-4.5-FP8
      --tensor-parallel-size 8
      --gpu-memory-utilization 0.95
      --tool-call-parser glm45
      --enable-auto-tool-choice
      --chat-template glm-4.5-nothink.jinja
      --max-model-len 128000
      --served-model-name "zai-org/GLM-4.5-FP8-Instruct"
      --host 0.0.0.0
      --port 8181
    proxy: http://0.0.0.0:8181

  "zai-org/GLM-4.5-FP8":
    cmd: >
      vllm serve zai-org/GLM-4.5-FP8
      --tensor-parallel-size 8
      --gpu-memory-utilization 0.95
      --reasoning-parser glm45
      --max-model-len 128000
      --host 0.0.0.0
      --port 8181
    proxy: http://0.0.0.0:8181

  "rednote-hilab/dots.llm1.inst":
    cmd: >
      vllm serve rednote-hilab/dots.llm1.inst
      --tensor-parallel-size 8
      --trust-remote-code
      --max-model-len 32768
      --gpu-memory-utilization 0.95
      --host 0.0.0.0
      --port 8181
    proxy: http://0.0.0.0:8181

groups:
  # swap/exclusive both false: members may run at the same time and do not
  # evict other models — consistent with their disjoint CUDA_VISIBLE_DEVICES.
  "Qwen-235B-FP8-Pair":
    swap: false
    exclusive: false
    members:
      - "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8"
      - "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"