|
logLevel: info

# Seconds to wait for a freshly started model to pass its health check.
# Large multi-GPU models can take many minutes to load, hence the high value.
healthCheckTimeout: 1000
|
|
|
models:
  "cognitivecomputations/DeepSeek-R1-0528-AWQ":
    # NOTE: vLLM only activates --tool-call-parser when
    # --enable-auto-tool-choice is also given, so it is added here
    # (sibling entries in this file already pair the two flags).
    cmd: >
      vllm serve cognitivecomputations/DeepSeek-R1-0528-AWQ
      --tensor-parallel-size 8
      --trust-remote-code
      --max-model-len 65536
      --gpu-memory-utilization 0.95
      --enable-chunked-prefill
      --enable-prefix-caching
      --host 0.0.0.0
      --port 8181
      --reasoning-parser deepseek_r1
      --enable-auto-tool-choice
      --tool-call-parser deepseek_v3
      --chat-template tool_r1_chat_template.jinja
    proxy: http://0.0.0.0:8181
|
|
|
"QuantTrio/Qwen3-Coder-480B-A35B-Instruct-AWQ": |
|
cmd: > |
|
vllm serve QuantTrio/Qwen3-Coder-480B-A35B-Instruct-AWQ |
|
--enable-expert-parallel |
|
--max-model-len 128000 |
|
--gpu_memory_utilization=0.95 |
|
--tensor-parallel-size 8 |
|
--trust-remote-code |
|
--host 0.0.0.0 |
|
--port 8181 |
|
--enable-auto-tool |
|
--tool-call-parser hermes |
|
--enable-prefix-caching |
|
|
|
proxy: http://0.0.0.0:8181 |
|
|
|
|
|
"Qwen/Qwen3-32B": |
|
cmd: > |
|
vllm serve Qwen/Qwen3-32B |
|
--tensor-parallel-size 8 |
|
--trust-remote-code |
|
--max-model-len 32768 |
|
--gpu-memory-utilization 0.95 |
|
--enable-prefix-caching |
|
--host 0.0.0.0 |
|
--port 8181 |
|
--reasoning-parser deepseek_r1 |
|
--enable-auto-tool |
|
--tool-call-parser hermes |
|
proxy: http://0.0.0.0:8181 |
|
|
|
"Qwen/Qwen3-235B-A22B-Thinking-2507-FP8": |
|
cmd: >- |
|
bash -c 'export CUDA_VISIBLE_DEVICES=0,1,2,3; vllm serve Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 --tensor-parallel-size 4 --trust-remote-code --max-model-len 64000 --gpu-memory-utilization 0.95 --host 0.0.0.0 --port 8181 --enable-auto-tool --tool-call-parser hermes --reasoning-parser deepseek_r1' |
|
proxy: http://0.0.0.0:8181 |
|
|
|
"Qwen/Qwen3-235B-A22B-Instruct-2507-FP8": |
|
cmd: >- |
|
bash -c 'export CUDA_VISIBLE_DEVICES=4,5,6,7; vllm serve Qwen/Qwen3-235B-A22B-Instruct-2507-FP8 --tensor-parallel-size 4 --trust-remote-code --max-model-len 64000 --gpu-memory-utilization 0.95 --host 0.0.0.0 --port 8182 --enable-auto-tool --tool-call-parser hermes' |
|
proxy: http://0.0.0.0:8182 |
|
|
|
|
|
"Kwaipilot/KAT-V1-40B": |
|
cmd: > |
|
vllm serve Kwaipilot/KAT-V1-40B |
|
--max-model-len 32768 |
|
--tensor-parallel-size 4 |
|
--host 0.0.0.0 |
|
--port 8181 |
|
--chat-template kat.jinja |
|
--gpu-memory-utilization 0.95 |
|
proxy: http://0.0.0.0:8181 |
|
|
|
"unsloth/Devstral-Small-2507": |
|
cmd: > |
|
vllm serve unsloth/Devstral-Small-2507 |
|
--max-model-len 65536 |
|
--tensor-parallel-size 4 |
|
--host 0.0.0.0 |
|
--port 8181 |
|
--gpu-memory-utilization 0.95 |
|
--tool-call-parser mistral |
|
--enable-auto-tool-choice |
|
proxy: http://0.0.0.0:8181 |
|
|
|
"zai-org/GLM-4.5-FP8-Instruct": |
|
cmd: > |
|
vllm serve zai-org/GLM-4.5-FP8 |
|
--tensor-parallel-size 8 |
|
--gpu_memory_utilization 0.95 |
|
--tool-call-parser glm45 |
|
--enable-auto-tool-choice |
|
--chat-template glm-4.5-nothink.jinja |
|
--max-model-len 128000 |
|
--served-model-name "zai-org/GLM-4.5-FP8-Instruct" |
|
--host 0.0.0.0 |
|
--port 8181 |
|
|
|
proxy: http://0.0.0.0:8181 |
|
|
|
"zai-org/GLM-4.5-FP8": |
|
cmd: > |
|
vllm serve zai-org/GLM-4.5-FP8 |
|
--tensor-parallel-size 8 |
|
--gpu_memory_utilization 0.95 |
|
--reasoning-parser glm45 |
|
--max-model-len 128000 |
|
--host 0.0.0.0 |
|
--port 8181 |
|
|
|
proxy: http://0.0.0.0:8181 |
|
|
|
"rednote-hilab/dots.llm1.inst": |
|
cmd: > |
|
vllm serve rednote-hilab/dots.llm1.inst |
|
--tensor-parallel-size 8 |
|
--trust-remote-code |
|
--max-model-len 32768 |
|
--gpu-memory-utilization 0.95 |
|
--host 0.0.0.0 |
|
--port 8181 |
|
proxy: http://0.0.0.0:8181 |
|
|
|
groups:
  # Run both 235B FP8 variants side by side: swap/exclusive disabled so
  # starting one member does not unload the other (they use disjoint GPU
  # sets and distinct ports — see the member entries above).
  "Qwen-235B-FP8-Pair":
    swap: false
    exclusive: false
    members:
      - "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8"
      - "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"