# Extrapolis-v1-4B-SFT / config.yaml
# Source: qingy2024 (Hugging Face), commit d4df3b6 (verified)
# llama-swap configuration: each model entry declares the vLLM launch command
# and the upstream proxy address llama-swap forwards requests to.
logLevel: info
# NOTE(review): healthCheckTimeout looks like seconds (long startup for large
# models) — confirm against llama-swap docs before lowering.
healthCheckTimeout: 1000

models:
  "cognitivecomputations/DeepSeek-R1-0528-AWQ":
    cmd: >
      vllm serve cognitivecomputations/DeepSeek-R1-0528-AWQ
      --tensor-parallel-size 8
      --trust-remote-code
      --max-model-len 65536
      --gpu-memory-utilization 0.95
      --enable-chunked-prefill
      --enable-prefix-caching
      --host 0.0.0.0
      --port 8181
      --reasoning-parser deepseek_r1
      --tool-call-parser deepseek_v3
      --chat-template tool_r1_chat_template.jinja
    proxy: http://0.0.0.0:8181

  "QuantTrio/Qwen3-Coder-480B-A35B-Instruct-AWQ":
    cmd: >
      vllm serve QuantTrio/Qwen3-Coder-480B-A35B-Instruct-AWQ
      --enable-expert-parallel
      --max-model-len 128000
      --gpu-memory-utilization 0.95
      --tensor-parallel-size 8
      --trust-remote-code
      --host 0.0.0.0
      --port 8181
      --enable-auto-tool-choice
      --tool-call-parser hermes
      --enable-prefix-caching
    proxy: http://0.0.0.0:8181

  "Qwen/Qwen3-32B":
    cmd: >
      vllm serve Qwen/Qwen3-32B
      --tensor-parallel-size 8
      --trust-remote-code
      --max-model-len 32768
      --gpu-memory-utilization 0.95
      --enable-prefix-caching
      --host 0.0.0.0
      --port 8181
      --reasoning-parser deepseek_r1
      --enable-auto-tool-choice
      --tool-call-parser hermes
    proxy: http://0.0.0.0:8181

  # The two 235B FP8 variants are pinned to disjoint GPU halves so they can
  # run concurrently (see the "Qwen-235B-FP8-Pair" group below).
  "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8":
    cmd: >-
      bash -c 'export CUDA_VISIBLE_DEVICES=0,1,2,3; vllm serve Qwen/Qwen3-235B-A22B-Thinking-2507-FP8 --tensor-parallel-size 4 --trust-remote-code --max-model-len 64000 --gpu-memory-utilization 0.95 --host 0.0.0.0 --port 8181 --enable-auto-tool-choice --tool-call-parser hermes --reasoning-parser deepseek_r1'
    proxy: http://0.0.0.0:8181

  "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8":
    cmd: >-
      bash -c 'export CUDA_VISIBLE_DEVICES=4,5,6,7; vllm serve Qwen/Qwen3-235B-A22B-Instruct-2507-FP8 --tensor-parallel-size 4 --trust-remote-code --max-model-len 64000 --gpu-memory-utilization 0.95 --host 0.0.0.0 --port 8182 --enable-auto-tool-choice --tool-call-parser hermes'
    proxy: http://0.0.0.0:8182

  "Kwaipilot/KAT-V1-40B":
    cmd: >
      vllm serve Kwaipilot/KAT-V1-40B
      --max-model-len 32768
      --tensor-parallel-size 4
      --host 0.0.0.0
      --port 8181
      --chat-template kat.jinja
      --gpu-memory-utilization 0.95
    proxy: http://0.0.0.0:8181

  "unsloth/Devstral-Small-2507":
    cmd: >
      vllm serve unsloth/Devstral-Small-2507
      --max-model-len 65536
      --tensor-parallel-size 4
      --host 0.0.0.0
      --port 8181
      --gpu-memory-utilization 0.95
      --tool-call-parser mistral
      --enable-auto-tool-choice
    proxy: http://0.0.0.0:8181

  # Same weights as "zai-org/GLM-4.5-FP8" below, but served under an
  # "-Instruct" alias with a no-think chat template and no reasoning parser.
  "zai-org/GLM-4.5-FP8-Instruct":
    cmd: >
      vllm serve zai-org/GLM-4.5-FP8
      --tensor-parallel-size 8
      --gpu-memory-utilization 0.95
      --tool-call-parser glm45
      --enable-auto-tool-choice
      --chat-template glm-4.5-nothink.jinja
      --max-model-len 128000
      --served-model-name "zai-org/GLM-4.5-FP8-Instruct"
      --host 0.0.0.0
      --port 8181
    proxy: http://0.0.0.0:8181

  "zai-org/GLM-4.5-FP8":
    cmd: >
      vllm serve zai-org/GLM-4.5-FP8
      --tensor-parallel-size 8
      --gpu-memory-utilization 0.95
      --reasoning-parser glm45
      --max-model-len 128000
      --host 0.0.0.0
      --port 8181
    proxy: http://0.0.0.0:8181

  "rednote-hilab/dots.llm1.inst":
    cmd: >
      vllm serve rednote-hilab/dots.llm1.inst
      --tensor-parallel-size 8
      --trust-remote-code
      --max-model-len 32768
      --gpu-memory-utilization 0.95
      --host 0.0.0.0
      --port 8181
    proxy: http://0.0.0.0:8181

groups:
  # swap/exclusive both false: members may run at the same time and do not
  # evict other models — consistent with their disjoint CUDA_VISIBLE_DEVICES.
  "Qwen-235B-FP8-Pair":
    swap: false
    exclusive: false
    members:
      - "Qwen/Qwen3-235B-A22B-Thinking-2507-FP8"
      - "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"