vllm (pretrained=/root/autodl-tmp/Qwen3-32B-abliterated,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true,tensor_parallel_size=4,gpu_memory_utilization=0.8), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.900 | ± | 0.0190 |
| | | strict-match | 5 | exact_match | ↑ | 0.896 | ± | 0.0193 |

vllm (pretrained=/root/autodl-tmp/Qwen3-32B-abliterated,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true,tensor_parallel_size=4,gpu_memory_utilization=0.8), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.852 | ± | 0.0159 |
| | | strict-match | 5 | exact_match | ↑ | 0.840 | ± | 0.0164 |

| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none | | acc | ↑ | 0.7988 | ± | 0.0131 |
| - humanities | 2 | none | | acc | ↑ | 0.7897 | ± | 0.0269 |
| - other | 2 | none | | acc | ↑ | 0.7590 | ± | 0.0298 |
| - social sciences | 2 | none | | acc | ↑ | 0.8722 | ± | 0.0252 |
| - stem | 2 | none | | acc | ↑ | 0.7860 | ± | 0.0230 |

vllm (pretrained=/root/autodl-tmp/Qwen3-32B-abliterated-awq,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true,tensor_parallel_size=2), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: 1

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.892 | ± | 0.0197 |
| | | strict-match | 5 | exact_match | ↑ | 0.888 | ± | 0.0200 |

vllm (pretrained=/root/autodl-tmp/Qwen3-32B-abliterated-awq,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.864 | ± | 0.0153 |
| | | strict-match | 5 | exact_match | ↑ | 0.862 | ± | 0.0154 |

| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none | | acc | ↑ | 0.7871 | ± | 0.0131 |
| - humanities | 2 | none | | acc | ↑ | 0.8000 | ± | 0.0266 |
| - other | 2 | none | | acc | ↑ | 0.7692 | ± | 0.0280 |
| - social sciences | 2 | none | | acc | ↑ | 0.8611 | ± | 0.0260 |
| - stem | 2 | none | | acc | ↑ | 0.7439 | ± | 0.0240 |