vllm (pretrained=/root/autodl-tmp/Magistral-Small-2506,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.924 | ± | 0.0168 |
| | | strict-match | 5 | exact_match | ↑ | 0.920 | ± | 0.0172 |
vllm (pretrained=/root/autodl-tmp/Magistral-Small-2506,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.916 | ± | 0.0124 |
| | | strict-match | 5 | exact_match | ↑ | 0.912 | ± | 0.0127 |
| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| mmlu | 2 | none | | acc | ↑ | 0.7766 | ± | 0.0134 |
| - humanities | 2 | none | | acc | ↑ | 0.8154 | ± | 0.0259 |
| - other | 2 | none | | acc | ↑ | 0.8308 | ± | 0.0266 |
| - social sciences | 2 | none | | acc | ↑ | 0.8056 | ± | 0.0282 |
| - stem | 2 | none | | acc | ↑ | 0.6947 | ± | 0.0256 |
vllm (pretrained=/root/autodl-tmp/root90-256-4096-9.9999,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.908 | ± | 0.0183 |
| | | strict-match | 5 | exact_match | ↑ | 0.904 | ± | 0.0187 |
vllm (pretrained=/root/autodl-tmp/root90-128-4096-9.9999,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.924 | ± | 0.0168 |
| | | strict-match | 5 | exact_match | ↑ | 0.920 | ± | 0.0172 |
vllm (pretrained=/root/autodl-tmp/root90-128-4096-9.9999,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto
| Tasks | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.912 | ± | 0.0127 |
| | | strict-match | 5 | exact_match | ↑ | 0.906 | ± | 0.0131 |
| Groups | Version | Filter | n-shot | Metric | | Value | | Stderr |
|---|---:|---|---:|---|---|---:|---|---:|
| mmlu | 2 | none | | acc | ↑ | 0.7754 | ± | 0.0135 |
| - humanities | 2 | none | | acc | ↑ | 0.7949 | ± | 0.0268 |
| - other | 2 | none | | acc | ↑ | 0.8103 | ± | 0.0278 |
| - social sciences | 2 | none | | acc | ↑ | 0.8278 | ± | 0.0273 |
| - stem | 2 | none | | acc | ↑ | 0.7053 | ± | 0.0254 |