vllm (pretrained=/root/autodl-tmp/Broken-Tutu-24B-Unslop-v2.0,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.908 | ± | 0.0183 |
|       |   | strict-match | 5 | exact_match | ↑ | 0.904 | ± | 0.0187 |

vllm (pretrained=/root/autodl-tmp/Broken-Tutu-24B-Unslop-v2.0,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.902 | ± | 0.0133 |
|       |   | strict-match | 5 | exact_match | ↑ | 0.894 | ± | 0.0138 |


| Groups | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none |   | acc | ↑ | 0.7965 | ± | 0.0130 |
| - humanities | 2 | none |   | acc | ↑ | 0.8256 | ± | 0.0256 |
| - other | 2 | none |   | acc | ↑ | 0.8051 | ± | 0.0266 |
| - social sciences | 2 | none |   | acc | ↑ | 0.8556 | ± | 0.0255 |
| - stem | 2 | none |   | acc | ↑ | 0.7333 | ± | 0.0249 |

vllm (pretrained=/root/autodl-tmp/root86-512-4096,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.868 | ± | 0.0215 |
|       |   | strict-match | 5 | exact_match | ↑ | 0.860 | ± | 0.0220 |

vllm (pretrained=/root/autodl-tmp/root90-64-4096-9.9999,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.894 | ± | 0.0138 |
|       |   | strict-match | 5 | exact_match | ↑ | 0.888 | ± | 0.0141 |


| Groups | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none |   | acc | ↑ | 0.7883 | ± | 0.0130 |
| - humanities | 2 | none |   | acc | ↑ | 0.8051 | ± | 0.0270 |
| - other | 2 | none |   | acc | ↑ | 0.8051 | ± | 0.0267 |
| - social sciences | 2 | none |   | acc | ↑ | 0.8611 | ± | 0.0250 |
| - stem | 2 | none |   | acc | ↑ | 0.7193 | ± | 0.0245 |

vllm (pretrained=/root/autodl-tmp/root90-64-4096-9.9999,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.892 | ± | 0.0197 |
|       |   | strict-match | 5 | exact_match | ↑ | 0.892 | ± | 0.0197 |

vllm (pretrained=/root/autodl-tmp/root90-128-4096-9.9999,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.896 | ± | 0.0193 |
|       |   | strict-match | 5 | exact_match | ↑ | 0.896 | ± | 0.0193 |

vllm (pretrained=/root/autodl-tmp/root90-128-4096-9.9999,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.888 | ± | 0.0141 |
|       |   | strict-match | 5 | exact_match | ↑ | 0.886 | ± | 0.0142 |


| Groups | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none |   | acc | ↑ | 0.7988 | ± | 0.0129 |
| - humanities | 2 | none |   | acc | ↑ | 0.8051 | ± | 0.0265 |
| - other | 2 | none |   | acc | ↑ | 0.8051 | ± | 0.0266 |
| - social sciences | 2 | none |   | acc | ↑ | 0.8778 | ± | 0.0239 |
| - stem | 2 | none |   | acc | ↑ | 0.7404 | ± | 0.0248 |

vllm (pretrained=/root/autodl-tmp/root90-256-4096-9.9999,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 250.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.900 | ± | 0.019 |
|       |   | strict-match | 5 | exact_match | ↑ | 0.888 | ± | 0.020 |

vllm (pretrained=/root/autodl-tmp/root90-256-4096-9.9999,add_bos_token=true,max_model_len=3096,dtype=bfloat16,trust_remote_code=true), gen_kwargs: (None), limit: 500.0, num_fewshot: 5, batch_size: auto

| Tasks | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|-------|--------:|--------|-------:|--------|---|------:|---|-------:|
| gsm8k | 3 | flexible-extract | 5 | exact_match | ↑ | 0.896 | ± | 0.0137 |
|       |   | strict-match | 5 | exact_match | ↑ | 0.890 | ± | 0.0140 |


| Groups | Version | Filter | n-shot | Metric |   | Value |   | Stderr |
|--------|--------:|--------|-------:|--------|---|------:|---|-------:|
| mmlu | 2 | none |   | acc | ↑ | 0.7977 | ± | 0.0131 |
| - humanities | 2 | none |   | acc | ↑ | 0.8205 | ± | 0.0264 |
| - other | 2 | none |   | acc | ↑ | 0.8154 | ± | 0.0264 |
| - social sciences | 2 | none |   | acc | ↑ | 0.8667 | ± | 0.0247 |
| - stem | 2 | none |   | acc | ↑ | 0.7263 | ± | 0.0253 |
