Update README.md
README.md
@@ -20,36 +20,36 @@ Note: These results are with corrected parsing for BBH from Eleuther's [lm-evaluation-harness]
 
 Smaug-Qwen2-72B-Instruct:
 
-
-
-|bbh |N/A
-| - bbh_cot_fewshot_boolean_expressions
-| - bbh_cot_fewshot_causal_judgement
-| - bbh_cot_fewshot_date_understanding
-| - bbh_cot_fewshot_disambiguation_qa
-| - bbh_cot_fewshot_dyck_languages
-| - bbh_cot_fewshot_formal_fallacies
-| - bbh_cot_fewshot_geometric_shapes
-| - bbh_cot_fewshot_hyperbaton
-| - bbh_cot_fewshot_logical_deduction_five_objects
-| - bbh_cot_fewshot_logical_deduction_seven_objects
-| - bbh_cot_fewshot_logical_deduction_three_objects
-| - bbh_cot_fewshot_movie_recommendation
-| - bbh_cot_fewshot_multistep_arithmetic_two
-| - bbh_cot_fewshot_navigate
-| - bbh_cot_fewshot_object_counting
-| - bbh_cot_fewshot_penguins_in_a_table
-| - bbh_cot_fewshot_reasoning_about_colored_objects
-| - bbh_cot_fewshot_ruin_names
-| - bbh_cot_fewshot_salient_translation_error_detection
-| - bbh_cot_fewshot_snarks
-| - bbh_cot_fewshot_sports_understanding
-| - bbh_cot_fewshot_temporal_sequences
-| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects
-| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects|
-| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects|
-| - bbh_cot_fewshot_web_of_lies
-| - bbh_cot_fewshot_word_sorting
+| Tasks | Version | Filter | n-shot | Metric | Value | Stderr |
+|-----------------------------------------------------------|---------|------------|--------|-------------|--------|--------|
+| bbh | N/A | get-answer | 3 | exact_match | 0.8241 | 0.0042 |
+| - bbh_cot_fewshot_boolean_expressions | 2 | get-answer | 3 | exact_match | 0.9640 | 0.0118 |
+| - bbh_cot_fewshot_causal_judgement | 2 | get-answer | 3 | exact_match | 0.6578 | 0.0348 |
+| - bbh_cot_fewshot_date_understanding | 2 | get-answer | 3 | exact_match | 0.8360 | 0.0235 |
+| - bbh_cot_fewshot_disambiguation_qa | 2 | get-answer | 3 | exact_match | 0.8280 | 0.0239 |
+| - bbh_cot_fewshot_dyck_languages | 2 | get-answer | 3 | exact_match | 0.3360 | 0.0299 |
+| - bbh_cot_fewshot_formal_fallacies | 2 | get-answer | 3 | exact_match | 0.7120 | 0.0287 |
+| - bbh_cot_fewshot_geometric_shapes | 2 | get-answer | 3 | exact_match | 0.5320 | 0.0316 |
+| - bbh_cot_fewshot_hyperbaton | 2 | get-answer | 3 | exact_match | 0.9880 | 0.0069 |
+| - bbh_cot_fewshot_logical_deduction_five_objects | 2 | get-answer | 3 | exact_match | 0.7680 | 0.0268 |
+| - bbh_cot_fewshot_logical_deduction_seven_objects | 2 | get-answer | 3 | exact_match | 0.5360 | 0.0316 |
+| - bbh_cot_fewshot_logical_deduction_three_objects | 2 | get-answer | 3 | exact_match | 0.9720 | 0.0105 |
+| - bbh_cot_fewshot_movie_recommendation | 2 | get-answer | 3 | exact_match | 0.8000 | 0.0253 |
+| - bbh_cot_fewshot_multistep_arithmetic_two | 2 | get-answer | 3 | exact_match | 0.9720 | 0.0105 |
+| - bbh_cot_fewshot_navigate | 2 | get-answer | 3 | exact_match | 0.9640 | 0.0118 |
+| - bbh_cot_fewshot_object_counting | 2 | get-answer | 3 | exact_match | 0.9200 | 0.0172 |
+| - bbh_cot_fewshot_penguins_in_a_table | 2 | get-answer | 3 | exact_match | 0.8493 | 0.0297 |
+| - bbh_cot_fewshot_reasoning_about_colored_objects | 2 | get-answer | 3 | exact_match | 0.7560 | 0.0272 |
+| - bbh_cot_fewshot_ruin_names | 2 | get-answer | 3 | exact_match | 0.8520 | 0.0225 |
+| - bbh_cot_fewshot_salient_translation_error_detection | 2 | get-answer | 3 | exact_match | 0.5920 | 0.0311 |
+| - bbh_cot_fewshot_snarks | 2 | get-answer | 3 | exact_match | 0.9101 | 0.0215 |
+| - bbh_cot_fewshot_sports_understanding | 2 | get-answer | 3 | exact_match | 0.9440 | 0.0146 |
+| - bbh_cot_fewshot_temporal_sequences | 2 | get-answer | 3 | exact_match | 1.0000 | 0.0000 |
+| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 2 | get-answer | 3 | exact_match | 0.9800 | 0.0089 |
+| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects | 2 | get-answer | 3 | exact_match | 0.9560 | 0.0130 |
+| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects | 2 | get-answer | 3 | exact_match | 0.9640 | 0.0118 |
+| - bbh_cot_fewshot_web_of_lies | 2 | get-answer | 3 | exact_match | 1.0000 | 0.0000 |
+| - bbh_cot_fewshot_word_sorting | 2 | get-answer | 3 | exact_match | 0.6560 | 0.0301 |
 
 Qwen2-72B-Instruct:
 
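For context, BBH tables in this form are produced with Eleuther's lm-evaluation-harness. The sketch below is not part of this commit; the task group name (`bbh_cot_fewshot`), the Hugging Face repo id, and the exact `simple_evaluate` arguments are assumptions that vary by harness version.

```python
# Hypothetical sketch only: task/group names, the repo id, and argument values
# are assumptions and differ across lm-evaluation-harness versions.
import json

import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",  # Hugging Face transformers backend
    model_args="pretrained=abacusai/Smaug-Qwen2-72B-Instruct,dtype=bfloat16",
    tasks=["bbh_cot_fewshot"],  # 3-shot CoT BBH, reported above under the "bbh" group
    batch_size="auto",
)

# Per-task exact_match scores (the "Value" and "Stderr" columns above).
print(json.dumps(results["results"], indent=2, default=str))
```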
@@ -96,9 +96,9 @@ Score vs selected others (sourced from: (https://lmsys.org/blog/2024-04-19-arena
 | Claude-3-Opus-20240229 | 60.4 | (-3.3, 2.4) | 541 |
 | Smaug-Llama-3-70B-Instruct | 56.7 | (-2.2, 2.6) | 661 |
 | GPT-4-0314 | 50.0 | (-0.0, 0.0) | 423 |
-| Smaug-Qwen2-72B-Instruct |
+| Smaug-Qwen2-72B-Instruct | 48.0 | (-1.8, 2.1) | 628 |
 | Claude-3-Sonnet-20240229 | 46.8 | (-2.1, 2.2) | 552 |
-| Qwen2-72B-Instruct |
+| Qwen2-72B-Instruct | 43.5 | (-2.6, 2.7) | 531 |
 | Llama-3-70B-Instruct | 41.1 | (-2.5, 2.4) | 583 |
 | GPT-4-0613 | 37.9 | (-2.2, 2.0) | 354 |
 | Mistral-Large-2402 | 37.7 | (-1.9, 2.6) | 400 |
@@ -110,21 +110,26 @@ Score vs selected others (sourced from: (https://lmsys.org/blog/2024-04-19-arena
 
 ## MT-Bench
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+First turn
+
+| Model | Turn | Score |
+|--------------------------|------|---------|
+| Qwen2-72B-Instruct | 1 | 9.18125 |
+| Smaug-Qwen2-72B-Instruct | 1 | 9.05625 |
+
+Second turn
+
+| Model | Turn | Score |
+|--------------------------|------|---------|
+| Qwen2-72B-Instruct | 2 | 8.74684 |
+| Smaug-Qwen2-72B-Instruct | 2 | 8.67500 |
+
+Average
+
+| Model | Score |
+|--------------------------|---------|
+| Qwen2-72B-Instruct | 8.96541 |
+| Smaug-Qwen2-72B-Instruct | 8.86563 |
 
 
 # Model Card for Model ID
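A note on reading the MT-Bench tables above: the reported "Average" tracks the mean of the two per-turn scores closely but does not have to match it exactly, presumably because the average is taken over all individual judged turns. A quick sanity check using only the values from the tables:

```python
# Uses only the numbers from the tables above; the small gap for
# Qwen2-72B-Instruct is presumably due to averaging over individual judged
# turns rather than over the two per-turn means.
first_turn = {"Qwen2-72B-Instruct": 9.18125, "Smaug-Qwen2-72B-Instruct": 9.05625}
second_turn = {"Qwen2-72B-Instruct": 8.74684, "Smaug-Qwen2-72B-Instruct": 8.67500}
reported_avg = {"Qwen2-72B-Instruct": 8.96541, "Smaug-Qwen2-72B-Instruct": 8.86563}

for model in reported_avg:
    mean_of_turns = (first_turn[model] + second_turn[model]) / 2
    print(f"{model}: mean of turn scores ~ {mean_of_turns:.5f}, reported {reported_avg[model]}")
```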