ArkaAbacus committed on
Commit e40b6f4 · verified · 1 Parent(s): 1c49031

Update README.md

Files changed (1):
  1. README.md (+52 -47)
README.md CHANGED
@@ -20,36 +20,36 @@ Note: These results are with corrected parsing for BBH from Eleuther's [lm-evalu
 
 Smaug-Qwen2-72B-Instruct:
 
-| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
-|----------------------------------------------------------|-------|----------|-----:|-----------|---|-----:|---|-----:|
-|bbh |N/A |get-answer| 3|exact_match|↑ |0.8241|0.0042|
-| - bbh_cot_fewshot_boolean_expressions | 2|get-answer| 3|exact_match|↑ |0.9640|0.0118|
-| - bbh_cot_fewshot_causal_judgement | 2|get-answer| 3|exact_match|↑ |0.6578|0.0348|
-| - bbh_cot_fewshot_date_understanding | 2|get-answer| 3|exact_match|↑ |0.8360|0.0235|
-| - bbh_cot_fewshot_disambiguation_qa | 2|get-answer| 3|exact_match|↑ |0.8280|0.0239|
-| - bbh_cot_fewshot_dyck_languages | 2|get-answer| 3|exact_match|↑ |0.3360|0.0299|
-| - bbh_cot_fewshot_formal_fallacies | 2|get-answer| 3|exact_match|↑ |0.7120|0.0287|
-| - bbh_cot_fewshot_geometric_shapes | 2|get-answer| 3|exact_match|↑ |0.5320|0.0316|
-| - bbh_cot_fewshot_hyperbaton | 2|get-answer| 3|exact_match|↑ |0.9880|0.0069|
-| - bbh_cot_fewshot_logical_deduction_five_objects | 2|get-answer| 3|exact_match|↑ |0.7680|0.0268|
-| - bbh_cot_fewshot_logical_deduction_seven_objects | 2|get-answer| 3|exact_match|↑ |0.5360|0.0316|
-| - bbh_cot_fewshot_logical_deduction_three_objects | 2|get-answer| 3|exact_match|↑ |0.9720|0.0105|
-| - bbh_cot_fewshot_movie_recommendation | 2|get-answer| 3|exact_match|↑ |0.8000|0.0253|
-| - bbh_cot_fewshot_multistep_arithmetic_two | 2|get-answer| 3|exact_match|↑ |0.9720|0.0105|
-| - bbh_cot_fewshot_navigate | 2|get-answer| 3|exact_match|↑ |0.9640|0.0118|
-| - bbh_cot_fewshot_object_counting | 2|get-answer| 3|exact_match|↑ |0.9200|0.0172|
-| - bbh_cot_fewshot_penguins_in_a_table | 2|get-answer| 3|exact_match|↑ |0.8493|0.0297|
-| - bbh_cot_fewshot_reasoning_about_colored_objects | 2|get-answer| 3|exact_match|↑ |0.7560|0.0272|
-| - bbh_cot_fewshot_ruin_names | 2|get-answer| 3|exact_match|↑ |0.8520|0.0225|
-| - bbh_cot_fewshot_salient_translation_error_detection | 2|get-answer| 3|exact_match|↑ |0.5920|0.0311|
-| - bbh_cot_fewshot_snarks | 2|get-answer| 3|exact_match|↑ |0.9101|0.0215|
-| - bbh_cot_fewshot_sports_understanding | 2|get-answer| 3|exact_match|↑ |0.9440|0.0146|
-| - bbh_cot_fewshot_temporal_sequences | 2|get-answer| 3|exact_match|↑ |1.0000|0.0000|
-| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects | 2|get-answer| 3|exact_match|↑ |0.9800|0.0089|
-| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects| 2|get-answer| 3|exact_match|↑ |0.9560|0.0130|
-| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects| 2|get-answer| 3|exact_match|↑ |0.9640|0.0118|
-| - bbh_cot_fewshot_web_of_lies | 2|get-answer| 3|exact_match|↑ |1.0000|0.0000|
-| - bbh_cot_fewshot_word_sorting | 2|get-answer| 3|exact_match|↑ |0.6560|0.0301|
+| Tasks                                                      | Version | Filter     | n-shot | Metric      | Value  | Stderr |
+|------------------------------------------------------------|---------|------------|--------|-------------|--------|--------|
+| bbh                                                        | N/A     | get-answer | 3      | exact_match | 0.8241 | 0.0042 |
+| - bbh_cot_fewshot_boolean_expressions                      | 2       | get-answer | 3      | exact_match | 0.9640 | 0.0118 |
+| - bbh_cot_fewshot_causal_judgement                         | 2       | get-answer | 3      | exact_match | 0.6578 | 0.0348 |
+| - bbh_cot_fewshot_date_understanding                       | 2       | get-answer | 3      | exact_match | 0.8360 | 0.0235 |
+| - bbh_cot_fewshot_disambiguation_qa                        | 2       | get-answer | 3      | exact_match | 0.8280 | 0.0239 |
+| - bbh_cot_fewshot_dyck_languages                           | 2       | get-answer | 3      | exact_match | 0.3360 | 0.0299 |
+| - bbh_cot_fewshot_formal_fallacies                         | 2       | get-answer | 3      | exact_match | 0.7120 | 0.0287 |
+| - bbh_cot_fewshot_geometric_shapes                         | 2       | get-answer | 3      | exact_match | 0.5320 | 0.0316 |
+| - bbh_cot_fewshot_hyperbaton                               | 2       | get-answer | 3      | exact_match | 0.9880 | 0.0069 |
+| - bbh_cot_fewshot_logical_deduction_five_objects           | 2       | get-answer | 3      | exact_match | 0.7680 | 0.0268 |
+| - bbh_cot_fewshot_logical_deduction_seven_objects          | 2       | get-answer | 3      | exact_match | 0.5360 | 0.0316 |
+| - bbh_cot_fewshot_logical_deduction_three_objects          | 2       | get-answer | 3      | exact_match | 0.9720 | 0.0105 |
+| - bbh_cot_fewshot_movie_recommendation                     | 2       | get-answer | 3      | exact_match | 0.8000 | 0.0253 |
+| - bbh_cot_fewshot_multistep_arithmetic_two                 | 2       | get-answer | 3      | exact_match | 0.9720 | 0.0105 |
+| - bbh_cot_fewshot_navigate                                 | 2       | get-answer | 3      | exact_match | 0.9640 | 0.0118 |
+| - bbh_cot_fewshot_object_counting                          | 2       | get-answer | 3      | exact_match | 0.9200 | 0.0172 |
+| - bbh_cot_fewshot_penguins_in_a_table                      | 2       | get-answer | 3      | exact_match | 0.8493 | 0.0297 |
+| - bbh_cot_fewshot_reasoning_about_colored_objects          | 2       | get-answer | 3      | exact_match | 0.7560 | 0.0272 |
+| - bbh_cot_fewshot_ruin_names                               | 2       | get-answer | 3      | exact_match | 0.8520 | 0.0225 |
+| - bbh_cot_fewshot_salient_translation_error_detection      | 2       | get-answer | 3      | exact_match | 0.5920 | 0.0311 |
+| - bbh_cot_fewshot_snarks                                   | 2       | get-answer | 3      | exact_match | 0.9101 | 0.0215 |
+| - bbh_cot_fewshot_sports_understanding                     | 2       | get-answer | 3      | exact_match | 0.9440 | 0.0146 |
+| - bbh_cot_fewshot_temporal_sequences                       | 2       | get-answer | 3      | exact_match | 1.0000 | 0.0000 |
+| - bbh_cot_fewshot_tracking_shuffled_objects_five_objects   | 2       | get-answer | 3      | exact_match | 0.9800 | 0.0089 |
+| - bbh_cot_fewshot_tracking_shuffled_objects_seven_objects  | 2       | get-answer | 3      | exact_match | 0.9560 | 0.0130 |
+| - bbh_cot_fewshot_tracking_shuffled_objects_three_objects  | 2       | get-answer | 3      | exact_match | 0.9640 | 0.0118 |
+| - bbh_cot_fewshot_web_of_lies                              | 2       | get-answer | 3      | exact_match | 1.0000 | 0.0000 |
+| - bbh_cot_fewshot_word_sorting                             | 2       | get-answer | 3      | exact_match | 0.6560 | 0.0301 |
 
 Qwen2-72B-Instruct:
 
@@ -96,9 +96,9 @@ Score vs selected others (sourced from: (https://lmsys.org/blog/2024-04-19-arena
 | Claude-3-Opus-20240229 | 60.4 | (-3.3, 2.4) | 541 |
 | Smaug-Llama-3-70B-Instruct | 56.7 | (-2.2, 2.6) | 661 |
 | GPT-4-0314 | 50.0 | (-0.0, 0.0) | 423 |
-| Smaug-Qwen2-72B-Instruct | score: 48.0 | (-1.8, 2.1) | 628 |
+| Smaug-Qwen2-72B-Instruct | 48.0 | (-1.8, 2.1) | 628 |
 | Claude-3-Sonnet-20240229 | 46.8 | (-2.1, 2.2) | 552 |
-| Qwen2-72B-Instruct | score: 43.5 | (-2.6, 2.7) | 531 |
+| Qwen2-72B-Instruct | 43.5 | (-2.6, 2.7) | 531 |
 | Llama-3-70B-Instruct | 41.1 | (-2.5, 2.4) | 583 |
 | GPT-4-0613 | 37.9 | (-2.2, 2.0) | 354 |
 | Mistral-Large-2402 | 37.7 | (-1.9, 2.6) | 400 |
@@ -110,21 +110,26 @@ Score vs selected others (sourced from: (https://lmsys.org/blog/2024-04-19-arena
 
 ## MT-Bench
 
-########## First turn ##########
-                                score
-model                     turn
-Qwen2-72B-Instruct        1     9.18125
-Smaug-Qwen2-72B-Instruct  1     9.05625
-########## Second turn ##########
-                                score
-model                     turn
-Qwen2-72B-Instruct        2     8.74684
-Smaug-Qwen2-72B-Instruct  2     8.67500
-########## Average ##########
-                          score
-model
-Qwen2-72B-Instruct        8.96541
-Smaug-Qwen2-72B-Instruct  8.86563
+First turn
+
+| Model                    | Turn | Score   |
+|--------------------------|------|---------|
+| Qwen2-72B-Instruct       | 1    | 9.18125 |
+| Smaug-Qwen2-72B-Instruct | 1    | 9.05625 |
+
+Second turn
+
+| Model                    | Turn | Score   |
+|--------------------------|------|---------|
+| Qwen2-72B-Instruct       | 2    | 8.74684 |
+| Smaug-Qwen2-72B-Instruct | 2    | 8.67500 |
+
+Average
+
+| Model                    | Score   |
+|--------------------------|---------|
+| Qwen2-72B-Instruct       | 8.96541 |
+| Smaug-Qwen2-72B-Instruct | 8.86563 |
 
 
 # Model Card for Model ID
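For readers who want to sanity-check the BBH table in the first hunk: the sketch below shows one plausible way to rerun the benchmark with Eleuther's lm-evaluation-harness Python API. It is a minimal sketch, not the command used for this commit; the repo id `abacusai/Smaug-Qwen2-72B-Instruct`, the task-group name `bbh_cot_fewshot`, and the result-key format are assumptions that can vary across harness versions.

```python
# Minimal reproduction sketch; assumptions flagged inline, not the authors' exact setup.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",  # Hugging Face transformers backend
    # Assumed repo id for this model; a 72B checkpoint needs multiple GPUs.
    model_args="pretrained=abacusai/Smaug-Qwen2-72B-Instruct,dtype=bfloat16",
    # Assumed group name: recent harness versions expose the chain-of-thought
    # BBH subtasks as "bbh_cot_fewshot"; they are 3-shot by construction,
    # matching the n-shot column in the table above.
    tasks=["bbh_cot_fewshot"],
    batch_size="auto",
)

# Result keys combine metric and filter names, e.g. "exact_match,get-answer".
for task, metrics in sorted(results["results"].items()):
    print(task, metrics.get("exact_match,get-answer"))
```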
 