rwmasood committed on
Commit 8091f32 · verified · 1 Parent(s): b1cd694

Upload llm-eval-Kiwi-1.0-0.7B-32k-Instruct-complete.json

llm-eval-Kiwi-1.0-0.7B-32k-Instruct-complete.json ADDED
@@ -0,0 +1,207 @@
+ {
+   "arc_challenge": {
+     "alias": "arc_challenge",
+     "acc,none": 0.32081911262798635,
+     "acc_stderr,none": 0.013640943091946526,
+     "acc_norm,none": 0.32337883959044367,
+     "acc_norm_stderr,none": 0.013669421630012118
+   },
+   "gpqa_diamond_cot_n_shot": {
+     "alias": "gpqa_diamond_cot_n_shot",
+     "exact_match,strict-match": 0.005050505050505051,
+     "exact_match_stderr,strict-match": 0.005050505050505052,
+     "exact_match,flexible-extract": 0.23232323232323232,
+     "exact_match_stderr,flexible-extract": 0.03008862949021749
+   },
+   "gpqa_diamond_cot_zeroshot": {
+     "alias": "gpqa_diamond_cot_zeroshot",
+     "exact_match,strict-match": 0.0,
+     "exact_match_stderr,strict-match": 0.0,
+     "exact_match,flexible-extract": 0.1717171717171717,
+     "exact_match_stderr,flexible-extract": 0.026869716187429917
+   },
+   "gpqa_diamond_generative_n_shot": {
+     "alias": "gpqa_diamond_generative_n_shot",
+     "exact_match,strict-match": 0.0,
+     "exact_match_stderr,strict-match": 0.0,
+     "exact_match,flexible-extract": 0.2222222222222222,
+     "exact_match_stderr,flexible-extract": 0.02962022787479049
+   },
+   "gpqa_diamond_n_shot": {
+     "alias": "gpqa_diamond_n_shot",
+     "acc,none": 0.2878787878787879,
+     "acc_stderr,none": 0.03225883512300993,
+     "acc_norm,none": 0.2878787878787879,
+     "acc_norm_stderr,none": 0.03225883512300993
+   },
+   "gpqa_diamond_zeroshot": {
+     "alias": "gpqa_diamond_zeroshot",
+     "acc,none": 0.3181818181818182,
+     "acc_stderr,none": 0.03318477333845331,
+     "acc_norm,none": 0.3181818181818182,
+     "acc_norm_stderr,none": 0.03318477333845331
+   },
+   "gpqa_extended_cot_n_shot": {
+     "alias": "gpqa_extended_cot_n_shot",
+     "exact_match,strict-match": 0.005494505494505495,
+     "exact_match_stderr,strict-match": 0.0031664282264934744,
+     "exact_match,flexible-extract": 0.21062271062271062,
+     "exact_match_stderr,flexible-extract": 0.017466128919599165
+   },
+   "gpqa_extended_cot_zeroshot": {
+     "alias": "gpqa_extended_cot_zeroshot",
+     "exact_match,strict-match": 0.0,
+     "exact_match_stderr,strict-match": 0.0,
+     "exact_match,flexible-extract": 0.2032967032967033,
+     "exact_match_stderr,flexible-extract": 0.017239125210643556
+   },
+   "gpqa_extended_generative_n_shot": {
+     "alias": "gpqa_extended_generative_n_shot",
+     "exact_match,strict-match": 0.0,
+     "exact_match_stderr,strict-match": 0.0,
+     "exact_match,flexible-extract": 0.20695970695970695,
+     "exact_match_stderr,flexible-extract": 0.017353707620392123
+   },
+   "gpqa_extended_n_shot": {
+     "alias": "gpqa_extended_n_shot",
+     "acc,none": 0.25824175824175827,
+     "acc_stderr,none": 0.01874762138022981,
+     "acc_norm,none": 0.25824175824175827,
+     "acc_norm_stderr,none": 0.01874762138022981
+   },
+   "gpqa_extended_zeroshot": {
+     "alias": "gpqa_extended_zeroshot",
+     "acc,none": 0.26556776556776557,
+     "acc_stderr,none": 0.018917567557968248,
+     "acc_norm,none": 0.26556776556776557,
+     "acc_norm_stderr,none": 0.018917567557968248
+   },
+   "gpqa_main_cot_n_shot": {
+     "alias": "gpqa_main_cot_n_shot",
+     "exact_match,strict-match": 0.004464285714285714,
+     "exact_match_stderr,strict-match": 0.003153193710465662,
+     "exact_match,flexible-extract": 0.20535714285714285,
+     "exact_match_stderr,flexible-extract": 0.019106763899193864
+   },
+   "gpqa_main_cot_zeroshot": {
+     "alias": "gpqa_main_cot_zeroshot",
+     "exact_match,strict-match": 0.002232142857142857,
+     "exact_match_stderr,strict-match": 0.0022321428571428744,
+     "exact_match,flexible-extract": 0.20089285714285715,
+     "exact_match_stderr,flexible-extract": 0.018950950292372627
+   },
+   "gpqa_main_generative_n_shot": {
+     "alias": "gpqa_main_generative_n_shot",
+     "exact_match,strict-match": 0.0,
+     "exact_match_stderr,strict-match": 0.0,
+     "exact_match,flexible-extract": 0.24330357142857142,
+     "exact_match_stderr,flexible-extract": 0.020294638625866775
+   },
+   "gpqa_main_n_shot": {
+     "alias": "gpqa_main_n_shot",
+     "acc,none": 0.328125,
+     "acc_stderr,none": 0.0222080353262888,
+     "acc_norm,none": 0.328125,
+     "acc_norm_stderr,none": 0.0222080353262888
+   },
+   "gpqa_main_zeroshot": {
+     "alias": "gpqa_main_zeroshot",
+     "acc,none": 0.234375,
+     "acc_stderr,none": 0.020035949758324928,
+     "acc_norm,none": 0.234375,
+     "acc_norm_stderr,none": 0.020035949758324928
+   },
+   "hellaswag": {
+     "alias": "hellaswag",
+     "acc,none": 0.3990240987851026,
+     "acc_stderr,none": 0.004886969266944274,
+     "acc_norm,none": 0.4858593905596495,
+     "acc_norm_stderr,none": 0.004987785530475671
+   },
+   "ifeval": {
+     "alias": "ifeval",
+     "prompt_level_strict_acc,none": 0.1423290203327172,
+     "prompt_level_strict_acc_stderr,none": 0.0150352345622028,
+     "inst_level_strict_acc,none": 0.23381294964028776,
+     "inst_level_strict_acc_stderr,none": "N/A",
+     "prompt_level_loose_acc,none": 0.16820702402957485,
+     "prompt_level_loose_acc_stderr,none": 0.016096550188063007,
+     "inst_level_loose_acc,none": 0.2709832134292566,
+     "inst_level_loose_acc_stderr,none": "N/A"
+   },
+   "mmlu_pro": {
+     "exact_match,custom-extract": 0.12890625,
+     "exact_match_stderr,custom-extract": 0.0030434275267919382,
+     "alias": "mmlu_pro"
+   },
+   "mmlu_pro_biology": {
+     "alias": " - biology",
+     "exact_match,custom-extract": 0.17154811715481172,
+     "exact_match_stderr,custom-extract": 0.014088673719424991
+   },
+   "mmlu_pro_business": {
+     "alias": " - business",
+     "exact_match,custom-extract": 0.11913814955640051,
+     "exact_match_stderr,custom-extract": 0.011540276571470746
+   },
+   "mmlu_pro_chemistry": {
+     "alias": " - chemistry",
+     "exact_match,custom-extract": 0.09187279151943463,
+     "exact_match_stderr,custom-extract": 0.008588859484723885
+   },
+   "mmlu_pro_computer_science": {
+     "alias": " - computer_science",
+     "exact_match,custom-extract": 0.12926829268292683,
+     "exact_match_stderr,custom-extract": 0.016589241600938224
+   },
+   "mmlu_pro_economics": {
+     "alias": " - economics",
+     "exact_match,custom-extract": 0.18838862559241706,
+     "exact_match_stderr,custom-extract": 0.013467519528615376
+   },
+   "mmlu_pro_engineering": {
+     "alias": " - engineering",
+     "exact_match,custom-extract": 0.08565531475748193,
+     "exact_match_stderr,custom-extract": 0.008994860895662203
+   },
+   "mmlu_pro_health": {
+     "alias": " - health",
+     "exact_match,custom-extract": 0.14425427872860636,
+     "exact_match_stderr,custom-extract": 0.012292088876836848
+   },
+   "mmlu_pro_history": {
+     "alias": " - history",
+     "exact_match,custom-extract": 0.15485564304461943,
+     "exact_match_stderr,custom-extract": 0.018558256274560196
+   },
+   "mmlu_pro_law": {
+     "alias": " - law",
+     "exact_match,custom-extract": 0.12534059945504086,
+     "exact_match_stderr,custom-extract": 0.009983182840215515
+   },
+   "mmlu_pro_math": {
+     "alias": " - math",
+     "exact_match,custom-extract": 0.09844559585492228,
+     "exact_match_stderr,custom-extract": 0.008108251878622208
+   },
+   "mmlu_pro_other": {
+     "alias": " - other",
+     "exact_match,custom-extract": 0.14502164502164502,
+     "exact_match_stderr,custom-extract": 0.011590258522971733
+   },
+   "mmlu_pro_philosophy": {
+     "alias": " - philosophy",
+     "exact_match,custom-extract": 0.14829659318637275,
+     "exact_match_stderr,custom-extract": 0.015925574493977492
+   },
+   "mmlu_pro_physics": {
+     "alias": " - physics",
+     "exact_match,custom-extract": 0.1100846805234796,
+     "exact_match_stderr,custom-extract": 0.008687612438998104
+   },
+   "mmlu_pro_psychology": {
+     "alias": " - psychology",
+     "exact_match,custom-extract": 0.17042606516290726,
+     "exact_match_stderr,custom-extract": 0.013318839850041711
+   }
+ }
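
The top-level keys of the file are task names and the metric keys appear to follow the "metric,filter" naming used by lm-evaluation-harness style outputs (e.g. "acc,none", "exact_match,flexible-extract"). As a minimal sketch, not part of the commit and assuming the uploaded JSON has been downloaded to the working directory, the results could be summarized like this:

```python
# Minimal sketch: print one line per (task, metric) score from the uploaded file.
# Assumes the JSON above is saved locally under its original filename.
import json

PATH = "llm-eval-Kiwi-1.0-0.7B-32k-Instruct-complete.json"

with open(PATH) as f:
    results = json.load(f)

# Each top-level key is a task; skip aliases, stderr entries, and "N/A" values.
for task, metrics in sorted(results.items()):
    for key, value in metrics.items():
        if key == "alias" or "stderr" in key or not isinstance(value, (int, float)):
            continue
        print(f"{task:40s} {key:40s} {value:.4f}")
```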