rwmasood committed
Commit 4d197f3 · verified · 1 Parent(s): 1017834

Upload llm-eval-Qwen2.5-0.5B-Instruct-ifeval.json

llm-eval-Qwen2.5-0.5B-Instruct-ifeval.json ADDED
@@ -0,0 +1,207 @@
+ {
+   "arc_challenge": {
+     "alias": "arc_challenge",
+     "acc,none": 0.30631399317406144,
+     "acc_stderr,none": 0.013470584417276513,
+     "acc_norm,none": 0.33447098976109213,
+     "acc_norm_stderr,none": 0.013787460322441382
+   },
+   "gpqa_diamond_cot_n_shot": {
+     "alias": "gpqa_diamond_cot_n_shot",
+     "exact_match,strict-match": 0.0,
+     "exact_match_stderr,strict-match": 0.0,
+     "exact_match,flexible-extract": 0.15151515151515152,
+     "exact_match_stderr,flexible-extract": 0.0255456504266036
+   },
+   "gpqa_diamond_cot_zeroshot": {
+     "alias": "gpqa_diamond_cot_zeroshot",
+     "exact_match,strict-match": 0.0,
+     "exact_match_stderr,strict-match": 0.0,
+     "exact_match,flexible-extract": 0.12626262626262627,
+     "exact_match_stderr,flexible-extract": 0.02366435940288024
+   },
+   "gpqa_diamond_generative_n_shot": {
+     "alias": "gpqa_diamond_generative_n_shot",
+     "exact_match,strict-match": 0.005050505050505051,
+     "exact_match_stderr,strict-match": 0.0050505050505050535,
+     "exact_match,flexible-extract": 0.18686868686868688,
+     "exact_match_stderr,flexible-extract": 0.02777253333421898
+   },
+   "gpqa_diamond_n_shot": {
+     "alias": "gpqa_diamond_n_shot",
+     "acc,none": 0.23232323232323232,
+     "acc_stderr,none": 0.030088629490217483,
+     "acc_norm,none": 0.23232323232323232,
+     "acc_norm_stderr,none": 0.030088629490217483
+   },
+   "gpqa_diamond_zeroshot": {
+     "alias": "gpqa_diamond_zeroshot",
+     "acc,none": 0.2474747474747475,
+     "acc_stderr,none": 0.03074630074212451,
+     "acc_norm,none": 0.2474747474747475,
+     "acc_norm_stderr,none": 0.03074630074212451
+   },
+   "gpqa_extended_cot_n_shot": {
+     "alias": "gpqa_extended_cot_n_shot",
+     "exact_match,strict-match": 0.0,
+     "exact_match_stderr,strict-match": 0.0,
+     "exact_match,flexible-extract": 0.15934065934065933,
+     "exact_match_stderr,flexible-extract": 0.015677437397173054
+   },
+   "gpqa_extended_cot_zeroshot": {
+     "alias": "gpqa_extended_cot_zeroshot",
+     "exact_match,strict-match": 0.0018315018315018315,
+     "exact_match_stderr,strict-match": 0.0018315018315018447,
+     "exact_match,flexible-extract": 0.1336996336996337,
+     "exact_match_stderr,flexible-extract": 0.014578106095655245
+   },
+   "gpqa_extended_generative_n_shot": {
+     "alias": "gpqa_extended_generative_n_shot",
+     "exact_match,strict-match": 0.0,
+     "exact_match_stderr,strict-match": 0.0,
+     "exact_match,flexible-extract": 0.20146520146520147,
+     "exact_match_stderr,flexible-extract": 0.017181010109243344
+   },
+   "gpqa_extended_n_shot": {
+     "alias": "gpqa_extended_n_shot",
+     "acc,none": 0.2600732600732601,
+     "acc_stderr,none": 0.018790743352015988,
+     "acc_norm,none": 0.2600732600732601,
+     "acc_norm_stderr,none": 0.018790743352015988
+   },
+   "gpqa_extended_zeroshot": {
+     "alias": "gpqa_extended_zeroshot",
+     "acc,none": 0.26373626373626374,
+     "acc_stderr,none": 0.018875713580372485,
+     "acc_norm,none": 0.26373626373626374,
+     "acc_norm_stderr,none": 0.018875713580372485
+   },
+   "gpqa_main_cot_n_shot": {
+     "alias": "gpqa_main_cot_n_shot",
+     "exact_match,strict-match": 0.0,
+     "exact_match_stderr,strict-match": 0.0,
+     "exact_match,flexible-extract": 0.13616071428571427,
+     "exact_match_stderr,flexible-extract": 0.016221410863569787
+   },
+   "gpqa_main_cot_zeroshot": {
+     "alias": "gpqa_main_cot_zeroshot",
+     "exact_match,strict-match": 0.0,
+     "exact_match_stderr,strict-match": 0.0,
+     "exact_match,flexible-extract": 0.15625,
+     "exact_match_stderr,flexible-extract": 0.017173671221421365
+   },
+   "gpqa_main_generative_n_shot": {
+     "alias": "gpqa_main_generative_n_shot",
+     "exact_match,strict-match": 0.0,
+     "exact_match_stderr,strict-match": 0.0,
+     "exact_match,flexible-extract": 0.19642857142857142,
+     "exact_match_stderr,flexible-extract": 0.018791472419524228
+   },
+   "gpqa_main_n_shot": {
+     "alias": "gpqa_main_n_shot",
+     "acc,none": 0.2700892857142857,
+     "acc_stderr,none": 0.021000749078822385,
+     "acc_norm,none": 0.2700892857142857,
+     "acc_norm_stderr,none": 0.021000749078822385
+   },
+   "gpqa_main_zeroshot": {
+     "alias": "gpqa_main_zeroshot",
+     "acc,none": 0.30580357142857145,
+     "acc_stderr,none": 0.021792582688756983,
+     "acc_norm,none": 0.30580357142857145,
+     "acc_norm_stderr,none": 0.021792582688756983
+   },
+   "hellaswag": {
+     "alias": "hellaswag",
+     "acc,none": 0.4047002589125672,
+     "acc_stderr,none": 0.004898308167211847,
+     "acc_norm,none": 0.5237004580760805,
+     "acc_norm_stderr,none": 0.004984172621822888
+   },
+   "ifeval": {
+     "alias": "ifeval",
+     "prompt_level_strict_acc,none": 0.20147874306839186,
+     "prompt_level_strict_acc_stderr,none": 0.017260802262371488,
+     "inst_level_strict_acc,none": 0.3405275779376499,
+     "inst_level_strict_acc_stderr,none": "N/A",
+     "prompt_level_loose_acc,none": 0.2365988909426987,
+     "prompt_level_loose_acc_stderr,none": 0.018288827582625643,
+     "inst_level_loose_acc,none": 0.3752997601918465,
+     "inst_level_loose_acc_stderr,none": "N/A"
+   },
+   "mmlu_pro": {
+     "exact_match,custom-extract": 0.14037566489361702,
+     "exact_match_stderr,custom-extract": 0.003117505733576593,
+     "alias": "mmlu_pro"
+   },
+   "mmlu_pro_biology": {
+     "alias": " - biology",
+     "exact_match,custom-extract": 0.25662482566248257,
+     "exact_match_stderr,custom-extract": 0.016322882305354162
+   },
+   "mmlu_pro_business": {
+     "alias": " - business",
+     "exact_match,custom-extract": 0.1267427122940431,
+     "exact_match_stderr,custom-extract": 0.011851395705593072
+   },
+   "mmlu_pro_chemistry": {
+     "alias": " - chemistry",
+     "exact_match,custom-extract": 0.06537102473498234,
+     "exact_match_stderr,custom-extract": 0.00734989211563516
+   },
+   "mmlu_pro_computer_science": {
+     "alias": " - computer_science",
+     "exact_match,custom-extract": 0.14878048780487804,
+     "exact_match_stderr,custom-extract": 0.017596736073033845
+   },
+   "mmlu_pro_economics": {
+     "alias": " - economics",
+     "exact_match,custom-extract": 0.245260663507109,
+     "exact_match_stderr,custom-extract": 0.01481830928170158
+   },
+   "mmlu_pro_engineering": {
+     "alias": " - engineering",
+     "exact_match,custom-extract": 0.058823529411764705,
+     "exact_match_stderr,custom-extract": 0.007562639370979075
+   },
+   "mmlu_pro_health": {
+     "alias": " - health",
+     "exact_match,custom-extract": 0.13447432762836187,
+     "exact_match_stderr,custom-extract": 0.011935720476846831
+   },
+   "mmlu_pro_history": {
+     "alias": " - history",
+     "exact_match,custom-extract": 0.14698162729658792,
+     "exact_match_stderr,custom-extract": 0.018164310621441037
+   },
+   "mmlu_pro_law": {
+     "alias": " - law",
+     "exact_match,custom-extract": 0.13533151680290645,
+     "exact_match_stderr,custom-extract": 0.010314019468785332
+   },
+   "mmlu_pro_math": {
+     "alias": " - math",
+     "exact_match,custom-extract": 0.12731310140636565,
+     "exact_match_stderr,custom-extract": 0.009071913335559057
+   },
+   "mmlu_pro_other": {
+     "alias": " - other",
+     "exact_match,custom-extract": 0.15151515151515152,
+     "exact_match_stderr,custom-extract": 0.011801826546500667
+   },
+   "mmlu_pro_philosophy": {
+     "alias": " - philosophy",
+     "exact_match,custom-extract": 0.12625250501002003,
+     "exact_match_stderr,custom-extract": 0.014883268009546964
+   },
+   "mmlu_pro_physics": {
+     "alias": " - physics",
+     "exact_match,custom-extract": 0.0869899923017706,
+     "exact_match_stderr,custom-extract": 0.007822310824931376
+   },
+   "mmlu_pro_psychology": {
+     "alias": " - psychology",
+     "exact_match,custom-extract": 0.2543859649122807,
+     "exact_match_stderr,custom-extract": 0.0154267502913602
+   }
+ }
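For reference, the uploaded results can be read back with Python's standard json module. The sketch below assumes the file has been downloaded to the current working directory under the name shown in this commit; the metric keys it prints ("acc,none", "exact_match,custom-extract", the ifeval fields) are taken from the JSON above, and nothing else about the repository is assumed.

import json

# Load the evaluation results uploaded in this commit (local path assumed).
with open("llm-eval-Qwen2.5-0.5B-Instruct-ifeval.json") as f:
    results = json.load(f)

# Headline numbers: IFEval prompt-level strict accuracy and overall MMLU-Pro exact match.
print("ifeval prompt_level_strict_acc:", results["ifeval"]["prompt_level_strict_acc,none"])
print("mmlu_pro exact_match:", results["mmlu_pro"]["exact_match,custom-extract"])

# Dump one primary metric per task, whichever of the two key styles the task uses.
for task, metrics in results.items():
    score = metrics.get("acc,none", metrics.get("exact_match,custom-extract"))
    if score is not None:
        print(f"{task}: {score:.4f}")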