wdevazelhes committed on
Commit
4503191
·
verified ·
1 Parent(s): 7aae4f3

add scores for 1B

Browse files
Files changed (1) hide show
  1. README.md +81 -62
README.md CHANGED
@@ -94,131 +94,150 @@ We report in the following table our internal pipeline benchmarks:
94
  <col style="width: 10%;">
95
  <col style="width: 7%;">
96
  <col style="width: 7%;">
 
97
  <col style="background-color: rgba(80, 15, 213, 0.5); width: 7%;">
98
  </colgroup>
99
  <thead>
100
  <tr>
101
  <th>Category</th>
102
  <th>Benchmark</th>
103
- <th>Llama-3.1-8B-Instruct</th>
104
- <th>Qwen2.5-7B-Instruct</th>
105
- <th>Falcon3-7B-Instruct</th>
 
106
  </tr>
107
  </thead>
108
  <tbody>
109
  <tr>
110
  <td rowspan="3">General</td>
111
  <td>MMLU (5-shot)</td>
112
- <td>55.9</td>
113
- <td><b>72.4</b></td>
114
- <td>68</td>
 
115
  </tr>
116
  <tr>
117
  <td>MMLU-PRO (5-shot)</td>
118
- <td>21.8</td>
119
- <td>35.8</td>
120
- <td><b>40.7</b></td>
 
121
  </tr>
122
  <tr>
123
  <td>IFEval</td>
124
- <td><b>78.8</b></td>
125
- <td>74.7</td>
126
- <td>76.5</td>
 
127
  </tr>
128
  <tr>
129
  <td rowspan="3">Math</td>
130
  <td>GSM8K (5-shot)</td>
131
- <td>78.1</td>
132
- <td>77.5</td>
133
- <td><b>79.1</b></td>
 
134
  </tr>
135
  <tr>
136
  <td>GSM8K (8-shot, COT)</td>
137
- <td>79.8</td>
138
- <td>72.7</td>
139
- <td><b>80.9</b></td>
 
140
  </tr>
141
  <tr>
142
  <td>MATH Lvl-5 (4-shot)</td>
143
- <td>10.4</td>
144
- <td>26</td>
145
- <td><b>33.1</b></td>
 
146
  </tr>
147
  <tr>
148
- <td rowspan="5">Reasoning</td>
149
  <td>Arc Challenge (25-shot)</td>
150
- <td>46.6</td>
151
- <td>55.7</td>
152
- <td><b>65.9</b></td>
 
153
  </tr>
154
  <tr>
155
  <td>GPQA (0-shot)</td>
156
- <td><b>33.6</b></td>
157
- <td>31.9</td>
158
- <td>32</td>
 
159
  </tr>
160
  <tr>
161
  <td>GPQA (0-shot, COT)</td>
162
- <td>9.6</td>
163
- <td>13.8</td>
164
- <td><b>22.3</b></td>
 
165
  </tr>
166
  <tr>
167
  <td>MUSR (0-shot)</td>
168
- <td>38.6</td>
169
- <td>40.7</td>
170
- <td><b>46.4</b></td>
 
171
  </tr>
172
  <tr>
173
  <td>BBH (3-shot)</td>
174
- <td>43.7</td>
175
- <td><b>53.9</b></td>
176
- <td>52.4</td>
 
177
  </tr>
178
  <tr>
179
- <td rowspan="4">CommonSense Understanding</td>
 
 
 
 
 
 
 
180
  <td>PIQA (0-shot)</td>
181
- <td><b>78.9</b></td>
182
- <td>73.7</td>
183
- <td>78.8</td>
 
184
  </tr>
185
  <tr>
186
  <td>SciQ (0-shot)</td>
187
- <td>80.2</td>
188
- <td>50.9</td>
189
- <td><b>94.7</b></td>
 
190
  </tr>
191
  <tr>
192
  <td>Winogrande (0-shot)</td>
193
  <td>-</td>
194
  <td>-</td>
195
- <td>70.4</td>
 
196
  </tr>
197
  <tr>
198
  <td>OpenbookQA (0-shot)</td>
199
- <td><b>46.2</b></td>
200
- <td>42.4</td>
201
- <td>45.8</td>
 
202
  </tr>
203
  <tr>
204
- <td rowspan="2">Instructions following</td>
205
  <td>MT-Bench (avg)</td>
206
- <td>7.86</td>
207
- <td><b>8.54</b></td>
208
- <td>8.36</td>
 
209
  </tr>
210
  <tr>
 
211
  <td>Alpaca (WC)</td>
212
- <td>26.57</td>
213
- <td><b>31.5</b></td>
214
- <td>26.13</td>
215
- </tr>
216
- <tr>
217
- <td>Tool use</td>
218
- <td>BFCL AST (avg)</td>
219
- <td>90.6</td>
220
- <td><b>91.4</b></td>
221
- <td>72.3</td>
222
  </tr>
223
  </tbody>
224
  </table>
 
94
  <col style="width: 10%;">
95
  <col style="width: 7%;">
96
  <col style="width: 7%;">
97
+ <col style="width: 7%;">
98
  <col style="background-color: rgba(80, 15, 213, 0.5); width: 7%;">
99
  </colgroup>
100
  <thead>
101
  <tr>
102
  <th>Category</th>
103
  <th>Benchmark</th>
104
+ <th>Llama-3.2-1B</th>
105
+ <th>Qwen2.5-1.5B</th>
106
+ <th>SmolLM2-1.7B</th>
107
+ <th>Falcon3-1B-Instruct</th>
108
  </tr>
109
  </thead>
110
  <tbody>
111
  <tr>
112
  <td rowspan="3">General</td>
113
  <td>MMLU (5-shot)</td>
114
+ <td>23.4</td>
115
+ <td><b>58.4</b></td>
116
+ <td>48.4</td>
117
+ <td>43.9</td>
118
  </tr>
119
  <tr>
120
  <td>MMLU-PRO (5-shot)</td>
121
+ <td>11.3</td>
122
+ <td><b>21.3</b></td>
123
+ <td>17.2</td>
124
+ <td>18.6</td>
125
  </tr>
126
  <tr>
127
  <td>IFEval</td>
128
+ <td><b>55.8</b></td>
129
+ <td>44.4</td>
130
+ <td>53.0</td>
131
+ <td>54.4</td>
132
  </tr>
133
  <tr>
134
  <td rowspan="3">Math</td>
135
  <td>GSM8K (5-shot)</td>
136
+ <td>37.4</td>
137
+ <td><b>57.2</b></td>
138
+ <td>43.4</td>
139
+ <td>38.6</td>
140
  </tr>
141
  <tr>
142
  <td>GSM8K (8-shot, COT)</td>
143
+ <td>35.6</td>
144
+ <td><b>62.2</b></td>
145
+ <td>47.2</td>
146
+ <td>41.8</td>
147
  </tr>
148
  <tr>
149
  <td>MATH Lvl-5 (4-shot)</td>
150
+ <td><b>3.9</b></td>
151
+ <td>0.2</td>
152
+ <td>0.1</td>
153
+ <td>1.0</td>
154
  </tr>
155
  <tr>
156
+ <td rowspan="6">Reasoning</td>
157
  <td>Arc Challenge (25-shot)</td>
158
+ <td>34.1</td>
159
+ <td>47.0</td>
160
+ <td><b>47.6</b></td>
161
+ <td>45.9</td>
162
  </tr>
163
  <tr>
164
  <td>GPQA (0-shot)</td>
165
+ <td>25.3</td>
166
+ <td><b>29.6</b></td>
167
+ <td>28.7</td>
168
+ <td>26.5</td>
169
  </tr>
170
  <tr>
171
  <td>GPQA (0-shot, COT)</td>
172
+ <td>13.2</td>
173
+ <td>9.2</td>
174
+ <td>16.0</td>
175
+ <td><b>21.3</b></td>
176
  </tr>
177
  <tr>
178
  <td>MUSR (0-shot)</td>
179
+ <td>32.4</td>
180
+ <td>36.8</td>
181
+ <td>33.0</td>
182
+ <td><b>40.7</b></td>
183
  </tr>
184
  <tr>
185
  <td>BBH (3-shot)</td>
186
+ <td>30.3</td>
187
+ <td><b>38.5</b></td>
188
+ <td>33.1</td>
189
+ <td>35.1</td>
190
  </tr>
191
  <tr>
192
+ <td>BBH (3-shot, COT)</td>
193
+ <td>0.0</td>
194
+ <td>20.3</td>
195
+ <td>0.8</td>
196
+ <td><b>30.5</b></td>
197
+ </tr>
198
+ <tr>
199
+ <td rowspan="5">CommonSense Understanding</td>
200
  <td>PIQA (0-shot)</td>
201
+ <td>72.1</td>
202
+ <td>73.2</td>
203
+ <td><b>74.4</b></td>
204
+ <td>72.0</td>
205
  </tr>
206
  <tr>
207
  <td>SciQ (0-shot)</td>
208
+ <td>61.8</td>
209
+ <td>69.5</td>
210
+ <td>71.4</td>
211
+ <td><b>86.8</b></td>
212
  </tr>
213
  <tr>
214
  <td>Winogrande (0-shot)</td>
215
  <td>-</td>
216
  <td>-</td>
217
+ <td>-</td>
218
+ <td><b>60.2</b></td>
219
  </tr>
220
  <tr>
221
  <td>OpenbookQA (0-shot)</td>
222
+ <td>40.2</td>
223
+ <td>40.4</td>
224
+ <td><b>42.8</b></td>
225
+ <td>40.0</td>
226
  </tr>
227
  <tr>
 
228
  <td>MT-Bench (avg)</td>
229
+ <td>5.4</td>
230
+ <td><b>7.1</b></td>
231
+ <td>6.1</td>
232
+ <td>5.5</td>
233
  </tr>
234
  <tr>
235
+ <td rowspan="1">Instructions following</td>
236
  <td>Alpaca (WC)</td>
237
+ <td><b>8.6</b></td>
238
+ <td><b>8.6</b></td>
239
+ <td>5.4</td>
240
+ <td>6.1</td>
 
 
 
 
 
 
241
  </tr>
242
  </tbody>
243
  </table>