Commit 0eee765 (verified) · KMasaki committed · 1 parent: 5e5c69d

Model save

Files changed (4)
  1. README.md +4 -6
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +597 -219
README.md CHANGED
@@ -1,11 +1,9 @@
 ---
 base_model: Qwen/Qwen2.5-1.5B-Instruct
-datasets: open-r1/OpenR1-Math-220k
 library_name: transformers
 model_name: Qwen2.5-1.5B-Open-R1-Distill
 tags:
 - generated_from_trainer
-- open-r1
 - trl
 - sft
 licence: license
@@ -13,7 +11,7 @@ licence: license
 
 # Model Card for Qwen2.5-1.5B-Open-R1-Distill
 
-This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) on the [open-r1/OpenR1-Math-220k](https://huggingface.co/datasets/open-r1/OpenR1-Math-220k) dataset.
+This model is a fine-tuned version of [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 
 ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
 
 ## Training procedure
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/kawamuramasaki/open-r1/runs/8h071yth)
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/kawamuramasaki/open-r1/runs/9mj5p99y)
 
 
 This model was trained with SFT.
@@ -39,8 +37,8 @@ This model was trained with SFT.
 - TRL: 0.16.0.dev0
 - Transformers: 4.49.0
 - Pytorch: 2.5.1
-- Datasets: 3.3.2
-- Tokenizers: 0.21.0
+- Datasets: 3.5.0
+- Tokenizers: 0.21.1
 
 ## Citations
 
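Note: the Quick start section itself is unchanged and collapsed in this diff; only its closing line, `print(output["generated_text"])`, appears as hunk context above. For reference, a minimal usage sketch following the standard TRL model-card template; the Hub repo id below is an assumption and is not shown anywhere in this commit.

```python
# Minimal sketch of the model card's Quick start usage (TRL template).
# The repo id is assumed for illustration; substitute the actual Hub id of this checkpoint.
from transformers import pipeline

question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
generator = pipeline("text-generation", model="KMasaki/Qwen2.5-1.5B-Open-R1-Distill", device="cuda")
output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
print(output["generated_text"])
```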
all_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "total_flos": 488621249396736.0,
-    "train_loss": 0.5792026920105094,
-    "train_runtime": 4310.9384,
+    "train_loss": 0.5610535256564617,
+    "train_runtime": 6454.4726,
     "train_samples": 93733,
-    "train_samples_per_second": 7.957,
-    "train_steps_per_second": 0.062
+    "train_samples_per_second": 5.314,
+    "train_steps_per_second": 0.083
 }
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "total_flos": 488621249396736.0,
-    "train_loss": 0.5792026920105094,
-    "train_runtime": 4310.9384,
+    "train_loss": 0.5610535256564617,
+    "train_runtime": 6454.4726,
     "train_samples": 93733,
-    "train_samples_per_second": 7.957,
-    "train_steps_per_second": 0.062
+    "train_samples_per_second": 5.314,
+    "train_steps_per_second": 0.083
 }
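Note: all_results.json and train_results.json carry the same updated throughput numbers. As a quick sanity check on how they fit together (values copied from the diff above; the per-step sequence count is inferred from those numbers, not stated anywhere in the commit):

```python
# Arithmetic check on the updated metrics; values are copied from the diff above.
# The "sequences per optimizer step" figure is an inference from these numbers, not a logged value.
global_step = 536              # new global_step / max_steps from trainer_state.json
train_runtime = 6454.4726      # seconds
train_samples_per_second = 5.314
train_steps_per_second = 0.083

print(round(global_step / train_runtime, 3))        # 0.083 -> matches train_steps_per_second
implied = train_samples_per_second * train_runtime  # ~34,299 sequences actually consumed
print(round(implied / global_step))                 # ~64 sequences per optimizer step (inferred)
```

Note that `train_samples` (93733) is the raw dataset row count; the throughput metric appears to count the sequences actually consumed during training, which is why 93733 / 6454.47 does not equal 5.314.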
trainer_state.json CHANGED
@@ -3,394 +3,772 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 268,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.018656716417910446,
13
- "grad_norm": 1.8348313453918315,
14
- "learning_rate": 1.785714285714286e-05,
15
- "loss": 0.8456,
16
  "step": 5
17
  },
18
  {
19
- "epoch": 0.03731343283582089,
20
- "grad_norm": 0.7820953186638157,
21
- "learning_rate": 3.571428571428572e-05,
22
- "loss": 0.7682,
23
  "step": 10
24
  },
25
  {
26
- "epoch": 0.055970149253731345,
27
- "grad_norm": 0.5019523180117172,
28
- "learning_rate": 4.999827900623038e-05,
29
- "loss": 0.7026,
30
  "step": 15
31
  },
32
  {
33
- "epoch": 0.07462686567164178,
34
- "grad_norm": 0.382461903313769,
35
- "learning_rate": 4.993807186343243e-05,
36
- "loss": 0.6746,
37
  "step": 20
38
  },
39
  {
40
- "epoch": 0.09328358208955224,
41
- "grad_norm": 0.35551933557168003,
42
- "learning_rate": 4.979207812402531e-05,
43
- "loss": 0.6436,
44
  "step": 25
45
  },
46
  {
47
- "epoch": 0.11194029850746269,
48
- "grad_norm": 0.2707499909031633,
49
- "learning_rate": 4.956085596012407e-05,
50
- "loss": 0.6363,
51
  "step": 30
52
  },
53
  {
54
- "epoch": 0.13059701492537312,
55
- "grad_norm": 0.24408756125274592,
56
- "learning_rate": 4.924528939432311e-05,
57
- "loss": 0.6199,
58
  "step": 35
59
  },
60
  {
61
- "epoch": 0.14925373134328357,
62
- "grad_norm": 0.23297660058339695,
63
- "learning_rate": 4.884658491984735e-05,
64
- "loss": 0.6106,
65
  "step": 40
66
  },
67
  {
68
- "epoch": 0.16791044776119404,
69
- "grad_norm": 0.20616997615962274,
70
- "learning_rate": 4.8366266887814235e-05,
71
- "loss": 0.6112,
72
  "step": 45
73
  },
74
  {
75
- "epoch": 0.1865671641791045,
76
- "grad_norm": 0.22064497910198708,
77
- "learning_rate": 4.780617167924209e-05,
78
- "loss": 0.5938,
79
  "step": 50
80
  },
81
  {
82
- "epoch": 0.20522388059701493,
83
- "grad_norm": 0.22601469805529645,
84
- "learning_rate": 4.716844068408693e-05,
85
- "loss": 0.5964,
86
  "step": 55
87
  },
88
  {
89
- "epoch": 0.22388059701492538,
90
- "grad_norm": 0.2465567192412131,
91
- "learning_rate": 4.6455512114150546e-05,
92
- "loss": 0.5918,
93
  "step": 60
94
  },
95
  {
96
- "epoch": 0.24253731343283583,
97
- "grad_norm": 0.4836778183075182,
98
- "learning_rate": 4.5670111681161296e-05,
99
- "loss": 0.5829,
100
  "step": 65
101
  },
102
  {
103
- "epoch": 0.26119402985074625,
104
- "grad_norm": 0.38098490112254135,
105
- "learning_rate": 4.481524217566783e-05,
106
- "loss": 0.5797,
107
  "step": 70
108
  },
109
  {
110
- "epoch": 0.2798507462686567,
111
- "grad_norm": 0.4371168721858062,
112
- "learning_rate": 4.3894171986588217e-05,
113
- "loss": 0.5786,
114
  "step": 75
115
  },
116
  {
117
- "epoch": 0.29850746268656714,
118
- "grad_norm": 0.3738182456668002,
119
- "learning_rate": 4.29104226053073e-05,
120
- "loss": 0.5774,
121
  "step": 80
122
  },
123
  {
124
- "epoch": 0.31716417910447764,
125
- "grad_norm": 0.3026363003997777,
126
- "learning_rate": 4.186775516209732e-05,
127
- "loss": 0.5727,
128
  "step": 85
129
  },
130
  {
131
- "epoch": 0.3358208955223881,
132
- "grad_norm": 0.30121864754056543,
133
- "learning_rate": 4.077015604633669e-05,
134
- "loss": 0.5752,
135
  "step": 90
136
  },
137
  {
138
- "epoch": 0.35447761194029853,
139
- "grad_norm": 0.2715555759227584,
140
- "learning_rate": 3.962182166550441e-05,
141
- "loss": 0.5736,
142
  "step": 95
143
  },
144
  {
145
- "epoch": 0.373134328358209,
146
- "grad_norm": 0.2649152783092657,
147
- "learning_rate": 3.8427142401220634e-05,
148
- "loss": 0.5696,
149
  "step": 100
150
  },
151
  {
152
- "epoch": 0.3917910447761194,
153
- "grad_norm": 0.24764567829993114,
154
- "learning_rate": 3.71906858236735e-05,
155
- "loss": 0.5659,
156
  "step": 105
157
  },
158
  {
159
- "epoch": 0.41044776119402987,
160
- "grad_norm": 0.22991991062523517,
161
- "learning_rate": 3.591717922860785e-05,
162
- "loss": 0.5731,
163
  "step": 110
164
  },
165
  {
166
- "epoch": 0.4291044776119403,
167
- "grad_norm": 0.20518237724726712,
168
- "learning_rate": 3.46114915636416e-05,
169
- "loss": 0.5639,
170
  "step": 115
171
  },
172
  {
173
- "epoch": 0.44776119402985076,
174
- "grad_norm": 0.20766570184771127,
175
- "learning_rate": 3.3278614813010034e-05,
176
- "loss": 0.565,
177
  "step": 120
178
  },
179
  {
180
- "epoch": 0.4664179104477612,
181
- "grad_norm": 0.2122042796549098,
182
- "learning_rate": 3.1923644911909e-05,
183
- "loss": 0.5618,
184
  "step": 125
185
  },
186
  {
187
- "epoch": 0.48507462686567165,
188
- "grad_norm": 0.2042027463923276,
189
- "learning_rate": 3.0551762263406576e-05,
190
- "loss": 0.5605,
191
  "step": 130
192
  },
193
  {
194
- "epoch": 0.503731343283582,
195
- "grad_norm": 0.20586492168965037,
196
- "learning_rate": 2.9168211932412042e-05,
197
- "loss": 0.5577,
198
  "step": 135
199
  },
200
  {
201
- "epoch": 0.5223880597014925,
202
- "grad_norm": 0.18966179188305102,
203
- "learning_rate": 2.777828359242567e-05,
204
- "loss": 0.5631,
205
  "step": 140
206
  },
207
  {
208
- "epoch": 0.5410447761194029,
209
- "grad_norm": 0.22239112831427293,
210
- "learning_rate": 2.6387291301738377e-05,
211
- "loss": 0.5557,
212
  "step": 145
213
  },
214
  {
215
- "epoch": 0.5597014925373134,
216
- "grad_norm": 0.19225992476740958,
217
- "learning_rate": 2.50005531864019e-05,
218
- "loss": 0.5536,
219
  "step": 150
220
  },
221
  {
222
- "epoch": 0.5783582089552238,
223
- "grad_norm": 0.18265395881796748,
224
- "learning_rate": 2.362337110764688e-05,
225
- "loss": 0.5539,
226
  "step": 155
227
  },
228
  {
229
- "epoch": 0.5970149253731343,
230
- "grad_norm": 0.19395607488689207,
231
- "learning_rate": 2.226101039148557e-05,
232
- "loss": 0.5521,
233
  "step": 160
234
  },
235
  {
236
- "epoch": 0.6156716417910447,
237
- "grad_norm": 0.21030220393476035,
238
- "learning_rate": 2.0918679697998252e-05,
239
- "loss": 0.551,
240
  "step": 165
241
  },
242
  {
243
- "epoch": 0.6343283582089553,
244
- "grad_norm": 0.19242117117108423,
245
- "learning_rate": 1.9601511107268255e-05,
246
- "loss": 0.5514,
247
  "step": 170
248
  },
249
  {
250
- "epoch": 0.6529850746268657,
251
- "grad_norm": 0.1876377053928606,
252
- "learning_rate": 1.8314540498102216e-05,
253
- "loss": 0.551,
254
  "step": 175
255
  },
256
  {
257
- "epoch": 0.6716417910447762,
258
- "grad_norm": 0.1658015437913119,
259
- "learning_rate": 1.7062688294552992e-05,
260
- "loss": 0.5431,
261
  "step": 180
262
  },
263
  {
264
- "epoch": 0.6902985074626866,
265
- "grad_norm": 0.1826362280484998,
266
- "learning_rate": 1.5850740653856096e-05,
267
- "loss": 0.5465,
268
  "step": 185
269
  },
270
  {
271
- "epoch": 0.7089552238805971,
272
- "grad_norm": 0.16545958031962485,
273
- "learning_rate": 1.4683331167703218e-05,
274
- "loss": 0.5502,
275
  "step": 190
276
  },
277
  {
278
- "epoch": 0.7276119402985075,
279
- "grad_norm": 0.16924799997686157,
280
- "learning_rate": 1.356492314681356e-05,
281
- "loss": 0.5529,
282
  "step": 195
283
  },
284
  {
285
- "epoch": 0.746268656716418,
286
- "grad_norm": 0.1516512854254837,
287
- "learning_rate": 1.2499792556533716e-05,
288
- "loss": 0.5473,
289
  "step": 200
290
  },
291
  {
292
- "epoch": 0.7649253731343284,
293
- "grad_norm": 0.1689762986341661,
294
- "learning_rate": 1.1492011668707753e-05,
295
- "loss": 0.5448,
296
  "step": 205
297
  },
298
  {
299
- "epoch": 0.7835820895522388,
300
- "grad_norm": 0.16322844448176094,
301
- "learning_rate": 1.0545433492320603e-05,
302
- "loss": 0.5499,
303
  "step": 210
304
  },
305
  {
306
- "epoch": 0.8022388059701493,
307
- "grad_norm": 0.14871916155766746,
308
- "learning_rate": 9.663677042440537e-06,
309
- "loss": 0.5443,
310
  "step": 215
311
  },
312
  {
313
- "epoch": 0.8208955223880597,
314
- "grad_norm": 0.1539983145621574,
315
- "learning_rate": 8.850113503781367e-06,
316
- "loss": 0.5441,
317
  "step": 220
318
  },
319
  {
320
- "epoch": 0.8395522388059702,
321
- "grad_norm": 0.13893092997809098,
322
- "learning_rate": 8.107853341784671e-06,
323
- "loss": 0.5505,
324
  "step": 225
325
  },
326
  {
327
- "epoch": 0.8582089552238806,
328
- "grad_norm": 0.1540249779488284,
329
- "learning_rate": 7.439734410499752e-06,
330
- "loss": 0.5469,
331
  "step": 230
332
  },
333
  {
334
- "epoch": 0.8768656716417911,
335
- "grad_norm": 0.1694291993130961,
336
- "learning_rate": 6.848311102728011e-06,
337
- "loss": 0.547,
338
  "step": 235
339
  },
340
  {
341
- "epoch": 0.8955223880597015,
342
- "grad_norm": 0.1626838965915087,
343
- "learning_rate": 6.335844583913515e-06,
344
- "loss": 0.5432,
345
  "step": 240
346
  },
347
  {
348
- "epoch": 0.914179104477612,
349
- "grad_norm": 0.1380259897848928,
350
- "learning_rate": 5.904294147118193e-06,
351
- "loss": 0.5469,
352
  "step": 245
353
  },
354
  {
355
- "epoch": 0.9328358208955224,
356
- "grad_norm": 0.14289508975860757,
357
- "learning_rate": 5.555309722133842e-06,
358
- "loss": 0.5434,
359
  "step": 250
360
  },
361
  {
362
- "epoch": 0.9514925373134329,
363
- "grad_norm": 0.1375404061999961,
364
- "learning_rate": 5.290225567370509e-06,
365
- "loss": 0.5395,
366
  "step": 255
367
  },
368
  {
369
- "epoch": 0.9701492537313433,
370
- "grad_norm": 0.14150426738142058,
371
- "learning_rate": 5.110055168638854e-06,
372
- "loss": 0.5431,
373
  "step": 260
374
  },
375
  {
376
- "epoch": 0.9888059701492538,
377
- "grad_norm": 0.13763105414820687,
378
- "learning_rate": 5.0154873643297575e-06,
379
- "loss": 0.5469,
380
  "step": 265
381
  },
382
  {
383
  "epoch": 1.0,
384
- "step": 268,
385
  "total_flos": 488621249396736.0,
386
- "train_loss": 0.5792026920105094,
387
- "train_runtime": 4310.9384,
388
- "train_samples_per_second": 7.957,
389
- "train_steps_per_second": 0.062
390
  }
391
  ],
392
  "logging_steps": 5,
393
- "max_steps": 268,
394
  "num_input_tokens_seen": 0,
395
  "num_train_epochs": 1,
396
  "save_steps": 100,
 
3
  "best_model_checkpoint": null,
4
  "epoch": 1.0,
5
  "eval_steps": 500,
6
+ "global_step": 536,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.009328358208955223,
13
+ "grad_norm": 1.453897786558338,
14
+ "learning_rate": 9.259259259259259e-06,
15
+ "loss": 0.8491,
16
  "step": 5
17
  },
18
  {
19
+ "epoch": 0.018656716417910446,
20
+ "grad_norm": 1.1644840596539492,
21
+ "learning_rate": 1.8518518518518518e-05,
22
+ "loss": 0.8006,
23
  "step": 10
24
  },
25
  {
26
+ "epoch": 0.027985074626865673,
27
+ "grad_norm": 0.7401001416095307,
28
+ "learning_rate": 2.777777777777778e-05,
29
+ "loss": 0.7315,
30
  "step": 15
31
  },
32
  {
33
+ "epoch": 0.03731343283582089,
34
+ "grad_norm": 0.45715435120985837,
35
+ "learning_rate": 3.7037037037037037e-05,
36
+ "loss": 0.6803,
37
  "step": 20
38
  },
39
  {
40
+ "epoch": 0.04664179104477612,
41
+ "grad_norm": 0.452456199763512,
42
+ "learning_rate": 4.62962962962963e-05,
43
+ "loss": 0.6578,
44
  "step": 25
45
  },
46
  {
47
+ "epoch": 0.055970149253731345,
48
+ "grad_norm": 0.3770950269873419,
49
+ "learning_rate": 4.999614302517356e-05,
50
+ "loss": 0.6581,
51
  "step": 30
52
  },
53
  {
54
+ "epoch": 0.06529850746268656,
55
+ "grad_norm": 0.3867121454463547,
56
+ "learning_rate": 4.997257741198456e-05,
57
+ "loss": 0.6419,
58
  "step": 35
59
  },
60
  {
61
+ "epoch": 0.07462686567164178,
62
+ "grad_norm": 0.30411968998458416,
63
+ "learning_rate": 4.992761136351291e-05,
64
+ "loss": 0.627,
65
  "step": 40
66
  },
67
  {
68
+ "epoch": 0.08395522388059702,
69
+ "grad_norm": 0.3178169814537855,
70
+ "learning_rate": 4.986128770052603e-05,
71
+ "loss": 0.6084,
72
  "step": 45
73
  },
74
  {
75
+ "epoch": 0.09328358208955224,
76
+ "grad_norm": 0.2807145763333074,
77
+ "learning_rate": 4.9773669582457364e-05,
78
+ "loss": 0.61,
79
  "step": 50
80
  },
81
  {
82
+ "epoch": 0.10261194029850747,
83
+ "grad_norm": 0.34303198218488506,
84
+ "learning_rate": 4.966484044726024e-05,
85
+ "loss": 0.6137,
86
  "step": 55
87
  },
88
  {
89
+ "epoch": 0.11194029850746269,
90
+ "grad_norm": 0.32762450828392975,
91
+ "learning_rate": 4.953490393195063e-05,
92
+ "loss": 0.6003,
93
  "step": 60
94
  },
95
  {
96
+ "epoch": 0.12126865671641791,
97
+ "grad_norm": 0.38462981393609424,
98
+ "learning_rate": 4.938398377391461e-05,
99
+ "loss": 0.5972,
100
  "step": 65
101
  },
102
  {
103
+ "epoch": 0.13059701492537312,
104
+ "grad_norm": 0.4924998750785629,
105
+ "learning_rate": 4.921222369307427e-05,
106
+ "loss": 0.5939,
107
  "step": 70
108
  },
109
  {
110
+ "epoch": 0.13992537313432835,
111
+ "grad_norm": 0.4428236238520031,
112
+ "learning_rate": 4.901978725502454e-05,
113
+ "loss": 0.5895,
114
  "step": 75
115
  },
116
  {
117
+ "epoch": 0.14925373134328357,
118
+ "grad_norm": 0.39121495707754167,
119
+ "learning_rate": 4.880685771527114e-05,
120
+ "loss": 0.5895,
121
  "step": 80
122
  },
123
  {
124
+ "epoch": 0.15858208955223882,
125
+ "grad_norm": 0.42141569967189374,
126
+ "learning_rate": 4.8573637844718e-05,
127
+ "loss": 0.5945,
128
  "step": 85
129
  },
130
  {
131
+ "epoch": 0.16791044776119404,
132
+ "grad_norm": 0.37097437462801275,
133
+ "learning_rate": 4.83203497365703e-05,
134
+ "loss": 0.591,
135
  "step": 90
136
  },
137
  {
138
+ "epoch": 0.17723880597014927,
139
+ "grad_norm": 0.4736806212925385,
140
+ "learning_rate": 4.8047234594837143e-05,
141
+ "loss": 0.5782,
142
  "step": 95
143
  },
144
  {
145
+ "epoch": 0.1865671641791045,
146
+ "grad_norm": 0.4098423065809189,
147
+ "learning_rate": 4.775455250463507e-05,
148
+ "loss": 0.5749,
149
  "step": 100
150
  },
151
  {
152
+ "epoch": 0.1958955223880597,
153
+ "grad_norm": 0.43057914859393537,
154
+ "learning_rate": 4.744258218451135e-05,
155
+ "loss": 0.5846,
156
  "step": 105
157
  },
158
  {
159
+ "epoch": 0.20522388059701493,
160
+ "grad_norm": 0.3079441344852731,
161
+ "learning_rate": 4.71116207210228e-05,
162
+ "loss": 0.5758,
163
  "step": 110
164
  },
165
  {
166
+ "epoch": 0.21455223880597016,
167
+ "grad_norm": 0.35622829843104137,
168
+ "learning_rate": 4.676198328582288e-05,
169
+ "loss": 0.5712,
170
  "step": 115
171
  },
172
  {
173
+ "epoch": 0.22388059701492538,
174
+ "grad_norm": 0.3303782414556567,
175
+ "learning_rate": 4.6394002835526535e-05,
176
+ "loss": 0.582,
177
  "step": 120
178
  },
179
  {
180
+ "epoch": 0.2332089552238806,
181
+ "grad_norm": 0.36692461693115874,
182
+ "learning_rate": 4.6008029794638596e-05,
183
+ "loss": 0.5665,
184
  "step": 125
185
  },
186
  {
187
+ "epoch": 0.24253731343283583,
188
+ "grad_norm": 0.3438548029811277,
189
+ "learning_rate": 4.560443172184763e-05,
190
+ "loss": 0.5691,
191
  "step": 130
192
  },
193
  {
194
+ "epoch": 0.251865671641791,
195
+ "grad_norm": 0.3148363976766754,
196
+ "learning_rate": 4.5183592960003104e-05,
197
+ "loss": 0.5619,
198
  "step": 135
199
  },
200
  {
201
+ "epoch": 0.26119402985074625,
202
+ "grad_norm": 0.27079202033744576,
203
+ "learning_rate": 4.4745914270109055e-05,
204
+ "loss": 0.5671,
205
  "step": 140
206
  },
207
  {
208
+ "epoch": 0.27052238805970147,
209
+ "grad_norm": 0.35294951179615874,
210
+ "learning_rate": 4.429181244968301e-05,
211
+ "loss": 0.5663,
212
  "step": 145
213
  },
214
  {
215
+ "epoch": 0.2798507462686567,
216
+ "grad_norm": 0.2830160917424769,
217
+ "learning_rate": 4.38217199358434e-05,
218
+ "loss": 0.5602,
219
  "step": 150
220
  },
221
  {
222
+ "epoch": 0.2891791044776119,
223
+ "grad_norm": 0.35651413739101817,
224
+ "learning_rate": 4.3336084393503545e-05,
225
+ "loss": 0.5569,
226
  "step": 155
227
  },
228
  {
229
+ "epoch": 0.29850746268656714,
230
+ "grad_norm": 0.3354623584514313,
231
+ "learning_rate": 4.283536828906436e-05,
232
+ "loss": 0.5692,
233
  "step": 160
234
  },
235
  {
236
+ "epoch": 0.30783582089552236,
237
+ "grad_norm": 0.31607235976679265,
238
+ "learning_rate": 4.2320048450011684e-05,
239
+ "loss": 0.5626,
240
  "step": 165
241
  },
242
  {
243
+ "epoch": 0.31716417910447764,
244
+ "grad_norm": 0.28829206615383307,
245
+ "learning_rate": 4.179061561083777e-05,
246
+ "loss": 0.5547,
247
  "step": 170
248
  },
249
  {
250
+ "epoch": 0.32649253731343286,
251
+ "grad_norm": 0.3376743915281689,
252
+ "learning_rate": 4.124757394571914e-05,
253
+ "loss": 0.561,
254
  "step": 175
255
  },
256
  {
257
+ "epoch": 0.3358208955223881,
258
+ "grad_norm": 0.270382329131434,
259
+ "learning_rate": 4.069144058839605e-05,
260
+ "loss": 0.5621,
261
  "step": 180
262
  },
263
  {
264
+ "epoch": 0.3451492537313433,
265
+ "grad_norm": 0.2973987443207836,
266
+ "learning_rate": 4.012274513971061e-05,
267
+ "loss": 0.5661,
268
  "step": 185
269
  },
270
  {
271
+ "epoch": 0.35447761194029853,
272
+ "grad_norm": 0.28887092947840104,
273
+ "learning_rate": 3.954202916327264e-05,
274
+ "loss": 0.5542,
275
  "step": 190
276
  },
277
  {
278
+ "epoch": 0.36380597014925375,
279
+ "grad_norm": 0.2655355295288525,
280
+ "learning_rate": 3.894984566973346e-05,
281
+ "loss": 0.5541,
282
  "step": 195
283
  },
284
  {
285
+ "epoch": 0.373134328358209,
286
+ "grad_norm": 0.2811432888948243,
287
+ "learning_rate": 3.834675859015876e-05,
288
+ "loss": 0.5582,
289
  "step": 200
290
  },
291
  {
292
+ "epoch": 0.3824626865671642,
293
+ "grad_norm": 0.2515745875368655,
294
+ "learning_rate": 3.77333422390021e-05,
295
+ "loss": 0.5489,
296
  "step": 205
297
  },
298
  {
299
+ "epoch": 0.3917910447761194,
300
+ "grad_norm": 0.28289824813151215,
301
+ "learning_rate": 3.711018076719034e-05,
302
+ "loss": 0.556,
303
  "step": 210
304
  },
305
  {
306
+ "epoch": 0.40111940298507465,
307
+ "grad_norm": 0.267361883328445,
308
+ "learning_rate": 3.647786760584194e-05,
309
+ "loss": 0.5604,
310
  "step": 215
311
  },
312
  {
313
+ "epoch": 0.41044776119402987,
314
+ "grad_norm": 0.26327372469127713,
315
+ "learning_rate": 3.583700490114776e-05,
316
+ "loss": 0.5585,
317
  "step": 220
318
  },
319
  {
320
+ "epoch": 0.4197761194029851,
321
+ "grad_norm": 0.306243298391607,
322
+ "learning_rate": 3.518820294095267e-05,
323
+ "loss": 0.5545,
324
  "step": 225
325
  },
326
  {
327
+ "epoch": 0.4291044776119403,
328
+ "grad_norm": 0.26926366241042204,
329
+ "learning_rate": 3.453207957358377e-05,
330
+ "loss": 0.5464,
331
  "step": 230
332
  },
333
  {
334
+ "epoch": 0.43843283582089554,
335
+ "grad_norm": 0.2690060278556445,
336
+ "learning_rate": 3.386925961947906e-05,
337
+ "loss": 0.5475,
338
  "step": 235
339
  },
340
  {
341
+ "epoch": 0.44776119402985076,
342
+ "grad_norm": 0.2826334847052104,
343
+ "learning_rate": 3.320037427617639e-05,
344
+ "loss": 0.555,
345
  "step": 240
346
  },
347
  {
348
+ "epoch": 0.457089552238806,
349
+ "grad_norm": 0.26144344293288974,
350
+ "learning_rate": 3.252606051722972e-05,
351
+ "loss": 0.5535,
352
  "step": 245
353
  },
354
  {
355
+ "epoch": 0.4664179104477612,
356
+ "grad_norm": 0.2911683020725898,
357
+ "learning_rate": 3.1846960485624886e-05,
358
+ "loss": 0.5427,
359
  "step": 250
360
  },
361
  {
362
+ "epoch": 0.47574626865671643,
363
+ "grad_norm": 0.25802207095689544,
364
+ "learning_rate": 3.1163720882272516e-05,
365
+ "loss": 0.5505,
366
  "step": 255
367
  },
368
  {
369
+ "epoch": 0.48507462686567165,
370
+ "grad_norm": 0.2937192282456116,
371
+ "learning_rate": 3.047699235016056e-05,
372
+ "loss": 0.5428,
373
  "step": 260
374
  },
375
  {
376
+ "epoch": 0.4944029850746269,
377
+ "grad_norm": 0.3102482607702826,
378
+ "learning_rate": 2.9787428854752736e-05,
379
+ "loss": 0.5367,
380
  "step": 265
381
  },
382
+ {
383
+ "epoch": 0.503731343283582,
384
+ "grad_norm": 0.2548120450864961,
385
+ "learning_rate": 2.9095687061223058e-05,
386
+ "loss": 0.5513,
387
+ "step": 270
388
+ },
389
+ {
390
+ "epoch": 0.5130597014925373,
391
+ "grad_norm": 0.30245109922773245,
392
+ "learning_rate": 2.8402425709119435e-05,
393
+ "loss": 0.5504,
394
+ "step": 275
395
+ },
396
+ {
397
+ "epoch": 0.5223880597014925,
398
+ "grad_norm": 0.27315442361776204,
399
+ "learning_rate": 2.7708304985051868e-05,
400
+ "loss": 0.5474,
401
+ "step": 280
402
+ },
403
+ {
404
+ "epoch": 0.5317164179104478,
405
+ "grad_norm": 0.2469052900266435,
406
+ "learning_rate": 2.7013985894002623e-05,
407
+ "loss": 0.5353,
408
+ "step": 285
409
+ },
410
+ {
411
+ "epoch": 0.5410447761194029,
412
+ "grad_norm": 0.2381522137991754,
413
+ "learning_rate": 2.6320129629857093e-05,
414
+ "loss": 0.5481,
415
+ "step": 290
416
+ },
417
+ {
418
+ "epoch": 0.5503731343283582,
419
+ "grad_norm": 0.2435217695212293,
420
+ "learning_rate": 2.56273969457547e-05,
421
+ "loss": 0.537,
422
+ "step": 295
423
+ },
424
+ {
425
+ "epoch": 0.5597014925373134,
426
+ "grad_norm": 0.23199546267694518,
427
+ "learning_rate": 2.4936447524859625e-05,
428
+ "loss": 0.5413,
429
+ "step": 300
430
+ },
431
+ {
432
+ "epoch": 0.5690298507462687,
433
+ "grad_norm": 0.30029854485663093,
434
+ "learning_rate": 2.4247939352150386e-05,
435
+ "loss": 0.5365,
436
+ "step": 305
437
+ },
438
+ {
439
+ "epoch": 0.5783582089552238,
440
+ "grad_norm": 0.25071572100454237,
441
+ "learning_rate": 2.3562528087826573e-05,
442
+ "loss": 0.5426,
443
+ "step": 310
444
+ },
445
+ {
446
+ "epoch": 0.5876865671641791,
447
+ "grad_norm": 0.2361656180291179,
448
+ "learning_rate": 2.2880866442929544e-05,
449
+ "loss": 0.5396,
450
+ "step": 315
451
+ },
452
+ {
453
+ "epoch": 0.5970149253731343,
454
+ "grad_norm": 0.23596745328071683,
455
+ "learning_rate": 2.2203603557771447e-05,
456
+ "loss": 0.5357,
457
+ "step": 320
458
+ },
459
+ {
460
+ "epoch": 0.6063432835820896,
461
+ "grad_norm": 0.24184948453857802,
462
+ "learning_rate": 2.153138438376473e-05,
463
+ "loss": 0.5339,
464
+ "step": 325
465
+ },
466
+ {
467
+ "epoch": 0.6156716417910447,
468
+ "grad_norm": 0.23369544200091724,
469
+ "learning_rate": 2.0864849069240645e-05,
470
+ "loss": 0.5386,
471
+ "step": 330
472
+ },
473
+ {
474
+ "epoch": 0.625,
475
+ "grad_norm": 0.2273042144154694,
476
+ "learning_rate": 2.0204632349841667e-05,
477
+ "loss": 0.5355,
478
+ "step": 335
479
+ },
480
+ {
481
+ "epoch": 0.6343283582089553,
482
+ "grad_norm": 0.2094409370066005,
483
+ "learning_rate": 1.9551362944068462e-05,
484
+ "loss": 0.5377,
485
+ "step": 340
486
+ },
487
+ {
488
+ "epoch": 0.6436567164179104,
489
+ "grad_norm": 0.22818911316518153,
490
+ "learning_rate": 1.890566295455678e-05,
491
+ "loss": 0.531,
492
+ "step": 345
493
+ },
494
+ {
495
+ "epoch": 0.6529850746268657,
496
+ "grad_norm": 0.21863119932090663,
497
+ "learning_rate": 1.8268147275654707e-05,
498
+ "loss": 0.541,
499
+ "step": 350
500
+ },
501
+ {
502
+ "epoch": 0.6623134328358209,
503
+ "grad_norm": 0.2106369913594231,
504
+ "learning_rate": 1.7639423007864252e-05,
505
+ "loss": 0.5278,
506
+ "step": 355
507
+ },
508
+ {
509
+ "epoch": 0.6716417910447762,
510
+ "grad_norm": 0.21604757242141778,
511
+ "learning_rate": 1.702008887970491e-05,
512
+ "loss": 0.5287,
513
+ "step": 360
514
+ },
515
+ {
516
+ "epoch": 0.6809701492537313,
517
+ "grad_norm": 0.21597143378114195,
518
+ "learning_rate": 1.6410734677549872e-05,
519
+ "loss": 0.5318,
520
+ "step": 365
521
+ },
522
+ {
523
+ "epoch": 0.6902985074626866,
524
+ "grad_norm": 0.21420392632858606,
525
+ "learning_rate": 1.58119406839777e-05,
526
+ "loss": 0.5308,
527
+ "step": 370
528
+ },
529
+ {
530
+ "epoch": 0.6996268656716418,
531
+ "grad_norm": 0.23220484507328432,
532
+ "learning_rate": 1.5224277125174388e-05,
533
+ "loss": 0.5337,
534
+ "step": 375
535
+ },
536
+ {
537
+ "epoch": 0.7089552238805971,
538
+ "grad_norm": 0.21960291661563633,
539
+ "learning_rate": 1.464830362791204e-05,
540
+ "loss": 0.5359,
541
+ "step": 380
542
+ },
543
+ {
544
+ "epoch": 0.7182835820895522,
545
+ "grad_norm": 0.20363391645776452,
546
+ "learning_rate": 1.4084568686621314e-05,
547
+ "loss": 0.5383,
548
+ "step": 385
549
+ },
550
+ {
551
+ "epoch": 0.7276119402985075,
552
+ "grad_norm": 0.20430129686406656,
553
+ "learning_rate": 1.3533609141065008e-05,
554
+ "loss": 0.5366,
555
+ "step": 390
556
+ },
557
+ {
558
+ "epoch": 0.7369402985074627,
559
+ "grad_norm": 0.21391864672286387,
560
+ "learning_rate": 1.299594966511038e-05,
561
+ "loss": 0.5338,
562
+ "step": 395
563
+ },
564
+ {
565
+ "epoch": 0.746268656716418,
566
+ "grad_norm": 0.20750528279220357,
567
+ "learning_rate": 1.2472102267086904e-05,
568
+ "loss": 0.5295,
569
+ "step": 400
570
+ },
571
+ {
572
+ "epoch": 0.7555970149253731,
573
+ "grad_norm": 0.20652656803309716,
574
+ "learning_rate": 1.1962565802205255e-05,
575
+ "loss": 0.5362,
576
+ "step": 405
577
+ },
578
+ {
579
+ "epoch": 0.7649253731343284,
580
+ "grad_norm": 0.19195629879165202,
581
+ "learning_rate": 1.1467825497501954e-05,
582
+ "loss": 0.5225,
583
+ "step": 410
584
+ },
585
+ {
586
+ "epoch": 0.7742537313432836,
587
+ "grad_norm": 0.21151922246650445,
588
+ "learning_rate": 1.0988352489762006e-05,
589
+ "loss": 0.5384,
590
+ "step": 415
591
+ },
592
+ {
593
+ "epoch": 0.7835820895522388,
594
+ "grad_norm": 0.204760450632927,
595
+ "learning_rate": 1.052460337685951e-05,
596
+ "loss": 0.5298,
597
+ "step": 420
598
+ },
599
+ {
600
+ "epoch": 0.792910447761194,
601
+ "grad_norm": 0.19922336763600637,
602
+ "learning_rate": 1.0077019782943584e-05,
603
+ "loss": 0.5282,
604
+ "step": 425
605
+ },
606
+ {
607
+ "epoch": 0.8022388059701493,
608
+ "grad_norm": 0.2104055047251735,
609
+ "learning_rate": 9.646027937883622e-06,
610
+ "loss": 0.529,
611
+ "step": 430
612
+ },
613
+ {
614
+ "epoch": 0.8115671641791045,
615
+ "grad_norm": 0.18141387654521896,
616
+ "learning_rate": 9.232038271374377e-06,
617
+ "loss": 0.531,
618
+ "step": 435
619
+ },
620
+ {
621
+ "epoch": 0.8208955223880597,
622
+ "grad_norm": 0.19879663171910475,
623
+ "learning_rate": 8.835445022087426e-06,
624
+ "loss": 0.5256,
625
+ "step": 440
626
+ },
627
+ {
628
+ "epoch": 0.8302238805970149,
629
+ "grad_norm": 0.19565091593372,
630
+ "learning_rate": 8.456625862241193e-06,
631
+ "loss": 0.5358,
632
+ "step": 445
633
+ },
634
+ {
635
+ "epoch": 0.8395522388059702,
636
+ "grad_norm": 0.187496803748416,
637
+ "learning_rate": 8.095941537947057e-06,
638
+ "loss": 0.5328,
639
+ "step": 450
640
+ },
641
+ {
642
+ "epoch": 0.8488805970149254,
643
+ "grad_norm": 0.20371049579133477,
644
+ "learning_rate": 7.753735525674059e-06,
645
+ "loss": 0.5256,
646
+ "step": 455
647
+ },
648
+ {
649
+ "epoch": 0.8582089552238806,
650
+ "grad_norm": 0.1964002704579327,
651
+ "learning_rate": 7.430333705159286e-06,
652
+ "loss": 0.5359,
653
+ "step": 460
654
+ },
655
+ {
656
+ "epoch": 0.8675373134328358,
657
+ "grad_norm": 0.1848105783285991,
658
+ "learning_rate": 7.126044049075548e-06,
659
+ "loss": 0.5408,
660
+ "step": 465
661
+ },
662
+ {
663
+ "epoch": 0.8768656716417911,
664
+ "grad_norm": 0.18246504825507467,
665
+ "learning_rate": 6.8411563297516995e-06,
666
+ "loss": 0.5208,
667
+ "step": 470
668
+ },
669
+ {
670
+ "epoch": 0.8861940298507462,
671
+ "grad_norm": 0.17351842603547765,
672
+ "learning_rate": 6.575941843225068e-06,
673
+ "loss": 0.5246,
674
+ "step": 475
675
+ },
676
+ {
677
+ "epoch": 0.8955223880597015,
678
+ "grad_norm": 0.20341591173191198,
679
+ "learning_rate": 6.330653150888617e-06,
680
+ "loss": 0.5292,
681
+ "step": 480
682
+ },
683
+ {
684
+ "epoch": 0.9048507462686567,
685
+ "grad_norm": 0.20204818775308625,
686
+ "learning_rate": 6.105523838979022e-06,
687
+ "loss": 0.5373,
688
+ "step": 485
689
+ },
690
+ {
691
+ "epoch": 0.914179104477612,
692
+ "grad_norm": 0.18421926232056562,
693
+ "learning_rate": 5.900768296134551e-06,
694
+ "loss": 0.5238,
695
+ "step": 490
696
+ },
697
+ {
698
+ "epoch": 0.9235074626865671,
699
+ "grad_norm": 0.1788995157537539,
700
+ "learning_rate": 5.7165815092346825e-06,
701
+ "loss": 0.526,
702
+ "step": 495
703
+ },
704
+ {
705
+ "epoch": 0.9328358208955224,
706
+ "grad_norm": 0.18233827133162475,
707
+ "learning_rate": 5.553138877715833e-06,
708
+ "loss": 0.5279,
709
+ "step": 500
710
+ },
711
+ {
712
+ "epoch": 0.9421641791044776,
713
+ "grad_norm": 0.182345435097955,
714
+ "learning_rate": 5.410596046540051e-06,
715
+ "loss": 0.5229,
716
+ "step": 505
717
+ },
718
+ {
719
+ "epoch": 0.9514925373134329,
720
+ "grad_norm": 0.19791728733223776,
721
+ "learning_rate": 5.28908875797568e-06,
722
+ "loss": 0.5235,
723
+ "step": 510
724
+ },
725
+ {
726
+ "epoch": 0.960820895522388,
727
+ "grad_norm": 0.18416166735753317,
728
+ "learning_rate": 5.1887327223312296e-06,
729
+ "loss": 0.5246,
730
+ "step": 515
731
+ },
732
+ {
733
+ "epoch": 0.9701492537313433,
734
+ "grad_norm": 0.19627109559115194,
735
+ "learning_rate": 5.109623507765466e-06,
736
+ "loss": 0.5289,
737
+ "step": 520
738
+ },
739
+ {
740
+ "epoch": 0.9794776119402985,
741
+ "grad_norm": 0.19326569885491154,
742
+ "learning_rate": 5.051836449278715e-06,
743
+ "loss": 0.5311,
744
+ "step": 525
745
+ },
746
+ {
747
+ "epoch": 0.9888059701492538,
748
+ "grad_norm": 0.19312098703576008,
749
+ "learning_rate": 5.015426576972003e-06,
750
+ "loss": 0.5298,
751
+ "step": 530
752
+ },
753
+ {
754
+ "epoch": 0.9981343283582089,
755
+ "grad_norm": 0.17724401661078515,
756
+ "learning_rate": 5.000428563642382e-06,
757
+ "loss": 0.5299,
758
+ "step": 535
759
+ },
760
  {
761
  "epoch": 1.0,
762
+ "step": 536,
763
  "total_flos": 488621249396736.0,
764
+ "train_loss": 0.5610535256564617,
765
+ "train_runtime": 6454.4726,
766
+ "train_samples_per_second": 5.314,
767
+ "train_steps_per_second": 0.083
768
  }
769
  ],
770
  "logging_steps": 5,
771
+ "max_steps": 536,
772
  "num_input_tokens_seen": 0,
773
  "num_train_epochs": 1,
774
  "save_steps": 100,