tongliuphysics commited on
Commit
820ab46
·
verified ·
1 Parent(s): 09fb597

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -4
  2. all_results.json +2 -7
  3. train_results.json +2 -2
  4. trainer_state.json +565 -565
README.md CHANGED
@@ -1,11 +1,9 @@
1
  ---
2
  base_model: Qwen/Qwen2.5-3B
3
- datasets: ebony59/MATH-lighteval-gen-correct
4
  library_name: transformers
5
  model_name: Qwen2.5-3B-MATH-lighteval-gen-SFT-15epoch
6
  tags:
7
  - generated_from_trainer
8
- - open-r1
9
  - trl
10
  - sft
11
  licence: license
@@ -13,7 +11,7 @@ licence: license
13
 
14
  # Model Card for Qwen2.5-3B-MATH-lighteval-gen-SFT-15epoch
15
 
16
- This model is a fine-tuned version of [Qwen/Qwen2.5-3B](https://huggingface.co/Qwen/Qwen2.5-3B) on the [ebony59/MATH-lighteval-gen-correct](https://huggingface.co/datasets/ebony59/MATH-lighteval-gen-correct) dataset.
17
  It has been trained using [TRL](https://github.com/huggingface/trl).
18
 
19
  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
29
 
30
  ## Training procedure
31
 
32
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/pl03818948-ludwig-maximilianuniversity-of-munich/qwen-math-sft/runs/q3cj7whd)
33
 
34
 
35
  This model was trained with SFT.
 
1
  ---
2
  base_model: Qwen/Qwen2.5-3B
 
3
  library_name: transformers
4
  model_name: Qwen2.5-3B-MATH-lighteval-gen-SFT-15epoch
5
  tags:
6
  - generated_from_trainer
 
7
  - trl
8
  - sft
9
  licence: license
 
11
 
12
  # Model Card for Qwen2.5-3B-MATH-lighteval-gen-SFT-15epoch
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-3B](https://huggingface.co/Qwen/Qwen2.5-3B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/pl03818948-ludwig-maximilianuniversity-of-munich/qwen-math-sft/runs/uujrvm5v)
31
 
32
 
33
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,12 +1,7 @@
1
  {
2
- "eval_loss": 0.46884241700172424,
3
- "eval_runtime": 69.6122,
4
- "eval_samples": 3537,
5
- "eval_samples_per_second": 12.282,
6
- "eval_steps_per_second": 1.537,
7
  "total_flos": 24409842647040.0,
8
- "train_loss": 0.12131776079243305,
9
- "train_runtime": 3783.5615,
10
  "train_samples": 6726,
11
  "train_samples_per_second": 2.194,
12
  "train_steps_per_second": 0.067
 
1
  {
 
 
 
 
 
2
  "total_flos": 24409842647040.0,
3
+ "train_loss": 0.12274208276295194,
4
+ "train_runtime": 3782.9235,
5
  "train_samples": 6726,
6
  "train_samples_per_second": 2.194,
7
  "train_steps_per_second": 0.067
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "total_flos": 24409842647040.0,
3
- "train_loss": 0.12131776079243305,
4
- "train_runtime": 3783.5615,
5
  "train_samples": 6726,
6
  "train_samples_per_second": 2.194,
7
  "train_steps_per_second": 0.067
 
1
  {
2
  "total_flos": 24409842647040.0,
3
+ "train_loss": 0.12274208276295194,
4
+ "train_runtime": 3782.9235,
5
  "train_samples": 6726,
6
  "train_samples_per_second": 2.194,
7
  "train_steps_per_second": 0.067
trainer_state.json CHANGED
@@ -11,8 +11,8 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.03855421686746988,
14
- "grad_norm": 2.0454819276337846,
15
- "learning_rate": 7.692307692307694e-07,
16
  "loss": 0.2354,
17
  "mean_token_accuracy": 0.930065356194973,
18
  "num_tokens": 131072.0,
@@ -20,1266 +20,1266 @@
20
  },
21
  {
22
  "epoch": 0.07710843373493977,
23
- "grad_norm": 2.1298212530091987,
24
- "learning_rate": 2.307692307692308e-06,
25
- "loss": 0.25,
26
- "mean_token_accuracy": 0.9257908090949059,
27
  "num_tokens": 262144.0,
28
  "step": 4
29
  },
30
  {
31
  "epoch": 0.11566265060240964,
32
- "grad_norm": 1.7456338929875892,
33
- "learning_rate": 3.846153846153847e-06,
34
- "loss": 0.2397,
35
- "mean_token_accuracy": 0.9267907477915287,
36
  "num_tokens": 393216.0,
37
  "step": 6
38
  },
39
  {
40
  "epoch": 0.15421686746987953,
41
- "grad_norm": 0.9706871081058248,
42
- "learning_rate": 5.384615384615385e-06,
43
- "loss": 0.2015,
44
- "mean_token_accuracy": 0.935897059738636,
45
  "num_tokens": 524288.0,
46
  "step": 8
47
  },
48
  {
49
  "epoch": 0.1927710843373494,
50
- "grad_norm": 0.8194838637969071,
51
- "learning_rate": 6.923076923076923e-06,
52
- "loss": 0.2008,
53
- "mean_token_accuracy": 0.9353887066245079,
54
  "num_tokens": 654484.0,
55
  "step": 10
56
  },
57
  {
58
  "epoch": 0.23132530120481928,
59
- "grad_norm": 1.0319387664578734,
60
- "learning_rate": 8.461538461538462e-06,
61
- "loss": 0.2011,
62
- "mean_token_accuracy": 0.9345459975302219,
63
  "num_tokens": 785556.0,
64
  "step": 12
65
  },
66
  {
67
  "epoch": 0.26987951807228916,
68
- "grad_norm": 0.8352922078235848,
69
- "learning_rate": 1e-05,
70
- "loss": 0.2003,
71
- "mean_token_accuracy": 0.935283150523901,
72
  "num_tokens": 915519.0,
73
  "step": 14
74
  },
75
  {
76
  "epoch": 0.30843373493975906,
77
- "grad_norm": 0.7906073173343502,
78
- "learning_rate": 9.998314826517564e-06,
79
- "loss": 0.1966,
80
- "mean_token_accuracy": 0.935897059738636,
81
  "num_tokens": 1046591.0,
82
  "step": 16
83
  },
84
  {
85
  "epoch": 0.3469879518072289,
86
- "grad_norm": 0.6846232792864836,
87
- "learning_rate": 9.993260441994116e-06,
88
- "loss": 0.2035,
89
- "mean_token_accuracy": 0.9336223900318146,
90
  "num_tokens": 1177663.0,
91
  "step": 18
92
  },
93
  {
94
  "epoch": 0.3855421686746988,
95
- "grad_norm": 0.7432991959274129,
96
- "learning_rate": 9.984840253435569e-06,
97
- "loss": 0.1936,
98
- "mean_token_accuracy": 0.9368893653154373,
99
  "num_tokens": 1308735.0,
100
  "step": 20
101
  },
102
  {
103
  "epoch": 0.3855421686746988,
104
- "eval_loss": 0.33434972167015076,
105
- "eval_mean_token_accuracy": 0.9007918210787194,
106
  "eval_num_tokens": 1308735.0,
107
- "eval_runtime": 70.1692,
108
- "eval_samples_per_second": 12.185,
109
- "eval_steps_per_second": 1.525,
110
  "step": 20
111
  },
112
  {
113
  "epoch": 0.42409638554216866,
114
- "grad_norm": 0.669864276674794,
115
- "learning_rate": 9.973059936633308e-06,
116
- "loss": 0.1824,
117
- "mean_token_accuracy": 0.9392937980592251,
118
  "num_tokens": 1439807.0,
119
  "step": 22
120
  },
121
  {
122
  "epoch": 0.46265060240963857,
123
- "grad_norm": 0.6077910575622785,
124
- "learning_rate": 9.957927432338332e-06,
125
- "loss": 0.1921,
126
- "mean_token_accuracy": 0.9366323500871658,
127
  "num_tokens": 1570062.0,
128
  "step": 24
129
  },
130
  {
131
  "epoch": 0.5012048192771085,
132
- "grad_norm": 0.630582765660151,
133
- "learning_rate": 9.939452940908627e-06,
134
- "loss": 0.1835,
135
- "mean_token_accuracy": 0.939194567501545,
136
  "num_tokens": 1701134.0,
137
  "step": 26
138
  },
139
  {
140
  "epoch": 0.5397590361445783,
141
- "grad_norm": 0.6225933778125143,
142
- "learning_rate": 9.917648915433413e-06,
143
- "loss": 0.1815,
144
- "mean_token_accuracy": 0.9395865201950073,
145
  "num_tokens": 1832133.0,
146
  "step": 28
147
  },
148
  {
149
  "epoch": 0.5783132530120482,
150
- "grad_norm": 0.5894844338892736,
151
- "learning_rate": 9.892530053338909e-06,
152
- "loss": 0.19,
153
- "mean_token_accuracy": 0.9367748685181141,
154
  "num_tokens": 1963205.0,
155
  "step": 30
156
  },
157
  {
158
  "epoch": 0.6168674698795181,
159
- "grad_norm": 0.5961481326657901,
160
- "learning_rate": 9.864113286481237e-06,
161
- "loss": 0.1818,
162
- "mean_token_accuracy": 0.940141074359417,
163
  "num_tokens": 2094277.0,
164
  "step": 32
165
  },
166
  {
167
  "epoch": 0.655421686746988,
168
- "grad_norm": 0.6661943256583603,
169
- "learning_rate": 9.832417769733185e-06,
170
- "loss": 0.1816,
171
- "mean_token_accuracy": 0.9397670514881611,
172
  "num_tokens": 2225349.0,
173
  "step": 34
174
  },
175
  {
176
  "epoch": 0.6939759036144578,
177
- "grad_norm": 0.5553420813627787,
178
- "learning_rate": 9.797464868072489e-06,
179
- "loss": 0.1827,
180
- "mean_token_accuracy": 0.9393003061413765,
181
  "num_tokens": 2355263.0,
182
  "step": 36
183
  },
184
  {
185
  "epoch": 0.7325301204819277,
186
- "grad_norm": 32.92043661735329,
187
- "learning_rate": 9.759278142180348e-06,
188
- "loss": 0.2111,
189
- "mean_token_accuracy": 0.9326835125684738,
190
  "num_tokens": 2486335.0,
191
  "step": 38
192
  },
193
  {
194
  "epoch": 0.7710843373493976,
195
- "grad_norm": 0.6431446641342233,
196
- "learning_rate": 9.717883332559911e-06,
197
- "loss": 0.1751,
198
- "mean_token_accuracy": 0.9413776397705078,
199
  "num_tokens": 2617407.0,
200
  "step": 40
201
  },
202
  {
203
  "epoch": 0.7710843373493976,
204
- "eval_loss": 0.3249877095222473,
205
- "eval_mean_token_accuracy": 0.9019412114241413,
206
  "eval_num_tokens": 2617407.0,
207
- "eval_runtime": 69.6759,
208
- "eval_samples_per_second": 12.271,
209
  "eval_steps_per_second": 1.536,
210
  "step": 40
211
  },
212
  {
213
  "epoch": 0.8096385542168675,
214
- "grad_norm": 0.6295006274836726,
215
- "learning_rate": 9.673308342185366e-06,
216
- "loss": 0.1758,
217
- "mean_token_accuracy": 0.9410570487380028,
218
  "num_tokens": 2748479.0,
219
  "step": 42
220
  },
221
  {
222
  "epoch": 0.8481927710843373,
223
- "grad_norm": 0.6463634329636155,
224
- "learning_rate": 9.625583217693419e-06,
225
- "loss": 0.1852,
226
- "mean_token_accuracy": 0.9372252225875854,
227
  "num_tokens": 2879551.0,
228
  "step": 44
229
  },
230
  {
231
  "epoch": 0.8867469879518072,
232
- "grad_norm": 0.6060328826450527,
233
- "learning_rate": 9.574740129129767e-06,
234
- "loss": 0.1785,
235
- "mean_token_accuracy": 0.940302312374115,
236
  "num_tokens": 3010185.0,
237
  "step": 46
238
  },
239
  {
240
  "epoch": 0.9253012048192771,
241
- "grad_norm": 0.5780214961275311,
242
- "learning_rate": 9.520813348264252e-06,
243
- "loss": 0.1876,
244
- "mean_token_accuracy": 0.9373778849840164,
245
  "num_tokens": 3141257.0,
246
  "step": 48
247
  },
248
  {
249
  "epoch": 0.963855421686747,
250
- "grad_norm": 0.5827155602222767,
251
- "learning_rate": 9.46383922548932e-06,
252
- "loss": 0.1821,
253
- "mean_token_accuracy": 0.9392147175967693,
254
  "num_tokens": 3271788.0,
255
  "step": 50
256
  },
257
  {
258
  "epoch": 1.0192771084337349,
259
- "grad_norm": 0.9342011171038088,
260
- "learning_rate": 9.403856165317322e-06,
261
- "loss": 0.2506,
262
- "mean_token_accuracy": 0.9448522359132767,
263
  "num_tokens": 3435628.0,
264
  "step": 52
265
  },
266
  {
267
  "epoch": 1.0578313253012048,
268
- "grad_norm": 0.704578477475784,
269
- "learning_rate": 9.34090460049322e-06,
270
- "loss": 0.1469,
271
- "mean_token_accuracy": 0.9520105756819248,
272
  "num_tokens": 3566700.0,
273
  "step": 54
274
  },
275
  {
276
  "epoch": 1.0963855421686748,
277
- "grad_norm": 0.6346717783932901,
278
- "learning_rate": 9.275026964740101e-06,
279
- "loss": 0.1398,
280
- "mean_token_accuracy": 0.9541020505130291,
281
  "num_tokens": 3697772.0,
282
  "step": 56
283
  },
284
  {
285
  "epoch": 1.1349397590361445,
286
- "grad_norm": 0.6428670080495259,
287
- "learning_rate": 9.206267664155906e-06,
288
- "loss": 0.1456,
289
- "mean_token_accuracy": 0.951331228017807,
290
  "num_tokens": 3828844.0,
291
  "step": 58
292
  },
293
  {
294
  "epoch": 1.1734939759036145,
295
- "grad_norm": 0.8091470665632142,
296
- "learning_rate": 9.134673047280644e-06,
297
- "loss": 0.1443,
298
- "mean_token_accuracy": 0.9527967870235443,
299
  "num_tokens": 3959916.0,
300
  "step": 60
301
  },
302
  {
303
  "epoch": 1.1734939759036145,
304
- "eval_loss": 0.35964083671569824,
305
- "eval_mean_token_accuracy": 0.9007457895813701,
306
  "eval_num_tokens": 3959916.0,
307
- "eval_runtime": 69.862,
308
- "eval_samples_per_second": 12.238,
309
  "eval_steps_per_second": 1.532,
310
  "step": 60
311
  },
312
  {
313
  "epoch": 1.2120481927710842,
314
- "grad_norm": 0.6832260425200906,
315
- "learning_rate": 9.060291373854252e-06,
316
- "loss": 0.1373,
317
- "mean_token_accuracy": 0.9543332494795322,
318
  "num_tokens": 4089879.0,
319
  "step": 62
320
  },
321
  {
322
  "epoch": 1.2506024096385542,
323
- "grad_norm": 0.6250669964271302,
324
- "learning_rate": 8.98317278228618e-06,
325
- "loss": 0.1321,
326
- "mean_token_accuracy": 0.9564301520586014,
327
  "num_tokens": 4220951.0,
328
  "step": 64
329
  },
330
  {
331
  "epoch": 1.2891566265060241,
332
- "grad_norm": 0.6305219631459404,
333
- "learning_rate": 8.90336925585864e-06,
334
- "loss": 0.1489,
335
- "mean_token_accuracy": 0.9517128840088844,
336
  "num_tokens": 4352023.0,
337
  "step": 66
338
  },
339
  {
340
  "epoch": 1.3277108433734939,
341
- "grad_norm": 0.6841999502708318,
342
- "learning_rate": 8.820934587686247e-06,
343
- "loss": 0.1529,
344
- "mean_token_accuracy": 0.9492626525461674,
345
  "num_tokens": 4483095.0,
346
  "step": 68
347
  },
348
  {
349
  "epoch": 1.3662650602409638,
350
- "grad_norm": 0.6276577378608187,
351
- "learning_rate": 8.735924344455732e-06,
352
- "loss": 0.1385,
353
- "mean_token_accuracy": 0.9543463103473186,
354
  "num_tokens": 4614167.0,
355
  "step": 70
356
  },
357
  {
358
  "epoch": 1.4048192771084338,
359
- "grad_norm": 0.6218471880996238,
360
- "learning_rate": 8.64839582897015e-06,
361
- "loss": 0.1393,
362
- "mean_token_accuracy": 0.9541173167526722,
363
  "num_tokens": 4745239.0,
364
  "step": 72
365
  },
366
  {
367
  "epoch": 1.4433734939759035,
368
- "grad_norm": 0.6120450655890222,
369
- "learning_rate": 8.558408041522801e-06,
370
- "loss": 0.1384,
371
- "mean_token_accuracy": 0.9534379690885544,
372
  "num_tokens": 4876311.0,
373
  "step": 74
374
  },
375
  {
376
  "epoch": 1.4819277108433735,
377
- "grad_norm": 0.6447124346757005,
378
- "learning_rate": 8.466021640126946e-06,
379
- "loss": 0.1358,
380
- "mean_token_accuracy": 0.9546592682600021,
381
  "num_tokens": 5007383.0,
382
  "step": 76
383
  },
384
  {
385
  "epoch": 1.5204819277108435,
386
- "grad_norm": 0.6318066424629517,
387
- "learning_rate": 8.371298899628091e-06,
388
- "loss": 0.1507,
389
- "mean_token_accuracy": 0.950941938906908,
390
  "num_tokens": 5138455.0,
391
  "step": 78
392
  },
393
  {
394
  "epoch": 1.5590361445783132,
395
- "grad_norm": 0.6610045275388146,
396
- "learning_rate": 8.274303669726427e-06,
397
- "loss": 0.1433,
398
- "mean_token_accuracy": 0.9530486799776554,
399
  "num_tokens": 5269527.0,
400
  "step": 80
401
  },
402
  {
403
  "epoch": 1.5590361445783132,
404
- "eval_loss": 0.34594476222991943,
405
- "eval_mean_token_accuracy": 0.9009478883208516,
406
  "eval_num_tokens": 5269527.0,
407
- "eval_runtime": 69.6764,
408
- "eval_samples_per_second": 12.271,
409
  "eval_steps_per_second": 1.536,
410
  "step": 80
411
  },
412
  {
413
  "epoch": 1.5975903614457831,
414
- "grad_norm": 0.5753891679678019,
415
- "learning_rate": 8.175101331937692e-06,
416
- "loss": 0.1404,
417
- "mean_token_accuracy": 0.9533847123384476,
418
  "num_tokens": 5400161.0,
419
  "step": 82
420
  },
421
  {
422
  "epoch": 1.636144578313253,
423
- "grad_norm": 0.5509379887731481,
424
- "learning_rate": 8.073758755521506e-06,
425
- "loss": 0.145,
426
- "mean_token_accuracy": 0.9521632380783558,
427
  "num_tokens": 5531233.0,
428
  "step": 84
429
  },
430
  {
431
  "epoch": 1.6746987951807228,
432
- "grad_norm": 0.6885182433226127,
433
- "learning_rate": 7.970344252406832e-06,
434
- "loss": 0.1528,
435
- "mean_token_accuracy": 0.9493084512650967,
436
  "num_tokens": 5662305.0,
437
  "step": 86
438
  },
439
  {
440
  "epoch": 1.7132530120481928,
441
- "grad_norm": 0.6297519017429533,
442
- "learning_rate": 7.864927531145012e-06,
443
- "loss": 0.1316,
444
- "mean_token_accuracy": 0.9559161737561226,
445
  "num_tokens": 5792219.0,
446
  "step": 88
447
  },
448
  {
449
  "epoch": 1.7518072289156628,
450
- "grad_norm": 0.561550567385262,
451
- "learning_rate": 7.757579649921354e-06,
452
- "loss": 0.1428,
453
- "mean_token_accuracy": 0.9523311667144299,
454
  "num_tokens": 5923291.0,
455
  "step": 90
456
  },
457
  {
458
  "epoch": 1.7903614457831325,
459
- "grad_norm": 0.6644550404207953,
460
- "learning_rate": 7.648372968656995e-06,
461
- "loss": 0.1501,
462
- "mean_token_accuracy": 0.9510430619120598,
463
  "num_tokens": 6053822.0,
464
  "step": 92
465
  },
466
  {
467
  "epoch": 1.8289156626506025,
468
- "grad_norm": 0.9843503664159776,
469
- "learning_rate": 7.5373811002332785e-06,
470
- "loss": 0.144,
471
- "mean_token_accuracy": 0.951483890414238,
472
  "num_tokens": 6184894.0,
473
  "step": 94
474
  },
475
  {
476
  "epoch": 1.8674698795180724,
477
- "grad_norm": 0.5903343417311625,
478
- "learning_rate": 7.424678860871584e-06,
479
- "loss": 0.151,
480
- "mean_token_accuracy": 0.9494382441043854,
481
  "num_tokens": 6315149.0,
482
  "step": 96
483
  },
484
  {
485
  "epoch": 1.9060240963855422,
486
- "grad_norm": 0.5907430095754442,
487
- "learning_rate": 7.310342219701981e-06,
488
- "loss": 0.1387,
489
- "mean_token_accuracy": 0.9540838934481144,
490
  "num_tokens": 6445345.0,
491
  "step": 98
492
  },
493
  {
494
  "epoch": 1.944578313253012,
495
- "grad_norm": 0.6208604074228016,
496
- "learning_rate": 7.19444824755478e-06,
497
- "loss": 0.14,
498
- "mean_token_accuracy": 0.9524588361382484,
499
  "num_tokens": 6576344.0,
500
  "step": 100
501
  },
502
  {
503
  "epoch": 1.944578313253012,
504
- "eval_loss": 0.3435918688774109,
505
- "eval_mean_token_accuracy": 0.9010296330273708,
506
  "eval_num_tokens": 6576344.0,
507
- "eval_runtime": 69.7068,
508
- "eval_samples_per_second": 12.266,
509
- "eval_steps_per_second": 1.535,
510
  "step": 100
511
  },
512
  {
513
  "epoch": 1.983132530120482,
514
- "grad_norm": 0.622234817214251,
515
- "learning_rate": 7.0770750650094335e-06,
516
- "loss": 0.1467,
517
- "mean_token_accuracy": 0.9509495720267296,
518
  "num_tokens": 6707416.0,
519
  "step": 102
520
  },
521
  {
522
  "epoch": 2.0385542168674697,
523
- "grad_norm": 0.6076186741247155,
524
- "learning_rate": 6.958301789735853e-06,
525
- "loss": 0.1728,
526
- "mean_token_accuracy": 0.9648021578788757,
527
  "num_tokens": 6871256.0,
528
  "step": 104
529
  },
530
  {
531
  "epoch": 2.07710843373494,
532
- "grad_norm": 0.5962392778723037,
533
- "learning_rate": 6.838208483163601e-06,
534
- "loss": 0.1072,
535
- "mean_token_accuracy": 0.9662502221763134,
536
  "num_tokens": 7002255.0,
537
  "step": 106
538
  },
539
  {
540
  "epoch": 2.1156626506024097,
541
- "grad_norm": 0.6146925265036353,
542
- "learning_rate": 6.716876096514944e-06,
543
- "loss": 0.107,
544
- "mean_token_accuracy": 0.9663150422275066,
545
  "num_tokens": 7133327.0,
546
  "step": 108
547
  },
548
  {
549
  "epoch": 2.1542168674698794,
550
- "grad_norm": 0.5878685827878811,
551
- "learning_rate": 6.594386416238095e-06,
552
- "loss": 0.1066,
553
- "mean_token_accuracy": 0.9652048833668232,
554
  "num_tokens": 7263523.0,
555
  "step": 110
556
  },
557
  {
558
  "epoch": 2.1927710843373496,
559
- "grad_norm": 0.6074328881642814,
560
- "learning_rate": 6.470822008877482e-06,
561
- "loss": 0.1029,
562
- "mean_token_accuracy": 0.9668646268546581,
563
  "num_tokens": 7394595.0,
564
  "step": 112
565
  },
566
  {
567
  "epoch": 2.2313253012048193,
568
- "grad_norm": 0.7137977824767872,
569
- "learning_rate": 6.346266165418173e-06,
570
- "loss": 0.1028,
571
- "mean_token_accuracy": 0.9666585326194763,
572
  "num_tokens": 7525667.0,
573
  "step": 114
574
  },
575
  {
576
  "epoch": 2.269879518072289,
577
- "grad_norm": 0.5894226672362628,
578
- "learning_rate": 6.2208028451419575e-06,
579
- "loss": 0.111,
580
- "mean_token_accuracy": 0.9638800770044327,
581
  "num_tokens": 7656739.0,
582
  "step": 116
583
  },
584
  {
585
  "epoch": 2.3084337349397592,
586
- "grad_norm": 0.6184779986012526,
587
- "learning_rate": 6.094516619032975e-06,
588
- "loss": 0.1036,
589
- "mean_token_accuracy": 0.9668951593339443,
590
  "num_tokens": 7787811.0,
591
  "step": 118
592
  },
593
  {
594
  "epoch": 2.346987951807229,
595
- "grad_norm": 0.6253267998013609,
596
- "learning_rate": 5.967492612770999e-06,
597
- "loss": 0.1156,
598
- "mean_token_accuracy": 0.9622229151427746,
599
  "num_tokens": 7916616.0,
600
  "step": 120
601
  },
602
  {
603
  "epoch": 2.346987951807229,
604
- "eval_loss": 0.38264763355255127,
605
- "eval_mean_token_accuracy": 0.8993682889180763,
606
  "eval_num_tokens": 7916616.0,
607
- "eval_runtime": 69.7137,
608
- "eval_samples_per_second": 12.264,
609
- "eval_steps_per_second": 1.535,
610
  "step": 120
611
  },
612
  {
613
  "epoch": 2.3855421686746987,
614
- "grad_norm": 0.5808338051377092,
615
- "learning_rate": 5.839816449350824e-06,
616
- "loss": 0.096,
617
- "mean_token_accuracy": 0.9689866341650486,
618
  "num_tokens": 8047688.0,
619
  "step": 122
620
  },
621
  {
622
  "epoch": 2.4240963855421684,
623
- "grad_norm": 0.6319642719991769,
624
- "learning_rate": 5.711574191366427e-06,
625
- "loss": 0.1064,
626
- "mean_token_accuracy": 0.9659181199967861,
627
  "num_tokens": 8178760.0,
628
  "step": 124
629
  },
630
  {
631
  "epoch": 2.4626506024096386,
632
- "grad_norm": 0.6159005696450227,
633
- "learning_rate": 5.5828522829987965e-06,
634
- "loss": 0.1124,
635
- "mean_token_accuracy": 0.9639564082026482,
636
  "num_tokens": 8309832.0,
637
  "step": 126
638
  },
639
  {
640
  "epoch": 2.5012048192771084,
641
- "grad_norm": 0.5961087714027195,
642
- "learning_rate": 5.453737491746572e-06,
643
- "loss": 0.1027,
644
- "mean_token_accuracy": 0.9668798930943012,
645
  "num_tokens": 8440904.0,
646
  "step": 128
647
  },
648
  {
649
  "epoch": 2.539759036144578,
650
- "grad_norm": 0.6095611549299057,
651
- "learning_rate": 5.324316849938715e-06,
652
- "loss": 0.1053,
653
- "mean_token_accuracy": 0.965734925121069,
654
  "num_tokens": 8571976.0,
655
  "step": 130
656
  },
657
  {
658
  "epoch": 2.5783132530120483,
659
- "grad_norm": 0.5508938269425927,
660
- "learning_rate": 5.194677596068689e-06,
661
- "loss": 0.1078,
662
- "mean_token_accuracy": 0.9649029150605202,
663
  "num_tokens": 8703048.0,
664
  "step": 132
665
  },
666
  {
667
  "epoch": 2.616867469879518,
668
- "grad_norm": 0.5459171806933576,
669
- "learning_rate": 5.064907115989655e-06,
670
- "loss": 0.1061,
671
- "mean_token_accuracy": 0.9655211977660656,
672
  "num_tokens": 8834120.0,
673
  "step": 134
674
  },
675
  {
676
  "epoch": 2.6554216867469878,
677
- "grad_norm": 0.5700055233131458,
678
- "learning_rate": 4.935092884010347e-06,
679
- "loss": 0.1132,
680
- "mean_token_accuracy": 0.9629833847284317,
681
  "num_tokens": 8964375.0,
682
  "step": 136
683
  },
684
  {
685
  "epoch": 2.693975903614458,
686
- "grad_norm": 0.5624039262809987,
687
- "learning_rate": 4.805322403931312e-06,
688
- "loss": 0.1033,
689
- "mean_token_accuracy": 0.966806173324585,
690
  "num_tokens": 9094906.0,
691
  "step": 138
692
  },
693
  {
694
  "epoch": 2.7325301204819277,
695
- "grad_norm": 0.5859071114828392,
696
- "learning_rate": 4.6756831500612846e-06,
697
- "loss": 0.1055,
698
- "mean_token_accuracy": 0.9653303697705269,
699
  "num_tokens": 9225978.0,
700
  "step": 140
701
  },
702
  {
703
  "epoch": 2.7325301204819277,
704
- "eval_loss": 0.38171321153640747,
705
- "eval_mean_token_accuracy": 0.8996138817796083,
706
  "eval_num_tokens": 9225978.0,
707
- "eval_runtime": 69.6997,
708
- "eval_samples_per_second": 12.267,
709
- "eval_steps_per_second": 1.535,
710
  "step": 140
711
  },
712
  {
713
  "epoch": 2.7710843373493974,
714
- "grad_norm": 0.6722991310212527,
715
- "learning_rate": 4.546262508253429e-06,
716
- "loss": 0.1021,
717
- "mean_token_accuracy": 0.9667806625366211,
718
  "num_tokens": 9357050.0,
719
  "step": 142
720
  },
721
  {
722
  "epoch": 2.8096385542168676,
723
- "grad_norm": 0.6406013030976072,
724
- "learning_rate": 4.417147717001205e-06,
725
- "loss": 0.1135,
726
- "mean_token_accuracy": 0.9634297229349613,
727
  "num_tokens": 9488122.0,
728
  "step": 144
729
  },
730
  {
731
  "epoch": 2.8481927710843373,
732
- "grad_norm": 0.5462219273577149,
733
- "learning_rate": 4.2884258086335755e-06,
734
- "loss": 0.1084,
735
- "mean_token_accuracy": 0.9645517915487289,
736
  "num_tokens": 9619194.0,
737
  "step": 146
738
  },
739
  {
740
  "epoch": 2.886746987951807,
741
- "grad_norm": 0.6163829542440136,
742
- "learning_rate": 4.160183550649176e-06,
743
- "loss": 0.1136,
744
- "mean_token_accuracy": 0.963452622294426,
745
  "num_tokens": 9750266.0,
746
  "step": 148
747
  },
748
  {
749
  "epoch": 2.9253012048192772,
750
- "grad_norm": 0.5303018809042978,
751
- "learning_rate": 4.032507387229002e-06,
752
- "loss": 0.1027,
753
- "mean_token_accuracy": 0.9666890650987625,
754
  "num_tokens": 9881338.0,
755
  "step": 150
756
  },
757
  {
758
  "epoch": 2.963855421686747,
759
- "grad_norm": 0.5435012633400901,
760
- "learning_rate": 3.905483380967027e-06,
761
- "loss": 0.1034,
762
- "mean_token_accuracy": 0.9660157673060894,
763
  "num_tokens": 10011972.0,
764
  "step": 152
765
  },
766
  {
767
  "epoch": 3.019277108433735,
768
- "grad_norm": 0.9816680723651049,
769
- "learning_rate": 3.779197154858044e-06,
770
- "loss": 0.1457,
771
- "mean_token_accuracy": 0.9704201340675354,
772
  "num_tokens": 10175812.0,
773
  "step": 154
774
  },
775
  {
776
  "epoch": 3.057831325301205,
777
- "grad_norm": 0.613237465904118,
778
- "learning_rate": 3.6537338345818273e-06,
779
- "loss": 0.0837,
780
- "mean_token_accuracy": 0.97471147403121,
781
  "num_tokens": 10306884.0,
782
  "step": 156
783
  },
784
  {
785
  "epoch": 3.0963855421686746,
786
- "grad_norm": 0.5444415226957525,
787
- "learning_rate": 3.529177991122519e-06,
788
- "loss": 0.0833,
789
- "mean_token_accuracy": 0.9740656353533268,
790
  "num_tokens": 10437883.0,
791
  "step": 158
792
  },
793
  {
794
  "epoch": 3.1349397590361447,
795
- "grad_norm": 0.5037775641719325,
796
- "learning_rate": 3.4056135837619077e-06,
797
- "loss": 0.0808,
798
- "mean_token_accuracy": 0.9754060879349709,
799
  "num_tokens": 10568955.0,
800
  "step": 160
801
  },
802
  {
803
  "epoch": 3.1349397590361447,
804
- "eval_loss": 0.4214092493057251,
805
- "eval_mean_token_accuracy": 0.8985673874338097,
806
  "eval_num_tokens": 10568955.0,
807
- "eval_runtime": 69.9377,
808
- "eval_samples_per_second": 12.225,
809
- "eval_steps_per_second": 1.53,
810
  "step": 160
811
  },
812
  {
813
  "epoch": 3.1734939759036145,
814
- "grad_norm": 0.5101646067201494,
815
- "learning_rate": 3.2831239034850593e-06,
816
- "loss": 0.0836,
817
- "mean_token_accuracy": 0.974490113556385,
818
  "num_tokens": 10700027.0,
819
  "step": 162
820
  },
821
  {
822
  "epoch": 3.212048192771084,
823
- "grad_norm": 0.513538637831937,
824
- "learning_rate": 3.1617915168363994e-06,
825
- "loss": 0.0798,
826
- "mean_token_accuracy": 0.9749175682663918,
827
  "num_tokens": 10831099.0,
828
  "step": 164
829
  },
830
  {
831
  "epoch": 3.2506024096385544,
832
- "grad_norm": 0.6172273027962087,
833
- "learning_rate": 3.041698210264149e-06,
834
- "loss": 0.0846,
835
- "mean_token_accuracy": 0.9737649671733379,
836
  "num_tokens": 10962171.0,
837
  "step": 166
838
  },
839
  {
840
  "epoch": 3.289156626506024,
841
- "grad_norm": 0.598237570073578,
842
- "learning_rate": 2.9229249349905686e-06,
843
- "loss": 0.0822,
844
- "mean_token_accuracy": 0.9747206009924412,
845
  "num_tokens": 11092367.0,
846
  "step": 168
847
  },
848
  {
849
  "epoch": 3.327710843373494,
850
- "grad_norm": 0.5468187405423223,
851
- "learning_rate": 2.805551752445222e-06,
852
- "loss": 0.0741,
853
- "mean_token_accuracy": 0.9763831272721291,
854
  "num_tokens": 11223439.0,
855
  "step": 170
856
  },
857
  {
858
  "epoch": 3.3662650602409636,
859
- "grad_norm": 0.5360576538075228,
860
- "learning_rate": 2.689657780298019e-06,
861
- "loss": 0.0845,
862
- "mean_token_accuracy": 0.9735130742192268,
863
  "num_tokens": 11354511.0,
864
  "step": 172
865
  },
866
  {
867
  "epoch": 3.404819277108434,
868
- "grad_norm": 0.553617670290852,
869
- "learning_rate": 2.5753211391284172e-06,
870
- "loss": 0.0909,
871
- "mean_token_accuracy": 0.9720780476927757,
872
  "num_tokens": 11485583.0,
873
  "step": 174
874
  },
875
  {
876
  "epoch": 3.4433734939759035,
877
- "grad_norm": 0.5270914482632371,
878
- "learning_rate": 2.4626188997667224e-06,
879
- "loss": 0.0785,
880
- "mean_token_accuracy": 0.9754440970718861,
881
  "num_tokens": 11616217.0,
882
  "step": 176
883
  },
884
  {
885
  "epoch": 3.4819277108433733,
886
- "grad_norm": 0.5812147686235462,
887
- "learning_rate": 2.3516270313430085e-06,
888
- "loss": 0.0765,
889
- "mean_token_accuracy": 0.9763373285531998,
890
  "num_tokens": 11747289.0,
891
  "step": 178
892
  },
893
  {
894
  "epoch": 3.5204819277108435,
895
- "grad_norm": 0.5745069078285343,
896
- "learning_rate": 2.2424203500786473e-06,
897
  "loss": 0.0777,
898
- "mean_token_accuracy": 0.9760472699999809,
899
  "num_tokens": 11878361.0,
900
  "step": 180
901
  },
902
  {
903
  "epoch": 3.5204819277108435,
904
- "eval_loss": 0.434172123670578,
905
- "eval_mean_token_accuracy": 0.8980075516433359,
906
  "eval_num_tokens": 11878361.0,
907
- "eval_runtime": 69.6978,
908
- "eval_samples_per_second": 12.267,
909
- "eval_steps_per_second": 1.535,
910
  "step": 180
911
  },
912
  {
913
  "epoch": 3.559036144578313,
914
- "grad_norm": 0.5256822014984209,
915
- "learning_rate": 2.1350724688549906e-06,
916
- "loss": 0.0799,
917
- "mean_token_accuracy": 0.9748259708285332,
918
  "num_tokens": 12009433.0,
919
  "step": 182
920
  },
921
  {
922
  "epoch": 3.597590361445783,
923
- "grad_norm": 0.5162737081731001,
924
- "learning_rate": 2.029655747593169e-06,
925
- "loss": 0.0853,
926
- "mean_token_accuracy": 0.9729176908731461,
927
  "num_tokens": 12140505.0,
928
  "step": 184
929
  },
930
  {
931
  "epoch": 3.636144578313253,
932
- "grad_norm": 0.5160106591794058,
933
- "learning_rate": 1.926241244478496e-06,
934
- "loss": 0.08,
935
- "mean_token_accuracy": 0.975032065063715,
936
  "num_tokens": 12271577.0,
937
  "step": 186
938
  },
939
  {
940
  "epoch": 3.674698795180723,
941
- "grad_norm": 0.5536351679795151,
942
- "learning_rate": 1.8248986680623077e-06,
943
- "loss": 0.0826,
944
- "mean_token_accuracy": 0.973932895809412,
945
  "num_tokens": 12402649.0,
946
  "step": 188
947
  },
948
  {
949
  "epoch": 3.7132530120481926,
950
- "grad_norm": 0.524697340208831,
951
- "learning_rate": 1.7256963302735752e-06,
952
- "loss": 0.0861,
953
- "mean_token_accuracy": 0.973070353269577,
954
  "num_tokens": 12533721.0,
955
  "step": 190
956
  },
957
  {
958
  "epoch": 3.7518072289156628,
959
- "grad_norm": 0.5391119770045195,
960
- "learning_rate": 1.6287011003719105e-06,
961
- "loss": 0.0845,
962
- "mean_token_accuracy": 0.9731407649815083,
963
  "num_tokens": 12663435.0,
964
  "step": 192
965
  },
966
  {
967
  "epoch": 3.7903614457831325,
968
- "grad_norm": 0.5377909222832687,
969
- "learning_rate": 1.5339783598730568e-06,
970
- "loss": 0.0765,
971
- "mean_token_accuracy": 0.9760778024792671,
972
  "num_tokens": 12794507.0,
973
  "step": 194
974
  },
975
  {
976
  "epoch": 3.8289156626506022,
977
- "grad_norm": 0.5713118205205915,
978
- "learning_rate": 1.4415919584771999e-06,
979
- "loss": 0.0843,
980
- "mean_token_accuracy": 0.9733833111822605,
981
  "num_tokens": 12925579.0,
982
  "step": 196
983
  },
984
  {
985
  "epoch": 3.8674698795180724,
986
- "grad_norm": 0.5356073069528993,
987
- "learning_rate": 1.35160417102985e-06,
988
- "loss": 0.0808,
989
- "mean_token_accuracy": 0.9754137210547924,
990
  "num_tokens": 13056651.0,
991
  "step": 198
992
  },
993
  {
994
  "epoch": 3.906024096385542,
995
- "grad_norm": 0.5286524017708772,
996
- "learning_rate": 1.2640756555442684e-06,
997
- "loss": 0.0818,
998
- "mean_token_accuracy": 0.9743756167590618,
999
  "num_tokens": 13187723.0,
1000
  "step": 200
1001
  },
1002
  {
1003
  "epoch": 3.906024096385542,
1004
- "eval_loss": 0.436479389667511,
1005
- "eval_mean_token_accuracy": 0.8978864634148428,
1006
  "eval_num_tokens": 13187723.0,
1007
- "eval_runtime": 69.707,
1008
- "eval_samples_per_second": 12.266,
1009
- "eval_steps_per_second": 1.535,
1010
  "step": 200
1011
  },
1012
  {
1013
  "epoch": 3.944578313253012,
1014
- "grad_norm": 0.5493055348438105,
1015
- "learning_rate": 1.1790654123137552e-06,
1016
- "loss": 0.0838,
1017
- "mean_token_accuracy": 0.9733658656477928,
1018
  "num_tokens": 13317686.0,
1019
  "step": 202
1020
  },
1021
  {
1022
  "epoch": 3.983132530120482,
1023
- "grad_norm": 0.5079057943600861,
1024
- "learning_rate": 1.0966307441413598e-06,
1025
- "loss": 0.0777,
1026
- "mean_token_accuracy": 0.9752789959311485,
1027
  "num_tokens": 13447600.0,
1028
  "step": 204
1029
  },
1030
  {
1031
  "epoch": 4.03855421686747,
1032
- "grad_norm": 0.48888783353685356,
1033
- "learning_rate": 1.01682721771382e-06,
1034
- "loss": 0.1076,
1035
- "mean_token_accuracy": 0.978389111161232,
1036
  "num_tokens": 13611440.0,
1037
  "step": 206
1038
  },
1039
  {
1040
  "epoch": 4.0771084337349395,
1041
- "grad_norm": 0.501089775240556,
1042
- "learning_rate": 9.397086261457511e-07,
1043
- "loss": 0.068,
1044
- "mean_token_accuracy": 0.9800592921674252,
1045
  "num_tokens": 13740537.0,
1046
  "step": 208
1047
  },
1048
  {
1049
  "epoch": 4.11566265060241,
1050
- "grad_norm": 0.48538593277097636,
1051
- "learning_rate": 8.65326952719357e-07,
1052
- "loss": 0.0725,
1053
- "mean_token_accuracy": 0.97826087474823,
1054
  "num_tokens": 13871609.0,
1055
  "step": 210
1056
  },
1057
  {
1058
  "epoch": 4.15421686746988,
1059
- "grad_norm": 0.4434876328787144,
1060
- "learning_rate": 7.937323358440935e-07,
1061
- "loss": 0.0736,
1062
- "mean_token_accuracy": 0.9777952544391155,
1063
  "num_tokens": 14002681.0,
1064
  "step": 212
1065
  },
1066
  {
1067
  "epoch": 4.192771084337349,
1068
- "grad_norm": 0.4774125338338672,
1069
- "learning_rate": 7.249730352599e-07,
1070
- "loss": 0.0757,
1071
- "mean_token_accuracy": 0.9774899296462536,
1072
  "num_tokens": 14133753.0,
1073
  "step": 214
1074
  },
1075
  {
1076
  "epoch": 4.231325301204819,
1077
- "grad_norm": 0.4885171753944554,
1078
- "learning_rate": 6.590953995067812e-07,
1079
- "loss": 0.0706,
1080
- "mean_token_accuracy": 0.9788230285048485,
1081
  "num_tokens": 14264314.0,
1082
  "step": 216
1083
  },
1084
  {
1085
  "epoch": 4.2698795180722895,
1086
- "grad_norm": 0.43295382737919863,
1087
- "learning_rate": 5.961438346826792e-07,
1088
- "loss": 0.0671,
1089
- "mean_token_accuracy": 0.9799401611089706,
1090
  "num_tokens": 14395386.0,
1091
  "step": 218
1092
  },
1093
  {
1094
  "epoch": 4.308433734939759,
1095
- "grad_norm": 0.44853208185927035,
1096
- "learning_rate": 5.361607745106817e-07,
1097
- "loss": 0.0709,
1098
- "mean_token_accuracy": 0.9785433001816273,
1099
  "num_tokens": 14526458.0,
1100
  "step": 220
1101
  },
1102
  {
1103
  "epoch": 4.308433734939759,
1104
- "eval_loss": 0.4599273204803467,
1105
- "eval_mean_token_accuracy": 0.8972330873257646,
1106
  "eval_num_tokens": 14526458.0,
1107
- "eval_runtime": 69.7106,
1108
- "eval_samples_per_second": 12.265,
1109
- "eval_steps_per_second": 1.535,
1110
  "step": 220
1111
  },
1112
  {
1113
  "epoch": 4.346987951807229,
1114
- "grad_norm": 0.47608905203927504,
1115
- "learning_rate": 4.791866517357491e-07,
1116
- "loss": 0.0689,
1117
- "mean_token_accuracy": 0.9793831780552864,
1118
  "num_tokens": 14656421.0,
1119
  "step": 222
1120
  },
1121
  {
1122
  "epoch": 4.385542168674699,
1123
- "grad_norm": 0.44038761142354355,
1124
- "learning_rate": 4.2525987087023433e-07,
1125
- "loss": 0.0676,
1126
- "mean_token_accuracy": 0.9797569662332535,
1127
  "num_tokens": 14787493.0,
1128
  "step": 224
1129
  },
1130
  {
1131
  "epoch": 4.424096385542168,
1132
- "grad_norm": 0.45920050524546036,
1133
- "learning_rate": 3.744167823065814e-07,
1134
- "loss": 0.0662,
1135
- "mean_token_accuracy": 0.9802683852612972,
1136
  "num_tokens": 14918565.0,
1137
  "step": 226
1138
  },
1139
  {
1140
  "epoch": 4.462650602409639,
1141
- "grad_norm": 0.47097840934998153,
1142
- "learning_rate": 3.26691657814634e-07,
1143
- "loss": 0.0682,
1144
- "mean_token_accuracy": 0.979138683527708,
1145
  "num_tokens": 15049637.0,
1146
  "step": 228
1147
  },
1148
  {
1149
  "epoch": 4.501204819277109,
1150
- "grad_norm": 0.5013631017535977,
1151
- "learning_rate": 2.821166674400905e-07,
1152
- "loss": 0.071,
1153
- "mean_token_accuracy": 0.9781845435500145,
1154
  "num_tokens": 15180709.0,
1155
  "step": 230
1156
  },
1157
  {
1158
  "epoch": 4.539759036144578,
1159
- "grad_norm": 0.4623396825811124,
1160
- "learning_rate": 2.407218578196524e-07,
1161
- "loss": 0.0702,
1162
- "mean_token_accuracy": 0.9783219397068024,
1163
  "num_tokens": 15311781.0,
1164
  "step": 232
1165
  },
1166
  {
1167
  "epoch": 4.578313253012048,
1168
- "grad_norm": 0.4902395855979876,
1169
- "learning_rate": 2.0253513192751374e-07,
1170
- "loss": 0.0664,
1171
- "mean_token_accuracy": 0.9796730019152164,
1172
  "num_tokens": 15442853.0,
1173
  "step": 234
1174
  },
1175
  {
1176
  "epoch": 4.6168674698795185,
1177
- "grad_norm": 0.5039049018398478,
1178
- "learning_rate": 1.6758223026681507e-07,
1179
- "loss": 0.0798,
1180
- "mean_token_accuracy": 0.9753297567367554,
1181
  "num_tokens": 15573925.0,
1182
  "step": 236
1183
  },
1184
  {
1185
  "epoch": 4.655421686746988,
1186
- "grad_norm": 0.46061650797882114,
1187
- "learning_rate": 1.358867135187636e-07,
1188
- "loss": 0.0724,
1189
- "mean_token_accuracy": 0.9777494557201862,
1190
  "num_tokens": 15704997.0,
1191
  "step": 238
1192
  },
1193
  {
1194
  "epoch": 4.693975903614458,
1195
- "grad_norm": 0.4625716914461487,
1196
- "learning_rate": 1.0746994666109234e-07,
1197
- "loss": 0.0669,
1198
- "mean_token_accuracy": 0.9799096286296844,
1199
  "num_tokens": 15836069.0,
1200
  "step": 240
1201
  },
1202
  {
1203
  "epoch": 4.693975903614458,
1204
- "eval_loss": 0.46857714653015137,
1205
- "eval_mean_token_accuracy": 0.8969991552495511,
1206
  "eval_num_tokens": 15836069.0,
1207
- "eval_runtime": 69.7554,
1208
- "eval_samples_per_second": 12.257,
1209
- "eval_steps_per_second": 1.534,
1210
  "step": 240
1211
  },
1212
  {
1213
  "epoch": 4.732530120481927,
1214
- "grad_norm": 0.4944699774543825,
1215
- "learning_rate": 8.235108456658814e-08,
1216
- "loss": 0.078,
1217
- "mean_token_accuracy": 0.9763373285531998,
1218
  "num_tokens": 15967141.0,
1219
  "step": 242
1220
  },
1221
  {
1222
  "epoch": 4.771084337349397,
1223
- "grad_norm": 0.44131491699880177,
1224
- "learning_rate": 6.054705909137426e-08,
1225
- "loss": 0.064,
1226
- "mean_token_accuracy": 0.9806805737316608,
1227
  "num_tokens": 16098213.0,
1228
  "step": 244
1229
  },
1230
  {
1231
  "epoch": 4.809638554216868,
1232
- "grad_norm": 0.4628370285868946,
1233
- "learning_rate": 4.207256766166845e-08,
1234
- "loss": 0.0695,
1235
- "mean_token_accuracy": 0.9792837128043175,
1236
  "num_tokens": 16229285.0,
1237
  "step": 246
1238
  },
1239
  {
1240
  "epoch": 4.848192771084337,
1241
- "grad_norm": 0.5090336786908616,
1242
- "learning_rate": 2.6940063366693303e-08,
1243
- "loss": 0.0702,
1244
- "mean_token_accuracy": 0.9789783880114555,
1245
  "num_tokens": 16360357.0,
1246
  "step": 248
1247
  },
1248
  {
1249
  "epoch": 4.886746987951807,
1250
- "grad_norm": 0.4552845063798719,
1251
- "learning_rate": 1.51597465644332e-08,
1252
- "loss": 0.0664,
1253
- "mean_token_accuracy": 0.9799019955098629,
1254
  "num_tokens": 16491429.0,
1255
  "step": 250
1256
  },
1257
  {
1258
  "epoch": 4.925301204819277,
1259
- "grad_norm": 0.456376642155954,
1260
- "learning_rate": 6.739558005884883e-09,
1261
- "loss": 0.0656,
1262
- "mean_token_accuracy": 0.9802608676254749,
1263
  "num_tokens": 16621960.0,
1264
  "step": 252
1265
  },
1266
  {
1267
  "epoch": 4.9638554216867465,
1268
- "grad_norm": 0.4501859344400936,
1269
- "learning_rate": 1.6851734824380184e-09,
1270
- "loss": 0.0698,
1271
- "mean_token_accuracy": 0.9787566177546978,
1272
  "num_tokens": 16752156.0,
1273
  "step": 254
1274
  },
1275
  {
1276
  "epoch": 4.983132530120482,
1277
- "mean_token_accuracy": 0.9759251400828362,
1278
  "num_tokens": 16817692.0,
1279
  "step": 255,
1280
  "total_flos": 24409842647040.0,
1281
- "train_loss": 0.12131776079243305,
1282
- "train_runtime": 3783.5615,
1283
  "train_samples_per_second": 2.194,
1284
  "train_steps_per_second": 0.067
1285
  }
 
11
  "log_history": [
12
  {
13
  "epoch": 0.03855421686746988,
14
+ "grad_norm": 2.0457221564142256,
15
+ "learning_rate": 3.846153846153847e-07,
16
  "loss": 0.2354,
17
  "mean_token_accuracy": 0.930065356194973,
18
  "num_tokens": 131072.0,
 
20
  },
21
  {
22
  "epoch": 0.07710843373493977,
23
+ "grad_norm": 2.1086974646270145,
24
+ "learning_rate": 1.153846153846154e-06,
25
+ "loss": 0.2508,
26
+ "mean_token_accuracy": 0.9255465492606163,
27
  "num_tokens": 262144.0,
28
  "step": 4
29
  },
30
  {
31
  "epoch": 0.11566265060240964,
32
+ "grad_norm": 1.698182282959437,
33
+ "learning_rate": 1.9230769230769234e-06,
34
+ "loss": 0.2473,
35
+ "mean_token_accuracy": 0.9256381466984749,
36
  "num_tokens": 393216.0,
37
  "step": 6
38
  },
39
  {
40
  "epoch": 0.15421686746987953,
41
+ "grad_norm": 1.4331583326698771,
42
+ "learning_rate": 2.6923076923076923e-06,
43
+ "loss": 0.2193,
44
+ "mean_token_accuracy": 0.9314393177628517,
45
  "num_tokens": 524288.0,
46
  "step": 8
47
  },
48
  {
49
  "epoch": 0.1927710843373494,
50
+ "grad_norm": 1.280978852144958,
51
+ "learning_rate": 3.4615384615384617e-06,
52
+ "loss": 0.2205,
53
+ "mean_token_accuracy": 0.930450152605772,
54
  "num_tokens": 654484.0,
55
  "step": 10
56
  },
57
  {
58
  "epoch": 0.23132530120481928,
59
+ "grad_norm": 0.8255955634911271,
60
+ "learning_rate": 4.230769230769231e-06,
61
+ "loss": 0.2117,
62
+ "mean_token_accuracy": 0.9317141100764275,
63
  "num_tokens": 785556.0,
64
  "step": 12
65
  },
66
  {
67
  "epoch": 0.26987951807228916,
68
+ "grad_norm": 0.7584680371226415,
69
+ "learning_rate": 5e-06,
70
+ "loss": 0.206,
71
+ "mean_token_accuracy": 0.9338631108403206,
72
  "num_tokens": 915519.0,
73
  "step": 14
74
  },
75
  {
76
  "epoch": 0.30843373493975906,
77
+ "grad_norm": 0.9495192852210463,
78
+ "learning_rate": 5.769230769230769e-06,
79
+ "loss": 0.1982,
80
+ "mean_token_accuracy": 0.9358359947800636,
81
  "num_tokens": 1046591.0,
82
  "step": 16
83
  },
84
  {
85
  "epoch": 0.3469879518072289,
86
+ "grad_norm": 0.9714974283482016,
87
+ "learning_rate": 6.538461538461539e-06,
88
+ "loss": 0.2055,
89
+ "mean_token_accuracy": 0.9338132180273533,
90
  "num_tokens": 1177663.0,
91
  "step": 18
92
  },
93
  {
94
  "epoch": 0.3855421686746988,
95
+ "grad_norm": 0.6339236056292388,
96
+ "learning_rate": 7.307692307692308e-06,
97
+ "loss": 0.1917,
98
+ "mean_token_accuracy": 0.9378740377724171,
99
  "num_tokens": 1308735.0,
100
  "step": 20
101
  },
102
  {
103
  "epoch": 0.3855421686746988,
104
+ "eval_loss": 0.3343917727470398,
105
+ "eval_mean_token_accuracy": 0.9013295725127247,
106
  "eval_num_tokens": 1308735.0,
107
+ "eval_runtime": 70.0593,
108
+ "eval_samples_per_second": 12.204,
109
+ "eval_steps_per_second": 1.527,
110
  "step": 20
111
  },
112
  {
113
  "epoch": 0.42409638554216866,
114
+ "grad_norm": 0.7315888499202351,
115
+ "learning_rate": 8.076923076923077e-06,
116
+ "loss": 0.1809,
117
+ "mean_token_accuracy": 0.9400189444422722,
118
  "num_tokens": 1439807.0,
119
  "step": 22
120
  },
121
  {
122
  "epoch": 0.46265060240963857,
123
+ "grad_norm": 0.7642349616310066,
124
+ "learning_rate": 8.846153846153847e-06,
125
+ "loss": 0.1928,
126
+ "mean_token_accuracy": 0.9367095269262791,
127
  "num_tokens": 1570062.0,
128
  "step": 24
129
  },
130
  {
131
  "epoch": 0.5012048192771085,
132
+ "grad_norm": 0.6114978913375759,
133
+ "learning_rate": 9.615384615384616e-06,
134
+ "loss": 0.1828,
135
+ "mean_token_accuracy": 0.9394693598151207,
136
  "num_tokens": 1701134.0,
137
  "step": 26
138
  },
139
  {
140
  "epoch": 0.5397590361445783,
141
+ "grad_norm": 0.6229653774047121,
142
+ "learning_rate": 9.999529497453782e-06,
143
+ "loss": 0.1806,
144
+ "mean_token_accuracy": 0.9402282536029816,
145
  "num_tokens": 1832133.0,
146
  "step": 28
147
  },
148
  {
149
  "epoch": 0.5783132530120482,
150
+ "grad_norm": 0.6722415161460822,
151
+ "learning_rate": 9.99576600836172e-06,
152
+ "loss": 0.1896,
153
+ "mean_token_accuracy": 0.9363855794072151,
154
  "num_tokens": 1963205.0,
155
  "step": 30
156
  },
157
  {
158
  "epoch": 0.6168674698795181,
159
+ "grad_norm": 0.5974286474799401,
160
+ "learning_rate": 9.988241863214212e-06,
161
+ "loss": 0.1814,
162
+ "mean_token_accuracy": 0.9404540322721004,
163
  "num_tokens": 2094277.0,
164
  "step": 32
165
  },
166
  {
167
  "epoch": 0.655421686746988,
168
+ "grad_norm": 0.601035342701654,
169
+ "learning_rate": 9.976962725951878e-06,
170
+ "loss": 0.1801,
171
+ "mean_token_accuracy": 0.9400342106819153,
172
  "num_tokens": 2225349.0,
173
  "step": 34
174
  },
175
  {
176
  "epoch": 0.6939759036144578,
177
+ "grad_norm": 0.5765003488310966,
178
+ "learning_rate": 9.961937087155697e-06,
179
+ "loss": 0.1828,
180
+ "mean_token_accuracy": 0.9392519034445286,
181
  "num_tokens": 2355263.0,
182
  "step": 36
183
  },
184
  {
185
  "epoch": 0.7325301204819277,
186
+ "grad_norm": 34.52047518558373,
187
+ "learning_rate": 9.943176257655567e-06,
188
+ "loss": 0.2098,
189
+ "mean_token_accuracy": 0.9331491328775883,
190
  "num_tokens": 2486335.0,
191
  "step": 38
192
  },
193
  {
194
  "epoch": 0.7710843373493976,
195
+ "grad_norm": 0.6276699276820382,
196
+ "learning_rate": 9.920694360015864e-06,
197
+ "loss": 0.1745,
198
+ "mean_token_accuracy": 0.9413929060101509,
199
  "num_tokens": 2617407.0,
200
  "step": 40
201
  },
202
  {
203
  "epoch": 0.7710843373493976,
204
+ "eval_loss": 0.32280808687210083,
205
+ "eval_mean_token_accuracy": 0.9021720039510281,
206
  "eval_num_tokens": 2617407.0,
207
+ "eval_runtime": 69.6577,
208
+ "eval_samples_per_second": 12.274,
209
  "eval_steps_per_second": 1.536,
210
  "step": 40
211
  },
212
  {
213
  "epoch": 0.8096385542168675,
214
+ "grad_norm": 0.6015365123041743,
215
+ "learning_rate": 9.894508317904418e-06,
216
+ "loss": 0.1751,
217
+ "mean_token_accuracy": 0.9412707760930061,
218
  "num_tokens": 2748479.0,
219
  "step": 42
220
  },
221
  {
222
  "epoch": 0.8481927710843373,
223
+ "grad_norm": 0.6316203175238668,
224
+ "learning_rate": 9.864637843352916e-06,
225
+ "loss": 0.184,
226
+ "mean_token_accuracy": 0.9374923817813396,
227
  "num_tokens": 2879551.0,
228
  "step": 44
229
  },
230
  {
231
  "epoch": 0.8867469879518072,
232
+ "grad_norm": 0.5904610746669308,
233
+ "learning_rate": 9.831105421918287e-06,
234
+ "loss": 0.1777,
235
+ "mean_token_accuracy": 0.9405580870807171,
236
  "num_tokens": 3010185.0,
237
  "step": 46
238
  },
239
  {
240
  "epoch": 0.9253012048192771,
241
+ "grad_norm": 0.5994215271575196,
242
+ "learning_rate": 9.793936295756292e-06,
243
+ "loss": 0.187,
244
+ "mean_token_accuracy": 0.9375152811408043,
245
  "num_tokens": 3141257.0,
246
  "step": 48
247
  },
248
  {
249
  "epoch": 0.963855421686747,
250
+ "grad_norm": 0.5854742456446934,
251
+ "learning_rate": 9.753158444620013e-06,
252
+ "loss": 0.1815,
253
+ "mean_token_accuracy": 0.9394976831972599,
254
  "num_tokens": 3271788.0,
255
  "step": 50
256
  },
257
  {
258
  "epoch": 1.0192771084337349,
259
+ "grad_norm": 0.957499837849808,
260
+ "learning_rate": 9.70880256479758e-06,
261
+ "loss": 0.2534,
262
+ "mean_token_accuracy": 0.9437652796506881,
263
  "num_tokens": 3435628.0,
264
  "step": 52
265
  },
266
  {
267
  "epoch": 1.0578313253012048,
268
+ "grad_norm": 0.6854514205992324,
269
+ "learning_rate": 9.660902046004954e-06,
270
+ "loss": 0.151,
271
+ "mean_token_accuracy": 0.9503083899617195,
272
  "num_tokens": 3566700.0,
273
  "step": 54
274
  },
275
  {
276
  "epoch": 1.0963855421686748,
277
+ "grad_norm": 0.6080507225701574,
278
+ "learning_rate": 9.60949294625121e-06,
279
+ "loss": 0.1415,
280
+ "mean_token_accuracy": 0.9535066671669483,
281
  "num_tokens": 3697772.0,
282
  "step": 56
283
  },
284
  {
285
  "epoch": 1.1349397590361445,
286
+ "grad_norm": 0.6054065882233389,
287
+ "learning_rate": 9.554613964695189e-06,
288
+ "loss": 0.1493,
289
+ "mean_token_accuracy": 0.9502549581229687,
290
  "num_tokens": 3828844.0,
291
  "step": 58
292
  },
293
  {
294
  "epoch": 1.1734939759036145,
295
+ "grad_norm": 0.7694600057204949,
296
+ "learning_rate": 9.496306412513989e-06,
297
+ "loss": 0.1462,
298
+ "mean_token_accuracy": 0.9519953094422817,
299
  "num_tokens": 3959916.0,
300
  "step": 60
301
  },
302
  {
303
  "epoch": 1.1734939759036145,
304
+ "eval_loss": 0.359206885099411,
305
+ "eval_mean_token_accuracy": 0.9007850846397543,
306
  "eval_num_tokens": 3959916.0,
307
+ "eval_runtime": 69.8215,
308
+ "eval_samples_per_second": 12.246,
309
  "eval_steps_per_second": 1.532,
310
  "step": 60
311
  },
312
  {
313
  "epoch": 1.2120481927710842,
314
+ "grad_norm": 0.6845669867023433,
315
+ "learning_rate": 9.434614181805203e-06,
316
+ "loss": 0.1407,
317
+ "mean_token_accuracy": 0.9533876590430737,
318
  "num_tokens": 4089879.0,
319
  "step": 62
320
  },
321
  {
322
  "epoch": 1.2506024096385542,
323
+ "grad_norm": 0.6197114152379135,
324
+ "learning_rate": 9.369583712546322e-06,
325
+ "loss": 0.1349,
326
+ "mean_token_accuracy": 0.9554836452007294,
327
  "num_tokens": 4220951.0,
328
  "step": 64
329
  },
330
  {
331
  "epoch": 1.2891566265060241,
332
+ "grad_norm": 0.6172158164875755,
333
+ "learning_rate": 9.30126395763618e-06,
334
+ "loss": 0.1535,
335
+ "mean_token_accuracy": 0.95006413012743,
336
  "num_tokens": 4352023.0,
337
  "step": 66
338
  },
339
  {
340
  "epoch": 1.3277108433734939,
341
+ "grad_norm": 0.6409060214608714,
342
+ "learning_rate": 9.229706346044749e-06,
343
+ "loss": 0.156,
344
+ "mean_token_accuracy": 0.9484306424856186,
345
  "num_tokens": 4483095.0,
346
  "step": 68
347
  },
348
  {
349
  "epoch": 1.3662650602409638,
350
+ "grad_norm": 0.6166450609513697,
351
+ "learning_rate": 9.154964744099006e-06,
352
+ "loss": 0.1419,
353
+ "mean_token_accuracy": 0.9533540047705173,
354
  "num_tokens": 4614167.0,
355
  "step": 70
356
  },
357
  {
358
  "epoch": 1.4048192771084338,
359
+ "grad_norm": 0.6058092262037136,
360
+ "learning_rate": 9.077095414934076e-06,
361
+ "loss": 0.1439,
362
+ "mean_token_accuracy": 0.9524685628712177,
363
  "num_tokens": 4745239.0,
364
  "step": 72
365
  },
366
  {
367
  "epoch": 1.4433734939759035,
368
+ "grad_norm": 0.6464674278239464,
369
+ "learning_rate": 8.996156976140088e-06,
370
+ "loss": 0.1427,
371
+ "mean_token_accuracy": 0.9521632380783558,
372
  "num_tokens": 4876311.0,
373
  "step": 74
374
  },
375
  {
376
  "epoch": 1.4819277108433735,
377
+ "grad_norm": 0.6232124362016298,
378
+ "learning_rate": 8.91221035563669e-06,
379
+ "loss": 0.1387,
380
+ "mean_token_accuracy": 0.9537738263607025,
381
  "num_tokens": 5007383.0,
382
  "step": 76
383
  },
384
  {
385
  "epoch": 1.5204819277108435,
386
+ "grad_norm": 0.6251055517263481,
387
+ "learning_rate": 8.82531874580844e-06,
388
+ "loss": 0.1544,
389
+ "mean_token_accuracy": 0.9496977403759956,
390
  "num_tokens": 5138455.0,
391
  "step": 78
392
  },
393
  {
394
  "epoch": 1.5590361445783132,
395
+ "grad_norm": 0.6597130966145244,
396
+ "learning_rate": 8.735547555935538e-06,
397
+ "loss": 0.1467,
398
+ "mean_token_accuracy": 0.951957143843174,
399
  "num_tokens": 5269527.0,
400
  "step": 80
401
  },
402
  {
403
  "epoch": 1.5590361445783132,
404
+ "eval_loss": 0.34304243326187134,
405
+ "eval_mean_token_accuracy": 0.9011661727851796,
406
  "eval_num_tokens": 5269527.0,
407
+ "eval_runtime": 69.6573,
408
+ "eval_samples_per_second": 12.274,
409
  "eval_steps_per_second": 1.536,
410
  "step": 80
411
  },
412
  {
413
  "epoch": 1.5975903614457831,
414
+ "grad_norm": 0.6093216234766912,
415
+ "learning_rate": 8.642964362955781e-06,
416
+ "loss": 0.145,
417
+ "mean_token_accuracy": 0.9515700563788414,
418
  "num_tokens": 5400161.0,
419
  "step": 82
420
  },
421
  {
422
  "epoch": 1.636144578313253,
423
+ "grad_norm": 0.5687703380048487,
424
+ "learning_rate": 8.547638860594765e-06,
425
+ "loss": 0.1484,
426
+ "mean_token_accuracy": 0.9509495720267296,
427
  "num_tokens": 5531233.0,
428
  "step": 84
429
  },
430
  {
431
  "epoch": 1.6746987951807228,
432
+ "grad_norm": 0.6551898466798518,
433
+ "learning_rate": 8.449642806902623e-06,
434
+ "loss": 0.1568,
435
+ "mean_token_accuracy": 0.9481558501720428,
436
  "num_tokens": 5662305.0,
437
  "step": 86
438
  },
439
  {
440
  "epoch": 1.7132530120481928,
441
+ "grad_norm": 0.6433780292504243,
442
+ "learning_rate": 8.349049970236822e-06,
443
+ "loss": 0.1349,
444
+ "mean_token_accuracy": 0.954715259373188,
445
  "num_tokens": 5792219.0,
446
  "step": 88
447
  },
448
  {
449
  "epoch": 1.7518072289156628,
450
+ "grad_norm": 0.5701046312406493,
451
+ "learning_rate": 8.245936073731654e-06,
452
+ "loss": 0.147,
453
+ "mean_token_accuracy": 0.9507969096302986,
454
  "num_tokens": 5923291.0,
455
  "step": 90
456
  },
457
  {
458
  "epoch": 1.7903614457831325,
459
+ "grad_norm": 0.6865332623152001,
460
+ "learning_rate": 8.140378738296233e-06,
461
+ "loss": 0.1529,
462
+ "mean_token_accuracy": 0.9498768150806427,
463
  "num_tokens": 6053822.0,
464
  "step": 92
465
  },
466
  {
467
  "epoch": 1.8289156626506025,
468
+ "grad_norm": 0.6305307568855328,
469
+ "learning_rate": 8.032457424183909e-06,
470
+ "loss": 0.1476,
471
+ "mean_token_accuracy": 0.9505984485149384,
472
  "num_tokens": 6184894.0,
473
  "step": 94
474
  },
475
  {
476
  "epoch": 1.8674698795180724,
477
+ "grad_norm": 0.5748443476790706,
478
+ "learning_rate": 7.922253371177081e-06,
479
+ "loss": 0.155,
480
+ "mean_token_accuracy": 0.9482144415378571,
481
  "num_tokens": 6315149.0,
482
  "step": 96
483
  },
484
  {
485
  "epoch": 1.9060240963855422,
486
+ "grad_norm": 0.5993128969226361,
487
+ "learning_rate": 7.809849537432432e-06,
488
+ "loss": 0.1434,
489
+ "mean_token_accuracy": 0.9525645859539509,
490
  "num_tokens": 6445345.0,
491
  "step": 98
492
  },
493
  {
494
  "epoch": 1.944578313253012,
495
+ "grad_norm": 0.6280456904784001,
496
+ "learning_rate": 7.695330537032629e-06,
497
+ "loss": 0.1445,
498
+ "mean_token_accuracy": 0.9512222707271576,
499
  "num_tokens": 6576344.0,
500
  "step": 100
501
  },
502
  {
503
  "epoch": 1.944578313253012,
504
+ "eval_loss": 0.3398211598396301,
505
+ "eval_mean_token_accuracy": 0.901328669530209,
506
  "eval_num_tokens": 6576344.0,
507
+ "eval_runtime": 69.654,
508
+ "eval_samples_per_second": 12.275,
509
+ "eval_steps_per_second": 1.536,
510
  "step": 100
511
  },
512
  {
513
  "epoch": 1.983132530120482,
514
+ "grad_norm": 0.6197902890500856,
515
+ "learning_rate": 7.578782576291501e-06,
516
+ "loss": 0.1506,
517
+ "mean_token_accuracy": 0.9492092207074165,
518
  "num_tokens": 6707416.0,
519
  "step": 102
520
  },
521
  {
522
  "epoch": 2.0385542168674697,
523
+ "grad_norm": 0.6409344863530665,
524
+ "learning_rate": 7.460293388860616e-06,
525
+ "loss": 0.1754,
526
+ "mean_token_accuracy": 0.9643502771854401,
527
  "num_tokens": 6871256.0,
528
  "step": 104
529
  },
530
  {
531
  "epoch": 2.07710843373494,
532
+ "grad_norm": 0.6097248296204885,
533
+ "learning_rate": 7.3399521696861505e-06,
534
+ "loss": 0.1092,
535
+ "mean_token_accuracy": 0.9659219309687614,
536
  "num_tokens": 7002255.0,
537
  "step": 106
538
  },
539
  {
540
  "epoch": 2.1156626506024097,
541
+ "grad_norm": 0.5903613108322504,
542
+ "learning_rate": 7.217849507865724e-06,
543
+ "loss": 0.1066,
544
+ "mean_token_accuracy": 0.9660860486328602,
545
  "num_tokens": 7133327.0,
546
  "step": 108
547
  },
548
  {
549
  "epoch": 2.1542168674698794,
550
+ "grad_norm": 0.625091072426359,
551
+ "learning_rate": 7.094077318455762e-06,
552
+ "loss": 0.1091,
553
+ "mean_token_accuracy": 0.9645588099956512,
554
  "num_tokens": 7263523.0,
555
  "step": 110
556
  },
557
  {
558
  "epoch": 2.1927710843373496,
559
+ "grad_norm": 0.6604015968164485,
560
+ "learning_rate": 6.96872877328073e-06,
561
+ "loss": 0.1052,
562
+ "mean_token_accuracy": 0.9661929123103619,
563
  "num_tokens": 7394595.0,
564
  "step": 112
565
  },
566
  {
567
  "epoch": 2.2313253012048193,
568
+ "grad_norm": 0.7455880093770229,
569
+ "learning_rate": 6.841898230796302e-06,
570
+ "loss": 0.1049,
571
+ "mean_token_accuracy": 0.9661089479923248,
572
  "num_tokens": 7525667.0,
573
  "step": 114
574
  },
575
  {
576
  "epoch": 2.269879518072289,
577
+ "grad_norm": 0.6028303919109465,
578
+ "learning_rate": 6.713681165059271e-06,
579
+ "loss": 0.1127,
580
+ "mean_token_accuracy": 0.9631625637412071,
581
  "num_tokens": 7656739.0,
582
  "step": 116
583
  },
584
  {
585
  "epoch": 2.3084337349397592,
586
+ "grad_norm": 0.6799912009709536,
587
+ "learning_rate": 6.584174093857676e-06,
588
+ "loss": 0.1035,
589
+ "mean_token_accuracy": 0.9669562242925167,
590
  "num_tokens": 7787811.0,
591
  "step": 118
592
  },
593
  {
594
  "epoch": 2.346987951807229,
595
+ "grad_norm": 0.6255570427114552,
596
+ "learning_rate": 6.453474506055228e-06,
597
+ "loss": 0.1176,
598
+ "mean_token_accuracy": 0.9615787602961063,
599
  "num_tokens": 7916616.0,
600
  "step": 120
601
  },
602
  {
603
  "epoch": 2.346987951807229,
604
+ "eval_loss": 0.38193774223327637,
605
+ "eval_mean_token_accuracy": 0.8994210568543907,
606
  "eval_num_tokens": 7916616.0,
607
+ "eval_runtime": 69.6436,
608
+ "eval_samples_per_second": 12.277,
609
+ "eval_steps_per_second": 1.536,
610
  "step": 120
611
  },
612
  {
613
  "epoch": 2.3855421686746987,
614
+ "grad_norm": 0.6279356138996781,
615
+ "learning_rate": 6.3216807882047585e-06,
616
+ "loss": 0.0974,
617
+ "mean_token_accuracy": 0.968185156583786,
618
  "num_tokens": 8047688.0,
619
  "step": 122
620
  },
621
  {
622
  "epoch": 2.4240963855421684,
623
+ "grad_norm": 0.6479503216427691,
624
+ "learning_rate": 6.188892150485904e-06,
625
+ "loss": 0.1087,
626
+ "mean_token_accuracy": 0.9651853404939175,
627
  "num_tokens": 8178760.0,
628
  "step": 124
629
  },
630
  {
631
  "epoch": 2.4626506024096386,
632
+ "grad_norm": 0.7228376218883897,
633
+ "learning_rate": 6.0552085520227875e-06,
634
+ "loss": 0.1136,
635
+ "mean_token_accuracy": 0.9631396643817425,
636
  "num_tokens": 8309832.0,
637
  "step": 126
638
  },
639
  {
640
  "epoch": 2.5012048192771084,
641
+ "grad_norm": 0.6292530226739607,
642
+ "learning_rate": 5.920730625637934e-06,
643
+ "loss": 0.1043,
644
+ "mean_token_accuracy": 0.9666203670203686,
645
  "num_tokens": 8440904.0,
646
  "step": 128
647
  },
648
  {
649
  "epoch": 2.539759036144578,
650
+ "grad_norm": 0.6120273359022707,
651
+ "learning_rate": 5.785559602099019e-06,
652
+ "loss": 0.1073,
653
+ "mean_token_accuracy": 0.9648876488208771,
654
  "num_tokens": 8571976.0,
655
  "step": 130
656
  },
657
  {
658
  "epoch": 2.5783132530120483,
659
+ "grad_norm": 0.6294342722298523,
660
+ "learning_rate": 5.649797233915539e-06,
661
+ "loss": 0.1092,
662
+ "mean_token_accuracy": 0.9644067622721195,
663
  "num_tokens": 8703048.0,
664
  "step": 132
665
  },
666
  {
667
  "epoch": 2.616867469879518,
668
+ "grad_norm": 0.5665304014502571,
669
+ "learning_rate": 5.513545718742702e-06,
670
+ "loss": 0.1086,
671
+ "mean_token_accuracy": 0.9646815545856953,
672
  "num_tokens": 8834120.0,
673
  "step": 134
674
  },
675
  {
676
  "epoch": 2.6554216867469878,
677
+ "grad_norm": 0.5673111264101424,
678
+ "learning_rate": 5.376907622450229e-06,
679
+ "loss": 0.1154,
680
+ "mean_token_accuracy": 0.9624109007418156,
681
  "num_tokens": 8964375.0,
682
  "step": 136
683
  },
684
  {
685
  "epoch": 2.693975903614458,
686
+ "grad_norm": 0.5636466902202368,
687
+ "learning_rate": 5.2399858019140005e-06,
688
+ "loss": 0.1045,
689
+ "mean_token_accuracy": 0.9666311480104923,
690
  "num_tokens": 9094906.0,
691
  "step": 138
692
  },
693
  {
694
  "epoch": 2.7325301204819277,
695
+ "grad_norm": 0.5754464602822424,
696
+ "learning_rate": 5.102883327588608e-06,
697
+ "loss": 0.1075,
698
+ "mean_token_accuracy": 0.9647044539451599,
699
  "num_tokens": 9225978.0,
700
  "step": 140
701
  },
702
  {
703
  "epoch": 2.7325301204819277,
704
+ "eval_loss": 0.37826669216156006,
705
+ "eval_mean_token_accuracy": 0.8995784972315637,
706
  "eval_num_tokens": 9225978.0,
707
+ "eval_runtime": 69.6803,
708
+ "eval_samples_per_second": 12.27,
709
+ "eval_steps_per_second": 1.536,
710
  "step": 140
711
  },
712
  {
713
  "epoch": 2.7710843373493974,
714
+ "grad_norm": 0.5987257906687522,
715
+ "learning_rate": 4.965703405919154e-06,
716
+ "loss": 0.1041,
717
+ "mean_token_accuracy": 0.9660173505544662,
718
  "num_tokens": 9357050.0,
719
  "step": 142
720
  },
721
  {
722
  "epoch": 2.8096385542168676,
723
+ "grad_norm": 0.6727909756019579,
724
+ "learning_rate": 4.828549301650673e-06,
725
+ "loss": 0.1165,
726
+ "mean_token_accuracy": 0.9626206122338772,
727
  "num_tokens": 9488122.0,
728
  "step": 144
729
  },
730
  {
731
  "epoch": 2.8481927710843373,
732
+ "grad_norm": 0.5483728501054262,
733
+ "learning_rate": 4.691524260093672e-06,
734
+ "loss": 0.1101,
735
+ "mean_token_accuracy": 0.9640556387603283,
736
  "num_tokens": 9619194.0,
737
  "step": 146
738
  },
739
  {
740
  "epoch": 2.886746987951807,
741
+ "grad_norm": 0.6578615356471254,
742
+ "learning_rate": 4.554731429404293e-06,
743
+ "loss": 0.1167,
744
+ "mean_token_accuracy": 0.9623610861599445,
745
  "num_tokens": 9750266.0,
746
  "step": 148
747
  },
748
  {
749
  "epoch": 2.9253012048192772,
750
+ "grad_norm": 0.544341897970942,
751
+ "learning_rate": 4.4182737829376135e-06,
752
+ "loss": 0.1068,
753
+ "mean_token_accuracy": 0.965429600328207,
754
  "num_tokens": 9881338.0,
755
  "step": 150
756
  },
757
  {
758
  "epoch": 2.963855421686747,
759
+ "grad_norm": 0.5807218274090602,
760
+ "learning_rate": 4.28225404173254e-06,
761
+ "loss": 0.1058,
762
+ "mean_token_accuracy": 0.965176422148943,
763
  "num_tokens": 10011972.0,
764
  "step": 152
765
  },
766
  {
767
  "epoch": 3.019277108433735,
768
+ "grad_norm": 1.007803950038667,
769
+ "learning_rate": 4.146774597186622e-06,
770
+ "loss": 0.1488,
771
+ "mean_token_accuracy": 0.9695591181516647,
772
  "num_tokens": 10175812.0,
773
  "step": 154
774
  },
775
  {
776
  "epoch": 3.057831325301205,
777
+ "grad_norm": 0.6613641201206724,
778
+ "learning_rate": 4.011937433979014e-06,
779
+ "loss": 0.0847,
780
+ "mean_token_accuracy": 0.9746656753122807,
781
  "num_tokens": 10306884.0,
782
  "step": 156
783
  },
784
  {
785
  "epoch": 3.0963855421686746,
786
+ "grad_norm": 0.5427167115705699,
787
+ "learning_rate": 3.87784405329962e-06,
788
+ "loss": 0.0838,
789
+ "mean_token_accuracy": 0.9741344675421715,
790
  "num_tokens": 10437883.0,
791
  "step": 158
792
  },
793
  {
794
  "epoch": 3.1349397590361447,
795
+ "grad_norm": 0.5059704125761413,
796
+ "learning_rate": 3.744595396442169e-06,
797
+ "loss": 0.0814,
798
+ "mean_token_accuracy": 0.9750473313033581,
799
  "num_tokens": 10568955.0,
800
  "step": 160
801
  },
802
  {
803
  "epoch": 3.1349397590361447,
804
+ "eval_loss": 0.4201391637325287,
805
+ "eval_mean_token_accuracy": 0.8986482670374005,
806
  "eval_num_tokens": 10568955.0,
807
+ "eval_runtime": 69.8903,
808
+ "eval_samples_per_second": 12.233,
809
+ "eval_steps_per_second": 1.531,
810
  "step": 160
811
  },
812
  {
813
  "epoch": 3.1734939759036145,
814
+ "grad_norm": 0.4955524619584041,
815
+ "learning_rate": 3.612291768818772e-06,
816
+ "loss": 0.0827,
817
+ "mean_token_accuracy": 0.9744977466762066,
818
  "num_tokens": 10700027.0,
819
  "step": 162
820
  },
821
  {
822
  "epoch": 3.212048192771084,
823
+ "grad_norm": 0.5481909266796648,
824
+ "learning_rate": 3.4810327644531606e-06,
825
+ "loss": 0.0804,
826
+ "mean_token_accuracy": 0.9746122434735298,
827
  "num_tokens": 10831099.0,
828
  "step": 164
829
  },
830
  {
831
  "epoch": 3.2506024096385544,
832
+ "grad_norm": 0.5869274418415635,
833
+ "learning_rate": 3.3509171910094162e-06,
834
+ "loss": 0.0849,
835
+ "mean_token_accuracy": 0.9735665060579777,
836
  "num_tokens": 10962171.0,
837
  "step": 166
838
  },
839
  {
840
  "epoch": 3.289156626506024,
841
+ "grad_norm": 0.5997938570160334,
842
+ "learning_rate": 3.222042995412669e-06,
843
+ "loss": 0.0826,
844
+ "mean_token_accuracy": 0.9744274839758873,
845
  "num_tokens": 11092367.0,
846
  "step": 168
847
  },
848
  {
849
  "epoch": 3.327710843373494,
850
+ "grad_norm": 0.5638967234440626,
851
+ "learning_rate": 3.094507190117715e-06,
852
+ "loss": 0.0752,
853
+ "mean_token_accuracy": 0.9760014712810516,
854
  "num_tokens": 11223439.0,
855
  "step": 170
856
  },
857
  {
858
  "epoch": 3.3662650602409636,
859
+ "grad_norm": 0.5677450107311146,
860
+ "learning_rate": 2.9684057800810844e-06,
861
+ "loss": 0.0849,
862
+ "mean_token_accuracy": 0.9734520092606544,
863
  "num_tokens": 11354511.0,
864
  "step": 172
865
  },
866
  {
867
  "epoch": 3.404819277108434,
868
+ "grad_norm": 0.5694190125459168,
869
+ "learning_rate": 2.8438336904915186e-06,
870
+ "loss": 0.0907,
871
+ "mean_token_accuracy": 0.9719940833747387,
872
  "num_tokens": 11485583.0,
873
  "step": 174
874
  },
875
  {
876
  "epoch": 3.4433734939759035,
877
+ "grad_norm": 0.5008764813796651,
878
+ "learning_rate": 2.7208846953132685e-06,
879
+ "loss": 0.0782,
880
+ "mean_token_accuracy": 0.9755356945097446,
881
  "num_tokens": 11616217.0,
882
  "step": 176
883
  },
884
  {
885
  "epoch": 3.4819277108433733,
886
+ "grad_norm": 0.5027767263738213,
887
+ "learning_rate": 2.599651346695979e-06,
888
+ "loss": 0.0773,
889
+ "mean_token_accuracy": 0.9762609973549843,
890
  "num_tokens": 11747289.0,
891
  "step": 178
892
  },
893
  {
894
  "epoch": 3.5204819277108435,
895
+ "grad_norm": 0.5747857741850161,
896
+ "learning_rate": 2.4802249053043525e-06,
897
  "loss": 0.0777,
898
+ "mean_token_accuracy": 0.976215198636055,
899
  "num_tokens": 11878361.0,
900
  "step": 180
901
  },
902
  {
903
  "epoch": 3.5204819277108435,
904
+ "eval_loss": 0.43149346113204956,
905
+ "eval_mean_token_accuracy": 0.898219308563482,
906
  "eval_num_tokens": 11878361.0,
907
+ "eval_runtime": 69.6743,
908
+ "eval_samples_per_second": 12.271,
909
+ "eval_steps_per_second": 1.536,
910
  "step": 180
911
  },
912
  {
913
  "epoch": 3.559036144578313,
914
+ "grad_norm": 0.5115273312879999,
915
+ "learning_rate": 2.3626952716199647e-06,
916
+ "loss": 0.0792,
917
+ "mean_token_accuracy": 0.9750167988240719,
918
  "num_tokens": 12009433.0,
919
  "step": 182
920
  },
921
  {
922
  "epoch": 3.597590361445783,
923
+ "grad_norm": 0.5172911491980401,
924
+ "learning_rate": 2.247150918267008e-06,
925
+ "loss": 0.0851,
926
+ "mean_token_accuracy": 0.9730398207902908,
927
  "num_tokens": 12140505.0,
928
  "step": 184
929
  },
930
  {
931
  "epoch": 3.636144578313253,
932
+ "grad_norm": 0.5260093719963543,
933
+ "learning_rate": 2.133678823412873e-06,
934
+ "loss": 0.0797,
935
+ "mean_token_accuracy": 0.9751236625015736,
936
  "num_tokens": 12271577.0,
937
  "step": 186
938
  },
939
  {
940
  "epoch": 3.674698795180723,
941
+ "grad_norm": 0.5267292864138245,
942
+ "learning_rate": 2.022364405293703e-06,
943
+ "loss": 0.0832,
944
+ "mean_token_accuracy": 0.9738947302103043,
945
  "num_tokens": 12402649.0,
946
  "step": 188
947
  },
948
  {
949
  "epoch": 3.7132530120481926,
950
+ "grad_norm": 0.5065512725199254,
951
+ "learning_rate": 1.913291457914234e-06,
952
+ "loss": 0.0856,
953
+ "mean_token_accuracy": 0.9732001163065434,
954
  "num_tokens": 12533721.0,
955
  "step": 190
956
  },
957
  {
958
  "epoch": 3.7518072289156628,
959
+ "grad_norm": 0.5465242770321679,
960
+ "learning_rate": 1.8065420879702888e-06,
961
+ "loss": 0.0838,
962
+ "mean_token_accuracy": 0.9731762520968914,
963
  "num_tokens": 12663435.0,
964
  "step": 192
965
  },
966
  {
967
  "epoch": 3.7903614457831325,
968
+ "grad_norm": 0.7823063875533764,
969
+ "learning_rate": 1.7021966530414303e-06,
970
+ "loss": 0.0762,
971
+ "mean_token_accuracy": 0.9758411757647991,
972
  "num_tokens": 12794507.0,
973
  "step": 194
974
  },
975
  {
976
  "epoch": 3.8289156626506022,
977
+ "grad_norm": 0.571380544699335,
978
+ "learning_rate": 1.6003337011002928e-06,
979
+ "loss": 0.084,
980
+ "mean_token_accuracy": 0.9734901748597622,
981
  "num_tokens": 12925579.0,
982
  "step": 196
983
  },
984
  {
985
  "epoch": 3.8674698795180724,
986
+ "grad_norm": 0.5400258981871386,
987
+ "learning_rate": 1.5010299113841397e-06,
988
+ "loss": 0.0807,
989
+ "mean_token_accuracy": 0.9752305261790752,
990
  "num_tokens": 13056651.0,
991
  "step": 198
992
  },
993
  {
994
  "epoch": 3.906024096385542,
995
+ "grad_norm": 0.5204832843446408,
996
+ "learning_rate": 1.4043600366731213e-06,
997
+ "loss": 0.0821,
998
+ "mean_token_accuracy": 0.9745206460356712,
999
  "num_tokens": 13187723.0,
1000
  "step": 200
1001
  },
1002
  {
1003
  "epoch": 3.906024096385542,
1004
+ "eval_loss": 0.43459072709083557,
1005
+ "eval_mean_token_accuracy": 0.8980461002510285,
1006
  "eval_num_tokens": 13187723.0,
1007
+ "eval_runtime": 69.6812,
1008
+ "eval_samples_per_second": 12.27,
1009
+ "eval_steps_per_second": 1.536,
1010
  "step": 200
1011
  },
1012
  {
1013
  "epoch": 3.944578313253012,
1014
+ "grad_norm": 0.5732935867678565,
1015
+ "learning_rate": 1.3103968470187384e-06,
1016
+ "loss": 0.0841,
1017
+ "mean_token_accuracy": 0.973306454718113,
1018
  "num_tokens": 13317686.0,
1019
  "step": 202
1020
  },
1021
  {
1022
  "epoch": 3.983132530120482,
1023
+ "grad_norm": 0.5049593156468802,
1024
+ "learning_rate": 1.2192110749648233e-06,
1025
+ "loss": 0.0783,
1026
+ "mean_token_accuracy": 0.9752342775464058,
1027
  "num_tokens": 13447600.0,
1028
  "step": 204
1029
  },
1030
  {
1031
  "epoch": 4.03855421686747,
1032
+ "grad_norm": 0.4900616503984239,
1033
+ "learning_rate": 1.1308713623022988e-06,
1034
+ "loss": 0.1075,
1035
+ "mean_token_accuracy": 0.9786272644996643,
1036
  "num_tokens": 13611440.0,
1037
  "step": 206
1038
  },
1039
  {
1040
  "epoch": 4.0771084337349395,
1041
+ "grad_norm": 0.4917129834327916,
1042
+ "learning_rate": 1.045444208397791e-06,
1043
+ "loss": 0.0676,
1044
+ "mean_token_accuracy": 0.9801687188446522,
1045
  "num_tokens": 13740537.0,
1046
  "step": 208
1047
  },
1048
  {
1049
  "epoch": 4.11566265060241,
1050
+ "grad_norm": 0.47200516762524886,
1051
+ "learning_rate": 9.629939201349852e-07,
1052
+ "loss": 0.0723,
1053
+ "mean_token_accuracy": 0.9782837741076946,
1054
  "num_tokens": 13871609.0,
1055
  "step": 210
1056
  },
1057
  {
1058
  "epoch": 4.15421686746988,
1059
+ "grad_norm": 0.44277012092487705,
1060
+ "learning_rate": 8.835825635064266e-07,
1061
+ "loss": 0.0729,
1062
+ "mean_token_accuracy": 0.9780853129923344,
1063
  "num_tokens": 14002681.0,
1064
  "step": 212
1065
  },
1066
  {
1067
  "epoch": 4.192771084337349,
1068
+ "grad_norm": 0.4753962832603972,
1069
+ "learning_rate": 8.072699168921827e-07,
1070
+ "loss": 0.0749,
1071
+ "mean_token_accuracy": 0.9778944849967957,
1072
  "num_tokens": 14133753.0,
1073
  "step": 214
1074
  },
1075
  {
1076
  "epoch": 4.231325301204819,
1077
+ "grad_norm": 0.48346978347475456,
1078
+ "learning_rate": 7.341134260605537e-07,
1079
+ "loss": 0.0692,
1080
+ "mean_token_accuracy": 0.9793745614588261,
1081
  "num_tokens": 14264314.0,
1082
  "step": 216
1083
  },
1084
  {
1085
  "epoch": 4.2698795180722895,
1086
+ "grad_norm": 0.4328206037632282,
1087
+ "learning_rate": 6.641681609246981e-07,
1088
+ "loss": 0.066,
1089
+ "mean_token_accuracy": 0.9801309891045094,
1090
  "num_tokens": 14395386.0,
1091
  "step": 218
1092
  },
1093
  {
1094
  "epoch": 4.308433734939759,
1095
+ "grad_norm": 0.46221534542018206,
1096
+ "learning_rate": 5.974867740877282e-07,
1097
+ "loss": 0.0696,
1098
+ "mean_token_accuracy": 0.9789478555321693,
1099
  "num_tokens": 14526458.0,
1100
  "step": 220
1101
  },
1102
  {
1103
  "epoch": 4.308433734939759,
1104
+ "eval_loss": 0.4595886468887329,
1105
+ "eval_mean_token_accuracy": 0.897223442514366,
1106
  "eval_num_tokens": 14526458.0,
1107
+ "eval_runtime": 69.6441,
1108
+ "eval_samples_per_second": 12.277,
1109
+ "eval_steps_per_second": 1.536,
1110
  "step": 220
1111
  },
1112
  {
1113
  "epoch": 4.346987951807229,
1114
+ "grad_norm": 0.4739286679144528,
1115
+ "learning_rate": 5.341194612074824e-07,
1116
+ "loss": 0.068,
1117
+ "mean_token_accuracy": 0.9796868488192558,
1118
  "num_tokens": 14656421.0,
1119
  "step": 222
1120
  },
1121
  {
1122
  "epoch": 4.385542168674699,
1123
+ "grad_norm": 0.43096986690967987,
1124
+ "learning_rate": 4.7411392321080606e-07,
1125
+ "loss": 0.0663,
1126
+ "mean_token_accuracy": 0.9802683852612972,
1127
  "num_tokens": 14787493.0,
1128
  "step": 224
1129
  },
1130
  {
1131
  "epoch": 4.424096385542168,
1132
+ "grad_norm": 0.46557922408208563,
1133
+ "learning_rate": 4.175153303857887e-07,
1134
+ "loss": 0.0654,
1135
+ "mean_token_accuracy": 0.9804821126163006,
1136
  "num_tokens": 14918565.0,
1137
  "step": 226
1138
  },
1139
  {
1140
  "epoch": 4.462650602409639,
1141
+ "grad_norm": 0.5546707256189516,
1142
+ "learning_rate": 3.643662883789878e-07,
1143
+ "loss": 0.0673,
1144
+ "mean_token_accuracy": 0.979527972638607,
1145
  "num_tokens": 15049637.0,
1146
  "step": 228
1147
  },
1148
  {
1149
  "epoch": 4.501204819277109,
1150
+ "grad_norm": 0.49021519394663,
1151
+ "learning_rate": 3.1470680612323503e-07,
1152
+ "loss": 0.07,
1153
+ "mean_token_accuracy": 0.9785585664212704,
1154
  "num_tokens": 15180709.0,
1155
  "step": 230
1156
  },
1157
  {
1158
  "epoch": 4.539759036144578,
1159
+ "grad_norm": 0.45571708386475684,
1160
+ "learning_rate": 2.685742657201601e-07,
1161
+ "loss": 0.0697,
1162
+ "mean_token_accuracy": 0.9785204008221626,
1163
  "num_tokens": 15311781.0,
1164
  "step": 232
1165
  },
1166
  {
1167
  "epoch": 4.578313253012048,
1168
+ "grad_norm": 0.5641008416839415,
1169
+ "learning_rate": 2.260033943001244e-07,
1170
+ "loss": 0.0663,
1171
+ "mean_token_accuracy": 0.9797416999936104,
1172
  "num_tokens": 15442853.0,
1173
  "step": 234
1174
  },
1175
  {
1176
  "epoch": 4.6168674698795185,
1177
+ "grad_norm": 0.5607141029792978,
1178
+ "learning_rate": 1.8702623788072028e-07,
1179
+ "loss": 0.0793,
1180
+ "mean_token_accuracy": 0.9755663834512234,
1181
  "num_tokens": 15573925.0,
1182
  "step": 236
1183
  },
1184
  {
1185
  "epoch": 4.655421686746988,
1186
+ "grad_norm": 0.46095439859311127,
1187
+ "learning_rate": 1.5167213724353426e-07,
1188
+ "loss": 0.0714,
1189
+ "mean_token_accuracy": 0.9779479168355465,
1190
  "num_tokens": 15704997.0,
1191
  "step": 238
1192
  },
1193
  {
1194
  "epoch": 4.693975903614458,
1195
+ "grad_norm": 0.464368810663561,
1196
+ "learning_rate": 1.199677058473292e-07,
1197
+ "loss": 0.066,
1198
+ "mean_token_accuracy": 0.980153888463974,
1199
  "num_tokens": 15836069.0,
1200
  "step": 240
1201
  },
1202
  {
1203
  "epoch": 4.693975903614458,
1204
+ "eval_loss": 0.46903374791145325,
1205
+ "eval_mean_token_accuracy": 0.8968599628065234,
1206
  "eval_num_tokens": 15836069.0,
1207
+ "eval_runtime": 69.6558,
1208
+ "eval_samples_per_second": 12.275,
1209
+ "eval_steps_per_second": 1.536,
1210
  "step": 240
1211
  },
1212
  {
1213
  "epoch": 4.732530120481927,
1214
+ "grad_norm": 0.5162077757262011,
1215
+ "learning_rate": 9.193680979426189e-08,
1216
+ "loss": 0.0775,
1217
+ "mean_token_accuracy": 0.9764594584703445,
1218
  "num_tokens": 15967141.0,
1219
  "step": 242
1220
  },
1221
  {
1222
  "epoch": 4.771084337349397,
1223
+ "grad_norm": 0.4482450270539155,
1224
+ "learning_rate": 6.760054986423459e-08,
1225
+ "loss": 0.0632,
1226
+ "mean_token_accuracy": 0.9808179698884487,
1227
  "num_tokens": 16098213.0,
1228
  "step": 244
1229
  },
1230
  {
1231
  "epoch": 4.809638554216868,
1232
+ "grad_norm": 0.4698597407866022,
1233
+ "learning_rate": 4.697724563088646e-08,
1234
+ "loss": 0.0681,
1235
+ "mean_token_accuracy": 0.9797111675143242,
1236
  "num_tokens": 16229285.0,
1237
  "step": 246
1238
  },
1239
  {
1240
  "epoch": 4.848192771084337,
1241
+ "grad_norm": 0.4662674319978425,
1242
+ "learning_rate": 3.0082421671192576e-08,
1243
+ "loss": 0.0688,
1244
+ "mean_token_accuracy": 0.97944400832057,
1245
  "num_tokens": 16360357.0,
1246
  "step": 248
1247
  },
1248
  {
1249
  "epoch": 4.886746987951807,
1250
+ "grad_norm": 0.46327536754981147,
1251
+ "learning_rate": 1.692879587904983e-08,
1252
+ "loss": 0.0662,
1253
+ "mean_token_accuracy": 0.9799401611089706,
1254
  "num_tokens": 16491429.0,
1255
  "step": 250
1256
  },
1257
  {
1258
  "epoch": 4.925301204819277,
1259
+ "grad_norm": 0.4688691090714117,
1260
+ "learning_rate": 7.526269891646176e-09,
1261
+ "loss": 0.0642,
1262
+ "mean_token_accuracy": 0.9807046689093113,
1263
  "num_tokens": 16621960.0,
1264
  "step": 252
1265
  },
1266
  {
1267
  "epoch": 4.9638554216867465,
1268
+ "grad_norm": 0.4516057398304381,
1269
+ "learning_rate": 1.8819216358156865e-09,
1270
+ "loss": 0.0688,
1271
+ "mean_token_accuracy": 0.9792744368314743,
1272
  "num_tokens": 16752156.0,
1273
  "step": 254
1274
  },
1275
  {
1276
  "epoch": 4.983132530120482,
1277
+ "mean_token_accuracy": 0.976367861032486,
1278
  "num_tokens": 16817692.0,
1279
  "step": 255,
1280
  "total_flos": 24409842647040.0,
1281
+ "train_loss": 0.12274208276295194,
1282
+ "train_runtime": 3782.9235,
1283
  "train_samples_per_second": 2.194,
1284
  "train_steps_per_second": 0.067
1285
  }