li-muyang commited on
Commit
9c62a22
·
verified ·
1 Parent(s): 9cfdd98

Model save

Browse files
Files changed (4) hide show
  1. README.md +13 -13
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +477 -477
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
  library_name: transformers
3
  license: apache-2.0
4
- base_model: mistralai/Mistral-7B-v0.1
5
  tags:
6
  - trl
7
  - sft
@@ -18,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  # zephyr-7b-sft-full
20
 
21
- This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 0.9423
24
 
25
  ## Model description
26
 
@@ -57,16 +57,16 @@ The following hyperparameters were used during training:
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
- | 1.0186 | 0.0923 | 100 | 1.0212 |
61
- | 1.029 | 0.1846 | 200 | 1.0410 |
62
- | 1.0367 | 0.2769 | 300 | 1.0391 |
63
- | 1.0094 | 0.3692 | 400 | 1.0263 |
64
- | 1.0163 | 0.4615 | 500 | 1.0116 |
65
- | 0.9715 | 0.5538 | 600 | 0.9919 |
66
- | 0.9408 | 0.6461 | 700 | 0.9743 |
67
- | 0.925 | 0.7383 | 800 | 0.9587 |
68
- | 0.936 | 0.8306 | 900 | 0.9477 |
69
- | 0.9192 | 0.9229 | 1000 | 0.9423 |
70
 
71
 
72
  ### Framework versions
 
1
  ---
2
  library_name: transformers
3
  license: apache-2.0
4
+ base_model: mistralai/Mistral-7B-v0.3
5
  tags:
6
  - trl
7
  - sft
 
18
 
19
  # zephyr-7b-sft-full
20
 
21
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 0.9448
24
 
25
  ## Model description
26
 
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
+ | 1.025 | 0.0923 | 100 | 1.0240 |
61
+ | 1.033 | 0.1846 | 200 | 1.0464 |
62
+ | 1.037 | 0.2769 | 300 | 1.0424 |
63
+ | 1.0136 | 0.3692 | 400 | 1.0295 |
64
+ | 1.0229 | 0.4615 | 500 | 1.0151 |
65
+ | 0.9745 | 0.5538 | 600 | 0.9945 |
66
+ | 0.9441 | 0.6461 | 700 | 0.9769 |
67
+ | 0.9277 | 0.7383 | 800 | 0.9613 |
68
+ | 0.9384 | 0.8306 | 900 | 0.9501 |
69
+ | 0.9216 | 0.9229 | 1000 | 0.9448 |
70
 
71
 
72
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9995385325334564,
3
  "total_flos": 453306954547200.0,
4
- "train_loss": 0.9835369018966802,
5
- "train_runtime": 35743.0085,
6
  "train_samples": 207864,
7
- "train_samples_per_second": 3.88,
8
- "train_steps_per_second": 0.03
9
  }
 
1
  {
2
  "epoch": 0.9995385325334564,
3
  "total_flos": 453306954547200.0,
4
+ "train_loss": 0.986508995762382,
5
+ "train_runtime": 33955.1767,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 4.084,
8
+ "train_steps_per_second": 0.032
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9995385325334564,
3
  "total_flos": 453306954547200.0,
4
- "train_loss": 0.9835369018966802,
5
- "train_runtime": 35743.0085,
6
  "train_samples": 207864,
7
- "train_samples_per_second": 3.88,
8
- "train_steps_per_second": 0.03
9
  }
 
1
  {
2
  "epoch": 0.9995385325334564,
3
  "total_flos": 453306954547200.0,
4
+ "train_loss": 0.986508995762382,
5
+ "train_runtime": 33955.1767,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 4.084,
8
+ "train_steps_per_second": 0.032
9
  }
trainer_state.json CHANGED
@@ -10,1611 +10,1611 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0009229349330872173,
13
- "grad_norm": 9.22159860236364,
14
  "learning_rate": 1.8348623853211012e-07,
15
- "loss": 1.1391,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.0046146746654360865,
20
- "grad_norm": 7.244567353076034,
21
  "learning_rate": 9.174311926605506e-07,
22
- "loss": 1.1291,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.009229349330872173,
27
- "grad_norm": 4.141715353193002,
28
  "learning_rate": 1.8348623853211011e-06,
29
- "loss": 1.0628,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.01384402399630826,
34
- "grad_norm": 2.6770593588546556,
35
  "learning_rate": 2.7522935779816517e-06,
36
- "loss": 1.0101,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.018458698661744346,
41
- "grad_norm": 2.517095227909403,
42
  "learning_rate": 3.6697247706422022e-06,
43
- "loss": 1.0075,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.023073373327180433,
48
- "grad_norm": 1.959044583294977,
49
  "learning_rate": 4.587155963302753e-06,
50
- "loss": 1.0002,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.02768804799261652,
55
- "grad_norm": 2.278653882410259,
56
  "learning_rate": 5.504587155963303e-06,
57
- "loss": 0.9701,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.032302722658052604,
62
- "grad_norm": 2.1854662026419565,
63
  "learning_rate": 6.422018348623854e-06,
64
- "loss": 0.9961,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.03691739732348869,
69
- "grad_norm": 2.703047495193763,
70
  "learning_rate": 7.3394495412844045e-06,
71
- "loss": 1.0028,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.04153207198892478,
76
- "grad_norm": 2.0051793390039445,
77
  "learning_rate": 8.256880733944956e-06,
78
- "loss": 0.9837,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.046146746654360866,
83
- "grad_norm": 3.110213357879861,
84
  "learning_rate": 9.174311926605506e-06,
85
- "loss": 0.9801,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.050761421319796954,
90
- "grad_norm": 2.075271224694112,
91
  "learning_rate": 1.0091743119266055e-05,
92
- "loss": 0.9828,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.05537609598523304,
97
- "grad_norm": 2.1623283709662813,
98
  "learning_rate": 1.1009174311926607e-05,
99
- "loss": 0.9956,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.05999077065066913,
104
- "grad_norm": 2.8186018424693513,
105
  "learning_rate": 1.1926605504587156e-05,
106
- "loss": 1.0,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.06460544531610521,
111
- "grad_norm": 2.697088593929761,
112
  "learning_rate": 1.2844036697247708e-05,
113
- "loss": 0.9785,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.0692201199815413,
118
- "grad_norm": 2.282445866061795,
119
  "learning_rate": 1.3761467889908258e-05,
120
- "loss": 0.9874,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.07383479464697738,
125
- "grad_norm": 2.3609900670130375,
126
  "learning_rate": 1.4678899082568809e-05,
127
- "loss": 0.9815,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.07844946931241348,
132
- "grad_norm": 2.2736638273293015,
133
  "learning_rate": 1.559633027522936e-05,
134
- "loss": 1.0052,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.08306414397784956,
139
- "grad_norm": 2.1184675412706575,
140
  "learning_rate": 1.6513761467889912e-05,
141
- "loss": 1.0101,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.08767881864328565,
146
- "grad_norm": 2.3650428823693317,
147
  "learning_rate": 1.743119266055046e-05,
148
- "loss": 1.0132,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.09229349330872173,
153
- "grad_norm": 2.6519058675176015,
154
  "learning_rate": 1.834862385321101e-05,
155
- "loss": 1.0186,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.09229349330872173,
160
- "eval_loss": 1.021215558052063,
161
- "eval_runtime": 665.2749,
162
- "eval_samples_per_second": 23.073,
163
- "eval_steps_per_second": 0.18,
164
  "step": 100
165
  },
166
  {
167
  "epoch": 0.09690816797415783,
168
- "grad_norm": 2.2491240926501317,
169
  "learning_rate": 1.9266055045871563e-05,
170
- "loss": 1.0463,
171
  "step": 105
172
  },
173
  {
174
  "epoch": 0.10152284263959391,
175
- "grad_norm": 1.921393633511697,
176
  "learning_rate": 1.9999947982262415e-05,
177
- "loss": 1.0296,
178
  "step": 110
179
  },
180
  {
181
  "epoch": 0.10613751730503,
182
- "grad_norm": 2.0975875410987648,
183
  "learning_rate": 1.9998127418269004e-05,
184
- "loss": 1.0416,
185
  "step": 115
186
  },
187
  {
188
  "epoch": 0.11075219197046608,
189
- "grad_norm": 1.980566933294205,
190
  "learning_rate": 1.9993706508539968e-05,
191
- "loss": 1.0222,
192
  "step": 120
193
  },
194
  {
195
  "epoch": 0.11536686663590216,
196
- "grad_norm": 1.9186102149566504,
197
  "learning_rate": 1.998668640288e-05,
198
- "loss": 1.0473,
199
  "step": 125
200
  },
201
  {
202
  "epoch": 0.11998154130133826,
203
- "grad_norm": 2.0850542458886947,
204
  "learning_rate": 1.997706892710117e-05,
205
- "loss": 1.0191,
206
  "step": 130
207
  },
208
  {
209
  "epoch": 0.12459621596677434,
210
- "grad_norm": 2.0433211031137146,
211
  "learning_rate": 1.9964856582548094e-05,
212
- "loss": 1.051,
213
  "step": 135
214
  },
215
  {
216
  "epoch": 0.12921089063221042,
217
- "grad_norm": 1.7777153775315244,
218
  "learning_rate": 1.9950052545447354e-05,
219
- "loss": 1.0462,
220
  "step": 140
221
  },
222
  {
223
  "epoch": 0.13382556529764653,
224
- "grad_norm": 1.8240694233739712,
225
  "learning_rate": 1.993266066608142e-05,
226
- "loss": 1.0245,
227
  "step": 145
228
  },
229
  {
230
  "epoch": 0.1384402399630826,
231
- "grad_norm": 1.8705378942424304,
232
  "learning_rate": 1.991268546778726e-05,
233
- "loss": 1.012,
234
  "step": 150
235
  },
236
  {
237
  "epoch": 0.1430549146285187,
238
- "grad_norm": 1.7172542543301377,
239
  "learning_rate": 1.9890132145779885e-05,
240
- "loss": 1.0642,
241
  "step": 155
242
  },
243
  {
244
  "epoch": 0.14766958929395477,
245
- "grad_norm": 1.9516254525505181,
246
  "learning_rate": 1.986500656580118e-05,
247
- "loss": 1.024,
248
  "step": 160
249
  },
250
  {
251
  "epoch": 0.15228426395939088,
252
- "grad_norm": 2.3004163573765686,
253
  "learning_rate": 1.9837315262594307e-05,
254
- "loss": 1.033,
255
  "step": 165
256
  },
257
  {
258
  "epoch": 0.15689893862482696,
259
- "grad_norm": 2.147336782856912,
260
  "learning_rate": 1.980706543820412e-05,
261
  "loss": 1.0367,
262
  "step": 170
263
  },
264
  {
265
  "epoch": 0.16151361329026304,
266
- "grad_norm": 2.1046272403642505,
267
  "learning_rate": 1.9774264960104056e-05,
268
- "loss": 1.0195,
269
  "step": 175
270
  },
271
  {
272
  "epoch": 0.16612828795569912,
273
- "grad_norm": 2.036505434534158,
274
  "learning_rate": 1.9738922359149927e-05,
275
- "loss": 1.0309,
276
  "step": 180
277
  },
278
  {
279
  "epoch": 0.1707429626211352,
280
- "grad_norm": 1.800781655290288,
281
  "learning_rate": 1.9701046827361175e-05,
282
- "loss": 1.0336,
283
  "step": 185
284
  },
285
  {
286
  "epoch": 0.1753576372865713,
287
- "grad_norm": 1.693559480289246,
288
  "learning_rate": 1.9660648215530207e-05,
289
- "loss": 1.019,
290
  "step": 190
291
  },
292
  {
293
  "epoch": 0.17997231195200739,
294
- "grad_norm": 1.7602561596511241,
295
  "learning_rate": 1.9617737030660338e-05,
296
- "loss": 1.0257,
297
  "step": 195
298
  },
299
  {
300
  "epoch": 0.18458698661744347,
301
- "grad_norm": 1.596017436906569,
302
  "learning_rate": 1.9572324433233122e-05,
303
- "loss": 1.029,
304
  "step": 200
305
  },
306
  {
307
  "epoch": 0.18458698661744347,
308
- "eval_loss": 1.0409572124481201,
309
- "eval_runtime": 659.1728,
310
- "eval_samples_per_second": 23.287,
311
- "eval_steps_per_second": 0.182,
312
  "step": 200
313
  },
314
  {
315
  "epoch": 0.18920166128287955,
316
- "grad_norm": 1.732016322184742,
317
  "learning_rate": 1.9524422234305677e-05,
318
- "loss": 1.0222,
319
  "step": 205
320
  },
321
  {
322
  "epoch": 0.19381633594831565,
323
- "grad_norm": 1.754292517288075,
324
  "learning_rate": 1.9474042892438848e-05,
325
- "loss": 1.0064,
326
  "step": 210
327
  },
328
  {
329
  "epoch": 0.19843101061375173,
330
- "grad_norm": 1.5853431903204918,
331
  "learning_rate": 1.942119951045692e-05,
332
- "loss": 1.0193,
333
  "step": 215
334
  },
335
  {
336
  "epoch": 0.20304568527918782,
337
- "grad_norm": 1.7537946224731296,
338
  "learning_rate": 1.9365905832039814e-05,
339
- "loss": 1.0401,
340
  "step": 220
341
  },
342
  {
343
  "epoch": 0.2076603599446239,
344
- "grad_norm": 1.5570957640931664,
345
  "learning_rate": 1.9308176238148565e-05,
346
- "loss": 1.0463,
347
  "step": 225
348
  },
349
  {
350
  "epoch": 0.21227503461006,
351
- "grad_norm": 1.8510932168688097,
352
  "learning_rate": 1.924802574328509e-05,
353
- "loss": 1.026,
354
  "step": 230
355
  },
356
  {
357
  "epoch": 0.21688970927549608,
358
- "grad_norm": 1.8345788195665518,
359
  "learning_rate": 1.9185469991587166e-05,
360
- "loss": 1.0444,
361
  "step": 235
362
  },
363
  {
364
  "epoch": 0.22150438394093216,
365
- "grad_norm": 1.558823481306957,
366
  "learning_rate": 1.912052525275965e-05,
367
- "loss": 1.0327,
368
  "step": 240
369
  },
370
  {
371
  "epoch": 0.22611905860636825,
372
- "grad_norm": 1.7955068678391877,
373
  "learning_rate": 1.905320841784298e-05,
374
- "loss": 1.0328,
375
  "step": 245
376
  },
377
  {
378
  "epoch": 0.23073373327180433,
379
- "grad_norm": 1.572619634284493,
380
  "learning_rate": 1.898353699482014e-05,
381
- "loss": 1.031,
382
  "step": 250
383
  },
384
  {
385
  "epoch": 0.23534840793724043,
386
- "grad_norm": 1.6236474418730058,
387
  "learning_rate": 1.8911529104063093e-05,
388
- "loss": 1.0371,
389
  "step": 255
390
  },
391
  {
392
  "epoch": 0.23996308260267651,
393
- "grad_norm": 1.4717263075966749,
394
  "learning_rate": 1.8837203473619978e-05,
395
- "loss": 1.041,
396
  "step": 260
397
  },
398
  {
399
  "epoch": 0.2445777572681126,
400
- "grad_norm": 1.6756793292053787,
401
  "learning_rate": 1.8760579434344283e-05,
402
- "loss": 1.0469,
403
  "step": 265
404
  },
405
  {
406
  "epoch": 0.24919243193354867,
407
- "grad_norm": 1.6409196589048798,
408
  "learning_rate": 1.8681676914867176e-05,
409
- "loss": 1.0468,
410
  "step": 270
411
  },
412
  {
413
  "epoch": 0.25380710659898476,
414
- "grad_norm": 1.9407976084394043,
415
  "learning_rate": 1.860051643641443e-05,
416
- "loss": 1.0307,
417
  "step": 275
418
  },
419
  {
420
  "epoch": 0.25842178126442084,
421
- "grad_norm": 1.7250648731233242,
422
  "learning_rate": 1.8517119107469194e-05,
423
- "loss": 1.0407,
424
  "step": 280
425
  },
426
  {
427
  "epoch": 0.26303645592985697,
428
- "grad_norm": 1.5322925665538232,
429
  "learning_rate": 1.8431506618282e-05,
430
- "loss": 1.0297,
431
  "step": 285
432
  },
433
  {
434
  "epoch": 0.26765113059529305,
435
- "grad_norm": 1.6904457033651568,
436
  "learning_rate": 1.834370123522954e-05,
437
- "loss": 1.0419,
438
  "step": 290
439
  },
440
  {
441
  "epoch": 0.27226580526072913,
442
- "grad_norm": 1.5941511416866747,
443
  "learning_rate": 1.8253725795023504e-05,
444
- "loss": 1.0291,
445
  "step": 295
446
  },
447
  {
448
  "epoch": 0.2768804799261652,
449
- "grad_norm": 1.6111251284693955,
450
  "learning_rate": 1.816160369877117e-05,
451
- "loss": 1.0367,
452
  "step": 300
453
  },
454
  {
455
  "epoch": 0.2768804799261652,
456
- "eval_loss": 1.0390825271606445,
457
- "eval_runtime": 631.9337,
458
- "eval_samples_per_second": 24.291,
459
- "eval_steps_per_second": 0.19,
460
  "step": 300
461
  },
462
  {
463
  "epoch": 0.2814951545916013,
464
- "grad_norm": 1.5577447669962958,
465
  "learning_rate": 1.8067358905889148e-05,
466
- "loss": 1.0076,
467
  "step": 305
468
  },
469
  {
470
  "epoch": 0.2861098292570374,
471
- "grad_norm": 1.5744013062583462,
472
  "learning_rate": 1.797101592787194e-05,
473
- "loss": 1.0298,
474
  "step": 310
475
  },
476
  {
477
  "epoch": 0.29072450392247345,
478
- "grad_norm": 1.5662295967375852,
479
  "learning_rate": 1.7872599821916922e-05,
480
- "loss": 1.0219,
481
  "step": 315
482
  },
483
  {
484
  "epoch": 0.29533917858790953,
485
- "grad_norm": 1.6786923569192709,
486
  "learning_rate": 1.7772136184407367e-05,
487
- "loss": 1.0235,
488
  "step": 320
489
  },
490
  {
491
  "epoch": 0.2999538532533456,
492
- "grad_norm": 1.5619445773068321,
493
  "learning_rate": 1.7669651144255265e-05,
494
- "loss": 1.0336,
495
  "step": 325
496
  },
497
  {
498
  "epoch": 0.30456852791878175,
499
- "grad_norm": 1.6094241012251198,
500
  "learning_rate": 1.7565171356105627e-05,
501
- "loss": 1.0442,
502
  "step": 330
503
  },
504
  {
505
  "epoch": 0.30918320258421783,
506
- "grad_norm": 2.0012505433561683,
507
  "learning_rate": 1.7458723993404065e-05,
508
- "loss": 1.0388,
509
  "step": 335
510
  },
511
  {
512
  "epoch": 0.3137978772496539,
513
- "grad_norm": 1.731680358218969,
514
  "learning_rate": 1.7350336741329413e-05,
515
- "loss": 1.0302,
516
  "step": 340
517
  },
518
  {
519
  "epoch": 0.31841255191509,
520
- "grad_norm": 1.5756520751714067,
521
  "learning_rate": 1.7240037789593307e-05,
522
- "loss": 1.0388,
523
  "step": 345
524
  },
525
  {
526
  "epoch": 0.3230272265805261,
527
- "grad_norm": 1.598908263014295,
528
  "learning_rate": 1.712785582510848e-05,
529
- "loss": 1.0119,
530
  "step": 350
531
  },
532
  {
533
  "epoch": 0.32764190124596215,
534
- "grad_norm": 1.5588179492774312,
535
  "learning_rate": 1.70138200245278e-05,
536
- "loss": 1.0235,
537
  "step": 355
538
  },
539
  {
540
  "epoch": 0.33225657591139823,
541
- "grad_norm": 2.719544835233645,
542
  "learning_rate": 1.6897960046655886e-05,
543
- "loss": 1.0291,
544
  "step": 360
545
  },
546
  {
547
  "epoch": 0.3368712505768343,
548
- "grad_norm": 1.7224906132772548,
549
  "learning_rate": 1.6780306024735384e-05,
550
- "loss": 1.0193,
551
  "step": 365
552
  },
553
  {
554
  "epoch": 0.3414859252422704,
555
- "grad_norm": 1.5806127742561251,
556
  "learning_rate": 1.6660888558609774e-05,
557
- "loss": 1.042,
558
  "step": 370
559
  },
560
  {
561
  "epoch": 0.34610059990770653,
562
- "grad_norm": 1.568089679819643,
563
  "learning_rate": 1.6539738706764895e-05,
564
- "loss": 1.025,
565
  "step": 375
566
  },
567
  {
568
  "epoch": 0.3507152745731426,
569
- "grad_norm": 1.5475613908827464,
570
  "learning_rate": 1.6416887978251134e-05,
571
- "loss": 1.0179,
572
  "step": 380
573
  },
574
  {
575
  "epoch": 0.3553299492385787,
576
- "grad_norm": 1.5281204414406897,
577
  "learning_rate": 1.6292368324488462e-05,
578
- "loss": 0.9986,
579
  "step": 385
580
  },
581
  {
582
  "epoch": 0.35994462390401477,
583
- "grad_norm": 1.5385364331546185,
584
  "learning_rate": 1.6166212130956383e-05,
585
- "loss": 1.0075,
586
  "step": 390
587
  },
588
  {
589
  "epoch": 0.36455929856945085,
590
- "grad_norm": 1.5354876751143718,
591
  "learning_rate": 1.6038452208771037e-05,
592
- "loss": 1.0294,
593
  "step": 395
594
  },
595
  {
596
  "epoch": 0.36917397323488693,
597
- "grad_norm": 1.4603598177059978,
598
  "learning_rate": 1.590912178615157e-05,
599
- "loss": 1.0094,
600
  "step": 400
601
  },
602
  {
603
  "epoch": 0.36917397323488693,
604
- "eval_loss": 1.0263434648513794,
605
- "eval_runtime": 690.0701,
606
- "eval_samples_per_second": 22.244,
607
- "eval_steps_per_second": 0.174,
608
  "step": 400
609
  },
610
  {
611
  "epoch": 0.373788647900323,
612
- "grad_norm": 1.665640112517328,
613
  "learning_rate": 1.5778254499778006e-05,
614
- "loss": 1.0159,
615
  "step": 405
616
  },
617
  {
618
  "epoch": 0.3784033225657591,
619
- "grad_norm": 1.4667601213154255,
620
  "learning_rate": 1.564588438604296e-05,
621
- "loss": 1.0047,
622
  "step": 410
623
  },
624
  {
625
  "epoch": 0.3830179972311952,
626
- "grad_norm": 1.45174399572694,
627
  "learning_rate": 1.551204587219928e-05,
628
- "loss": 0.9944,
629
  "step": 415
630
  },
631
  {
632
  "epoch": 0.3876326718966313,
633
- "grad_norm": 1.5137353156884679,
634
  "learning_rate": 1.5376773767406142e-05,
635
- "loss": 1.0358,
636
  "step": 420
637
  },
638
  {
639
  "epoch": 0.3922473465620674,
640
- "grad_norm": 1.427796298190285,
641
  "learning_rate": 1.5240103253675756e-05,
642
- "loss": 1.0063,
643
  "step": 425
644
  },
645
  {
646
  "epoch": 0.39686202122750347,
647
- "grad_norm": 1.7132551479225844,
648
  "learning_rate": 1.51020698767231e-05,
649
- "loss": 1.0139,
650
  "step": 430
651
  },
652
  {
653
  "epoch": 0.40147669589293955,
654
- "grad_norm": 1.365792024614782,
655
  "learning_rate": 1.4962709536721087e-05,
656
- "loss": 0.9946,
657
  "step": 435
658
  },
659
  {
660
  "epoch": 0.40609137055837563,
661
- "grad_norm": 1.4603319638367434,
662
  "learning_rate": 1.4822058478963532e-05,
663
- "loss": 1.0109,
664
  "step": 440
665
  },
666
  {
667
  "epoch": 0.4107060452238117,
668
- "grad_norm": 1.5104072479651949,
669
  "learning_rate": 1.4680153284438345e-05,
670
- "loss": 1.0079,
671
  "step": 445
672
  },
673
  {
674
  "epoch": 0.4153207198892478,
675
- "grad_norm": 1.5003898869030268,
676
  "learning_rate": 1.4537030860313443e-05,
677
- "loss": 1.017,
678
  "step": 450
679
  },
680
  {
681
  "epoch": 0.41993539455468387,
682
- "grad_norm": 1.5143550375677852,
683
  "learning_rate": 1.4392728430337801e-05,
684
- "loss": 0.9938,
685
  "step": 455
686
  },
687
  {
688
  "epoch": 0.42455006922012,
689
- "grad_norm": 1.4973860738319764,
690
  "learning_rate": 1.4247283525160178e-05,
691
- "loss": 0.9947,
692
  "step": 460
693
  },
694
  {
695
  "epoch": 0.4291647438855561,
696
- "grad_norm": 1.46765520104305,
697
  "learning_rate": 1.4100733972568038e-05,
698
- "loss": 1.0063,
699
  "step": 465
700
  },
701
  {
702
  "epoch": 0.43377941855099217,
703
- "grad_norm": 1.502671999263143,
704
  "learning_rate": 1.3953117887649153e-05,
705
- "loss": 1.0191,
706
  "step": 470
707
  },
708
  {
709
  "epoch": 0.43839409321642825,
710
- "grad_norm": 1.483874282862377,
711
  "learning_rate": 1.3804473662878519e-05,
712
- "loss": 1.0137,
713
  "step": 475
714
  },
715
  {
716
  "epoch": 0.44300876788186433,
717
- "grad_norm": 1.461904130124467,
718
  "learning_rate": 1.3654839958133118e-05,
719
- "loss": 1.0003,
720
  "step": 480
721
  },
722
  {
723
  "epoch": 0.4476234425473004,
724
- "grad_norm": 1.4713025735431442,
725
  "learning_rate": 1.3504255690637122e-05,
726
- "loss": 1.0186,
727
  "step": 485
728
  },
729
  {
730
  "epoch": 0.4522381172127365,
731
- "grad_norm": 1.537354618893577,
732
  "learning_rate": 1.3352760024840174e-05,
733
- "loss": 0.9941,
734
  "step": 490
735
  },
736
  {
737
  "epoch": 0.45685279187817257,
738
- "grad_norm": 1.4600444670015191,
739
  "learning_rate": 1.3200392362231385e-05,
740
- "loss": 0.9828,
741
  "step": 495
742
  },
743
  {
744
  "epoch": 0.46146746654360865,
745
- "grad_norm": 1.5367214120380763,
746
  "learning_rate": 1.3047192331091636e-05,
747
- "loss": 1.0163,
748
  "step": 500
749
  },
750
  {
751
  "epoch": 0.46146746654360865,
752
- "eval_loss": 1.0115509033203125,
753
- "eval_runtime": 629.8272,
754
- "eval_samples_per_second": 24.372,
755
- "eval_steps_per_second": 0.191,
756
  "step": 500
757
  },
758
  {
759
  "epoch": 0.4660821412090448,
760
- "grad_norm": 1.4574927964092395,
761
  "learning_rate": 1.2893199776186957e-05,
762
- "loss": 0.991,
763
  "step": 505
764
  },
765
  {
766
  "epoch": 0.47069681587448087,
767
- "grad_norm": 1.5417624187440955,
768
  "learning_rate": 1.2738454748405552e-05,
769
- "loss": 1.0085,
770
  "step": 510
771
  },
772
  {
773
  "epoch": 0.47531149053991695,
774
- "grad_norm": 1.410519905571099,
775
  "learning_rate": 1.258299749434123e-05,
776
- "loss": 1.0112,
777
  "step": 515
778
  },
779
  {
780
  "epoch": 0.47992616520535303,
781
- "grad_norm": 1.452159940025483,
782
  "learning_rate": 1.2426868445825955e-05,
783
- "loss": 0.9999,
784
  "step": 520
785
  },
786
  {
787
  "epoch": 0.4845408398707891,
788
- "grad_norm": 1.474028826882843,
789
  "learning_rate": 1.2270108209414186e-05,
790
- "loss": 0.981,
791
  "step": 525
792
  },
793
  {
794
  "epoch": 0.4891555145362252,
795
- "grad_norm": 1.4384321957893453,
796
  "learning_rate": 1.2112757555821796e-05,
797
- "loss": 0.9938,
798
  "step": 530
799
  },
800
  {
801
  "epoch": 0.49377018920166127,
802
- "grad_norm": 1.4588174941974965,
803
  "learning_rate": 1.1954857409322302e-05,
804
- "loss": 0.9779,
805
  "step": 535
806
  },
807
  {
808
  "epoch": 0.49838486386709735,
809
- "grad_norm": 1.477386259114682,
810
  "learning_rate": 1.179644883710313e-05,
811
- "loss": 0.9926,
812
  "step": 540
813
  },
814
  {
815
  "epoch": 0.5029995385325334,
816
- "grad_norm": 1.4064276225459815,
817
  "learning_rate": 1.1637573038584729e-05,
818
- "loss": 0.9819,
819
  "step": 545
820
  },
821
  {
822
  "epoch": 0.5076142131979695,
823
- "grad_norm": 1.562831675658506,
824
  "learning_rate": 1.1478271334705302e-05,
825
- "loss": 0.9879,
826
  "step": 550
827
  },
828
  {
829
  "epoch": 0.5122288878634056,
830
- "grad_norm": 1.4842942517557047,
831
  "learning_rate": 1.1318585157173913e-05,
832
- "loss": 0.9941,
833
  "step": 555
834
  },
835
  {
836
  "epoch": 0.5168435625288417,
837
- "grad_norm": 1.4711158924165912,
838
  "learning_rate": 1.115855603769479e-05,
839
- "loss": 0.9847,
840
  "step": 560
841
  },
842
  {
843
  "epoch": 0.5214582371942778,
844
- "grad_norm": 1.4142173667096958,
845
  "learning_rate": 1.0998225597165628e-05,
846
- "loss": 0.9812,
847
  "step": 565
848
  },
849
  {
850
  "epoch": 0.5260729118597139,
851
- "grad_norm": 1.928841770349413,
852
  "learning_rate": 1.0837635534852687e-05,
853
- "loss": 0.9804,
854
  "step": 570
855
  },
856
  {
857
  "epoch": 0.53068758652515,
858
- "grad_norm": 1.4609059672802447,
859
  "learning_rate": 1.0676827617545511e-05,
860
- "loss": 0.9784,
861
  "step": 575
862
  },
863
  {
864
  "epoch": 0.5353022611905861,
865
- "grad_norm": 1.4158177675520824,
866
  "learning_rate": 1.0515843668694087e-05,
867
- "loss": 0.9762,
868
  "step": 580
869
  },
870
  {
871
  "epoch": 0.5399169358560222,
872
- "grad_norm": 1.4673507154635572,
873
  "learning_rate": 1.0354725557531258e-05,
874
- "loss": 0.9747,
875
  "step": 585
876
  },
877
  {
878
  "epoch": 0.5445316105214583,
879
- "grad_norm": 1.3571129755048559,
880
  "learning_rate": 1.0193515188183246e-05,
881
- "loss": 0.9657,
882
  "step": 590
883
  },
884
  {
885
  "epoch": 0.5491462851868943,
886
- "grad_norm": 1.4126105366024362,
887
  "learning_rate": 1.003225448877108e-05,
888
- "loss": 0.9941,
889
  "step": 595
890
  },
891
  {
892
  "epoch": 0.5537609598523304,
893
- "grad_norm": 1.3691646139951152,
894
  "learning_rate": 9.870985400505805e-06,
895
- "loss": 0.9715,
896
  "step": 600
897
  },
898
  {
899
  "epoch": 0.5537609598523304,
900
- "eval_loss": 0.9918625950813293,
901
- "eval_runtime": 658.0431,
902
- "eval_samples_per_second": 23.327,
903
- "eval_steps_per_second": 0.182,
904
  "step": 600
905
  },
906
  {
907
  "epoch": 0.5583756345177665,
908
- "grad_norm": 1.420929026631665,
909
  "learning_rate": 9.709749866780248e-06,
910
- "loss": 0.978,
911
  "step": 605
912
  },
913
  {
914
  "epoch": 0.5629903091832026,
915
- "grad_norm": 1.4531326195389616,
916
  "learning_rate": 9.548589822260281e-06,
917
- "loss": 0.9875,
918
  "step": 610
919
  },
920
  {
921
  "epoch": 0.5676049838486387,
922
- "grad_norm": 1.3776913328884506,
923
  "learning_rate": 9.387547181978291e-06,
924
- "loss": 0.9722,
925
  "step": 615
926
  },
927
  {
928
  "epoch": 0.5722196585140747,
929
- "grad_norm": 1.367969972697207,
930
  "learning_rate": 9.226663830431777e-06,
931
- "loss": 0.98,
932
  "step": 620
933
  },
934
  {
935
  "epoch": 0.5768343331795108,
936
- "grad_norm": 1.3562504249301128,
937
  "learning_rate": 9.065981610689915e-06,
938
- "loss": 0.9679,
939
  "step": 625
940
  },
941
  {
942
  "epoch": 0.5814490078449469,
943
- "grad_norm": 1.3687377076518379,
944
  "learning_rate": 8.905542313510846e-06,
945
- "loss": 0.9773,
946
  "step": 630
947
  },
948
  {
949
  "epoch": 0.586063682510383,
950
- "grad_norm": 1.4723781099911786,
951
  "learning_rate": 8.745387666472639e-06,
952
- "loss": 0.9851,
953
  "step": 635
954
  },
955
  {
956
  "epoch": 0.5906783571758191,
957
- "grad_norm": 1.4217076682395915,
958
  "learning_rate": 8.58555932312059e-06,
959
- "loss": 0.9864,
960
  "step": 640
961
  },
962
  {
963
  "epoch": 0.5952930318412551,
964
- "grad_norm": 1.410095405895401,
965
  "learning_rate": 8.426098852133892e-06,
966
- "loss": 0.9607,
967
  "step": 645
968
  },
969
  {
970
  "epoch": 0.5999077065066912,
971
- "grad_norm": 1.3772130152565503,
972
  "learning_rate": 8.267047726514278e-06,
973
- "loss": 0.9786,
974
  "step": 650
975
  },
976
  {
977
  "epoch": 0.6045223811721273,
978
- "grad_norm": 1.3660074782209246,
979
  "learning_rate": 8.108447312799588e-06,
980
- "loss": 0.9701,
981
  "step": 655
982
  },
983
  {
984
  "epoch": 0.6091370558375635,
985
- "grad_norm": 1.4470086526766173,
986
  "learning_rate": 7.950338860305049e-06,
987
- "loss": 0.9605,
988
  "step": 660
989
  },
990
  {
991
  "epoch": 0.6137517305029996,
992
- "grad_norm": 1.3171246011913702,
993
  "learning_rate": 7.792763490394983e-06,
994
- "loss": 0.9704,
995
  "step": 665
996
  },
997
  {
998
  "epoch": 0.6183664051684357,
999
- "grad_norm": 1.3411633903465385,
1000
  "learning_rate": 7.635762185787868e-06,
1001
- "loss": 0.9743,
1002
  "step": 670
1003
  },
1004
  {
1005
  "epoch": 0.6229810798338717,
1006
- "grad_norm": 1.3318464659750435,
1007
  "learning_rate": 7.479375779897379e-06,
1008
- "loss": 0.9716,
1009
  "step": 675
1010
  },
1011
  {
1012
  "epoch": 0.6275957544993078,
1013
- "grad_norm": 1.3843707717527398,
1014
  "learning_rate": 7.3236449462123315e-06,
1015
- "loss": 0.9651,
1016
  "step": 680
1017
  },
1018
  {
1019
  "epoch": 0.6322104291647439,
1020
- "grad_norm": 1.4219024896409418,
1021
  "learning_rate": 7.168610187718164e-06,
1022
- "loss": 0.9633,
1023
  "step": 685
1024
  },
1025
  {
1026
  "epoch": 0.63682510383018,
1027
- "grad_norm": 1.3808605454092582,
1028
  "learning_rate": 7.014311826362804e-06,
1029
- "loss": 0.9611,
1030
  "step": 690
1031
  },
1032
  {
1033
  "epoch": 0.6414397784956161,
1034
- "grad_norm": 1.3567673086915946,
1035
  "learning_rate": 6.860789992569601e-06,
1036
- "loss": 0.9764,
1037
  "step": 695
1038
  },
1039
  {
1040
  "epoch": 0.6460544531610521,
1041
- "grad_norm": 1.3750872799851055,
1042
  "learning_rate": 6.708084614800065e-06,
1043
- "loss": 0.9408,
1044
  "step": 700
1045
  },
1046
  {
1047
  "epoch": 0.6460544531610521,
1048
- "eval_loss": 0.9742818474769592,
1049
- "eval_runtime": 631.1699,
1050
- "eval_samples_per_second": 24.32,
1051
- "eval_steps_per_second": 0.19,
1052
  "step": 700
1053
  },
1054
  {
1055
  "epoch": 0.6506691278264882,
1056
- "grad_norm": 1.3508823042531621,
1057
  "learning_rate": 6.556235409169154e-06,
1058
- "loss": 0.9412,
1059
  "step": 705
1060
  },
1061
  {
1062
  "epoch": 0.6552838024919243,
1063
- "grad_norm": 1.2896871883040175,
1064
  "learning_rate": 6.405281869115768e-06,
1065
- "loss": 0.9454,
1066
  "step": 710
1067
  },
1068
  {
1069
  "epoch": 0.6598984771573604,
1070
- "grad_norm": 1.3142111477173872,
1071
  "learning_rate": 6.255263255131172e-06,
1072
- "loss": 0.9612,
1073
  "step": 715
1074
  },
1075
  {
1076
  "epoch": 0.6645131518227965,
1077
- "grad_norm": 1.328270773616224,
1078
  "learning_rate": 6.106218584547992e-06,
1079
- "loss": 0.9616,
1080
  "step": 720
1081
  },
1082
  {
1083
  "epoch": 0.6691278264882325,
1084
- "grad_norm": 1.3080269409607574,
1085
  "learning_rate": 5.9581866213924656e-06,
1086
- "loss": 0.9497,
1087
  "step": 725
1088
  },
1089
  {
1090
  "epoch": 0.6737425011536686,
1091
- "grad_norm": 1.3131216546886917,
1092
  "learning_rate": 5.811205866302571e-06,
1093
- "loss": 0.9486,
1094
  "step": 730
1095
  },
1096
  {
1097
  "epoch": 0.6783571758191047,
1098
- "grad_norm": 1.3044180991043575,
1099
  "learning_rate": 5.665314546514633e-06,
1100
- "loss": 0.9517,
1101
  "step": 735
1102
  },
1103
  {
1104
  "epoch": 0.6829718504845408,
1105
- "grad_norm": 1.336189592453345,
1106
  "learning_rate": 5.520550605921091e-06,
1107
- "loss": 0.9525,
1108
  "step": 740
1109
  },
1110
  {
1111
  "epoch": 0.687586525149977,
1112
- "grad_norm": 1.323337183074246,
1113
  "learning_rate": 5.376951695201894e-06,
1114
- "loss": 0.955,
1115
  "step": 745
1116
  },
1117
  {
1118
  "epoch": 0.6922011998154131,
1119
- "grad_norm": 1.3688297255320676,
1120
  "learning_rate": 5.234555162032221e-06,
1121
- "loss": 0.9453,
1122
  "step": 750
1123
  },
1124
  {
1125
  "epoch": 0.6968158744808491,
1126
- "grad_norm": 1.3807647297081027,
1127
  "learning_rate": 5.093398041368942e-06,
1128
- "loss": 0.955,
1129
  "step": 755
1130
  },
1131
  {
1132
  "epoch": 0.7014305491462852,
1133
- "grad_norm": 1.396007214455348,
1134
  "learning_rate": 4.9535170458184735e-06,
1135
- "loss": 0.9509,
1136
  "step": 760
1137
  },
1138
  {
1139
  "epoch": 0.7060452238117213,
1140
- "grad_norm": 1.3167881087405213,
1141
  "learning_rate": 4.81494855608843e-06,
1142
- "loss": 0.9524,
1143
  "step": 765
1144
  },
1145
  {
1146
  "epoch": 0.7106598984771574,
1147
- "grad_norm": 1.319725953292428,
1148
  "learning_rate": 4.677728611525605e-06,
1149
- "loss": 0.9484,
1150
  "step": 770
1151
  },
1152
  {
1153
  "epoch": 0.7152745731425935,
1154
- "grad_norm": 1.3234923457514638,
1155
  "learning_rate": 4.541892900742757e-06,
1156
- "loss": 0.9393,
1157
  "step": 775
1158
  },
1159
  {
1160
  "epoch": 0.7198892478080295,
1161
- "grad_norm": 1.3170003796409075,
1162
  "learning_rate": 4.407476752336575e-06,
1163
- "loss": 0.9407,
1164
  "step": 780
1165
  },
1166
  {
1167
  "epoch": 0.7245039224734656,
1168
- "grad_norm": 1.2793467268798606,
1169
  "learning_rate": 4.2745151256993325e-06,
1170
- "loss": 0.9401,
1171
  "step": 785
1172
  },
1173
  {
1174
  "epoch": 0.7291185971389017,
1175
- "grad_norm": 1.3434754761600083,
1176
  "learning_rate": 4.143042601926492e-06,
1177
- "loss": 0.9509,
1178
  "step": 790
1179
  },
1180
  {
1181
  "epoch": 0.7337332718043378,
1182
- "grad_norm": 1.3540277586068123,
1183
  "learning_rate": 4.013093374822789e-06,
1184
- "loss": 0.9346,
1185
  "step": 795
1186
  },
1187
  {
1188
  "epoch": 0.7383479464697739,
1189
- "grad_norm": 1.2479933947202246,
1190
  "learning_rate": 3.884701242008949e-06,
1191
- "loss": 0.925,
1192
  "step": 800
1193
  },
1194
  {
1195
  "epoch": 0.7383479464697739,
1196
- "eval_loss": 0.9587027430534363,
1197
- "eval_runtime": 794.6067,
1198
- "eval_samples_per_second": 19.318,
1199
- "eval_steps_per_second": 0.151,
1200
  "step": 800
1201
  },
1202
  {
1203
  "epoch": 0.7429626211352099,
1204
- "grad_norm": 1.294794320625323,
1205
  "learning_rate": 3.757899596131529e-06,
1206
- "loss": 0.9583,
1207
  "step": 805
1208
  },
1209
  {
1210
  "epoch": 0.747577295800646,
1211
- "grad_norm": 1.2802834042823585,
1212
  "learning_rate": 3.6327214161780287e-06,
1213
- "loss": 0.9781,
1214
  "step": 810
1215
  },
1216
  {
1217
  "epoch": 0.7521919704660821,
1218
- "grad_norm": 1.3383479712956539,
1219
  "learning_rate": 3.5091992588996026e-06,
1220
- "loss": 0.9553,
1221
  "step": 815
1222
  },
1223
  {
1224
  "epoch": 0.7568066451315182,
1225
- "grad_norm": 1.264756496676926,
1226
  "learning_rate": 3.387365250343615e-06,
1227
- "loss": 0.9514,
1228
  "step": 820
1229
  },
1230
  {
1231
  "epoch": 0.7614213197969543,
1232
- "grad_norm": 1.2784320319866365,
1233
  "learning_rate": 3.2672510774981692e-06,
1234
- "loss": 0.9335,
1235
  "step": 825
1236
  },
1237
  {
1238
  "epoch": 0.7660359944623903,
1239
- "grad_norm": 1.301600817119,
1240
  "learning_rate": 3.148887980050872e-06,
1241
- "loss": 0.9406,
1242
  "step": 830
1243
  },
1244
  {
1245
  "epoch": 0.7706506691278265,
1246
- "grad_norm": 1.3681079893540207,
1247
  "learning_rate": 3.032306742263891e-06,
1248
- "loss": 0.9492,
1249
  "step": 835
1250
  },
1251
  {
1252
  "epoch": 0.7752653437932626,
1253
- "grad_norm": 1.258171047985833,
1254
  "learning_rate": 2.9175376849675076e-06,
1255
- "loss": 0.9579,
1256
  "step": 840
1257
  },
1258
  {
1259
  "epoch": 0.7798800184586987,
1260
- "grad_norm": 1.344831232347504,
1261
  "learning_rate": 2.8046106576741605e-06,
1262
- "loss": 0.9258,
1263
  "step": 845
1264
  },
1265
  {
1266
  "epoch": 0.7844946931241348,
1267
- "grad_norm": 1.3148187818640558,
1268
  "learning_rate": 2.693555030815085e-06,
1269
- "loss": 0.9363,
1270
  "step": 850
1271
  },
1272
  {
1273
  "epoch": 0.7891093677895709,
1274
- "grad_norm": 1.3099089177521062,
1275
  "learning_rate": 2.5843996881015676e-06,
1276
- "loss": 0.9356,
1277
  "step": 855
1278
  },
1279
  {
1280
  "epoch": 0.7937240424550069,
1281
- "grad_norm": 1.315538878419098,
1282
  "learning_rate": 2.4771730190127616e-06,
1283
- "loss": 0.933,
1284
  "step": 860
1285
  },
1286
  {
1287
  "epoch": 0.798338717120443,
1288
- "grad_norm": 1.3133036385084866,
1289
  "learning_rate": 2.3719029114120716e-06,
1290
- "loss": 0.931,
1291
  "step": 865
1292
  },
1293
  {
1294
  "epoch": 0.8029533917858791,
1295
- "grad_norm": 1.4006753026532661,
1296
  "learning_rate": 2.2686167442939733e-06,
1297
- "loss": 0.9377,
1298
  "step": 870
1299
  },
1300
  {
1301
  "epoch": 0.8075680664513152,
1302
- "grad_norm": 1.2784631847200105,
1303
  "learning_rate": 2.1673413806632104e-06,
1304
- "loss": 0.9371,
1305
  "step": 875
1306
  },
1307
  {
1308
  "epoch": 0.8121827411167513,
1309
- "grad_norm": 1.2910534416533634,
1310
  "learning_rate": 2.0681031605481563e-06,
1311
- "loss": 0.9328,
1312
  "step": 880
1313
  },
1314
  {
1315
  "epoch": 0.8167974157821873,
1316
- "grad_norm": 1.2874865013796961,
1317
  "learning_rate": 1.9709278941502363e-06,
1318
- "loss": 0.933,
1319
  "step": 885
1320
  },
1321
  {
1322
  "epoch": 0.8214120904476234,
1323
- "grad_norm": 1.3030024076731785,
1324
  "learning_rate": 1.8758408551311048e-06,
1325
- "loss": 0.9293,
1326
  "step": 890
1327
  },
1328
  {
1329
  "epoch": 0.8260267651130595,
1330
- "grad_norm": 1.300621884806125,
1331
  "learning_rate": 1.7828667740394045e-06,
1332
- "loss": 0.9487,
1333
  "step": 895
1334
  },
1335
  {
1336
  "epoch": 0.8306414397784956,
1337
- "grad_norm": 1.2440950362531038,
1338
  "learning_rate": 1.6920298318787532e-06,
1339
- "loss": 0.936,
1340
  "step": 900
1341
  },
1342
  {
1343
  "epoch": 0.8306414397784956,
1344
- "eval_loss": 0.9476920962333679,
1345
- "eval_runtime": 1084.8052,
1346
- "eval_samples_per_second": 14.15,
1347
- "eval_steps_per_second": 0.111,
1348
  "step": 900
1349
  },
1350
  {
1351
  "epoch": 0.8352561144439317,
1352
- "grad_norm": 1.2832171786571007,
1353
  "learning_rate": 1.6033536538186778e-06,
1354
- "loss": 0.9335,
1355
  "step": 905
1356
  },
1357
  {
1358
  "epoch": 0.8398707891093677,
1359
- "grad_norm": 1.301969806108789,
1360
  "learning_rate": 1.5168613030500922e-06,
1361
- "loss": 0.9234,
1362
  "step": 910
1363
  },
1364
  {
1365
  "epoch": 0.8444854637748038,
1366
- "grad_norm": 1.301852459539048,
1367
  "learning_rate": 1.4325752747869626e-06,
1368
- "loss": 0.9371,
1369
  "step": 915
1370
  },
1371
  {
1372
  "epoch": 0.84910013844024,
1373
- "grad_norm": 1.239583785637997,
1374
  "learning_rate": 1.3505174904156593e-06,
1375
- "loss": 0.9279,
1376
  "step": 920
1377
  },
1378
  {
1379
  "epoch": 0.8537148131056761,
1380
- "grad_norm": 1.2857369329321662,
1381
  "learning_rate": 1.2707092917935914e-06,
1382
- "loss": 0.9371,
1383
  "step": 925
1384
  },
1385
  {
1386
  "epoch": 0.8583294877711122,
1387
- "grad_norm": 1.3340476774345473,
1388
  "learning_rate": 1.1931714356985257e-06,
1389
- "loss": 0.9289,
1390
  "step": 930
1391
  },
1392
  {
1393
  "epoch": 0.8629441624365483,
1394
- "grad_norm": 1.2418676415098189,
1395
  "learning_rate": 1.1179240884301158e-06,
1396
- "loss": 0.919,
1397
  "step": 935
1398
  },
1399
  {
1400
  "epoch": 0.8675588371019843,
1401
- "grad_norm": 1.2918276864444544,
1402
  "learning_rate": 1.0449868205649648e-06,
1403
- "loss": 0.9141,
1404
  "step": 940
1405
  },
1406
  {
1407
  "epoch": 0.8721735117674204,
1408
- "grad_norm": 1.2744777449559592,
1409
  "learning_rate": 9.74378601866669e-07,
1410
- "loss": 0.9393,
1411
  "step": 945
1412
  },
1413
  {
1414
  "epoch": 0.8767881864328565,
1415
- "grad_norm": 1.2681324842814665,
1416
  "learning_rate": 9.061177963520751e-07,
1417
- "loss": 0.9418,
1418
  "step": 950
1419
  },
1420
  {
1421
  "epoch": 0.8814028610982926,
1422
- "grad_norm": 1.240762002170098,
1423
  "learning_rate": 8.402221575151238e-07,
1424
- "loss": 0.9137,
1425
  "step": 955
1426
  },
1427
  {
1428
  "epoch": 0.8860175357637287,
1429
- "grad_norm": 1.2413815735797566,
1430
  "learning_rate": 7.767088237094578e-07,
1431
- "loss": 0.9309,
1432
  "step": 960
1433
  },
1434
  {
1435
  "epoch": 0.8906322104291647,
1436
- "grad_norm": 1.2962026497718315,
1437
  "learning_rate": 7.155943136910193e-07,
1438
- "loss": 0.9327,
1439
  "step": 965
1440
  },
1441
  {
1442
  "epoch": 0.8952468850946008,
1443
- "grad_norm": 1.2994627241342351,
1444
  "learning_rate": 6.568945223218048e-07,
1445
- "loss": 0.9357,
1446
  "step": 970
1447
  },
1448
  {
1449
  "epoch": 0.8998615597600369,
1450
- "grad_norm": 1.249471689704099,
1451
  "learning_rate": 6.00624716435868e-07,
1452
- "loss": 0.9182,
1453
  "step": 975
1454
  },
1455
  {
1456
  "epoch": 0.904476234425473,
1457
- "grad_norm": 1.304916858268742,
1458
  "learning_rate": 5.467995308686813e-07,
1459
- "loss": 0.9377,
1460
  "step": 980
1461
  },
1462
  {
1463
  "epoch": 0.9090909090909091,
1464
- "grad_norm": 1.308605845419493,
1465
  "learning_rate": 4.954329646508505e-07,
1466
- "loss": 0.9287,
1467
  "step": 985
1468
  },
1469
  {
1470
  "epoch": 0.9137055837563451,
1471
- "grad_norm": 1.3332538125701177,
1472
  "learning_rate": 4.4653837736721273e-07,
1473
- "loss": 0.9318,
1474
  "step": 990
1475
  },
1476
  {
1477
  "epoch": 0.9183202584217812,
1478
- "grad_norm": 1.242575940479115,
1479
  "learning_rate": 4.001284856822174e-07,
1480
- "loss": 0.9384,
1481
  "step": 995
1482
  },
1483
  {
1484
  "epoch": 0.9229349330872173,
1485
- "grad_norm": 1.2410664748669897,
1486
  "learning_rate": 3.562153600325491e-07,
1487
- "loss": 0.9192,
1488
  "step": 1000
1489
  },
1490
  {
1491
  "epoch": 0.9229349330872173,
1492
- "eval_loss": 0.9423367381095886,
1493
- "eval_runtime": 631.8993,
1494
- "eval_samples_per_second": 24.292,
1495
- "eval_steps_per_second": 0.19,
1496
  "step": 1000
1497
  },
1498
  {
1499
  "epoch": 0.9275496077526535,
1500
- "grad_norm": 1.2750064585233012,
1501
  "learning_rate": 3.1481042148779674e-07,
1502
- "loss": 0.937,
1503
  "step": 1005
1504
  },
1505
  {
1506
  "epoch": 0.9321642824180896,
1507
- "grad_norm": 1.296644486701872,
1508
  "learning_rate": 2.7592443878003196e-07,
1509
- "loss": 0.9379,
1510
  "step": 1010
1511
  },
1512
  {
1513
  "epoch": 0.9367789570835257,
1514
- "grad_norm": 1.2639158911345783,
1515
  "learning_rate": 2.395675255030383e-07,
1516
- "loss": 0.9201,
1517
  "step": 1015
1518
  },
1519
  {
1520
  "epoch": 0.9413936317489617,
1521
- "grad_norm": 1.2712336912350524,
1522
  "learning_rate": 2.057491374819365e-07,
1523
- "loss": 0.9229,
1524
  "step": 1020
1525
  },
1526
  {
1527
  "epoch": 0.9460083064143978,
1528
- "grad_norm": 1.2545982973718852,
1529
  "learning_rate": 1.7447807031388264e-07,
1530
- "loss": 0.9311,
1531
  "step": 1025
1532
  },
1533
  {
1534
  "epoch": 0.9506229810798339,
1535
- "grad_norm": 1.304991714097288,
1536
  "learning_rate": 1.457624570804772e-07,
1537
- "loss": 0.9154,
1538
  "step": 1030
1539
  },
1540
  {
1541
  "epoch": 0.95523765574527,
1542
- "grad_norm": 1.305162882196052,
1543
  "learning_rate": 1.196097662324902e-07,
1544
- "loss": 0.9362,
1545
  "step": 1035
1546
  },
1547
  {
1548
  "epoch": 0.9598523304107061,
1549
- "grad_norm": 1.2643702437746445,
1550
  "learning_rate": 9.602679964744288e-08,
1551
- "loss": 0.9146,
1552
  "step": 1040
1553
  },
1554
  {
1555
  "epoch": 0.9644670050761421,
1556
- "grad_norm": 1.2890802928691847,
1557
  "learning_rate": 7.501969086054717e-08,
1558
- "loss": 0.9302,
1559
  "step": 1045
1560
  },
1561
  {
1562
  "epoch": 0.9690816797415782,
1563
- "grad_norm": 1.3207575231611461,
1564
  "learning_rate": 5.659390346948179e-08,
1565
- "loss": 0.9392,
1566
  "step": 1050
1567
  },
1568
  {
1569
  "epoch": 0.9736963544070143,
1570
- "grad_norm": 1.2493549478675108,
1571
  "learning_rate": 4.075422971340115e-08,
1572
- "loss": 0.9386,
1573
  "step": 1055
1574
  },
1575
  {
1576
  "epoch": 0.9783110290724504,
1577
- "grad_norm": 1.2867036484493364,
1578
  "learning_rate": 2.7504789226548977e-08,
1579
- "loss": 0.9252,
1580
  "step": 1060
1581
  },
1582
  {
1583
  "epoch": 0.9829257037378865,
1584
- "grad_norm": 1.2128821195340131,
1585
  "learning_rate": 1.6849027966816535e-08,
1586
- "loss": 0.9248,
1587
  "step": 1065
1588
  },
1589
  {
1590
  "epoch": 0.9875403784033225,
1591
- "grad_norm": 1.2424791609426662,
1592
  "learning_rate": 8.789717319505065e-09,
1593
- "loss": 0.9339,
1594
  "step": 1070
1595
  },
1596
  {
1597
  "epoch": 0.9921550530687586,
1598
- "grad_norm": 1.2572456229816387,
1599
  "learning_rate": 3.328953376530164e-09,
1600
- "loss": 0.9287,
1601
  "step": 1075
1602
  },
1603
  {
1604
  "epoch": 0.9967697277341947,
1605
- "grad_norm": 1.3124951219428418,
1606
  "learning_rate": 4.681563912700693e-10,
1607
- "loss": 0.9176,
1608
  "step": 1080
1609
  },
1610
  {
1611
  "epoch": 0.9995385325334564,
1612
  "step": 1083,
1613
  "total_flos": 453306954547200.0,
1614
- "train_loss": 0.9835369018966802,
1615
- "train_runtime": 35743.0085,
1616
- "train_samples_per_second": 3.88,
1617
- "train_steps_per_second": 0.03
1618
  }
1619
  ],
1620
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0009229349330872173,
13
+ "grad_norm": 10.726049490177841,
14
  "learning_rate": 1.8348623853211012e-07,
15
+ "loss": 1.1497,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.0046146746654360865,
20
+ "grad_norm": 8.900657171326609,
21
  "learning_rate": 9.174311926605506e-07,
22
+ "loss": 1.1397,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.009229349330872173,
27
+ "grad_norm": 3.735942046557525,
28
  "learning_rate": 1.8348623853211011e-06,
29
+ "loss": 1.0694,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.01384402399630826,
34
+ "grad_norm": 2.8318178389546484,
35
  "learning_rate": 2.7522935779816517e-06,
36
+ "loss": 1.0196,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.018458698661744346,
41
+ "grad_norm": 2.114174455764573,
42
  "learning_rate": 3.6697247706422022e-06,
43
+ "loss": 1.0136,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.023073373327180433,
48
+ "grad_norm": 2.310487403214644,
49
  "learning_rate": 4.587155963302753e-06,
50
+ "loss": 1.0042,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.02768804799261652,
55
+ "grad_norm": 2.4801571550073933,
56
  "learning_rate": 5.504587155963303e-06,
57
+ "loss": 0.9742,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.032302722658052604,
62
+ "grad_norm": 2.902534165090561,
63
  "learning_rate": 6.422018348623854e-06,
64
+ "loss": 1.0012,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.03691739732348869,
69
+ "grad_norm": 2.2959784184113547,
70
  "learning_rate": 7.3394495412844045e-06,
71
+ "loss": 1.0086,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.04153207198892478,
76
+ "grad_norm": 2.3869379848167416,
77
  "learning_rate": 8.256880733944956e-06,
78
+ "loss": 0.987,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.046146746654360866,
83
+ "grad_norm": 2.0557468408917527,
84
  "learning_rate": 9.174311926605506e-06,
85
+ "loss": 0.9824,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.050761421319796954,
90
+ "grad_norm": 2.2160008605472874,
91
  "learning_rate": 1.0091743119266055e-05,
92
+ "loss": 0.986,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.05537609598523304,
97
+ "grad_norm": 2.1320443631302006,
98
  "learning_rate": 1.1009174311926607e-05,
99
+ "loss": 1.0019,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.05999077065066913,
104
+ "grad_norm": 2.4823142489717016,
105
  "learning_rate": 1.1926605504587156e-05,
106
+ "loss": 1.0048,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.06460544531610521,
111
+ "grad_norm": 2.5334243917693295,
112
  "learning_rate": 1.2844036697247708e-05,
113
+ "loss": 0.9828,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.0692201199815413,
118
+ "grad_norm": 2.436109706504398,
119
  "learning_rate": 1.3761467889908258e-05,
120
+ "loss": 0.9931,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.07383479464697738,
125
+ "grad_norm": 2.1346723037619695,
126
  "learning_rate": 1.4678899082568809e-05,
127
+ "loss": 0.986,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.07844946931241348,
132
+ "grad_norm": 1.9789737582877578,
133
  "learning_rate": 1.559633027522936e-05,
134
+ "loss": 1.0089,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.08306414397784956,
139
+ "grad_norm": 2.033387276422637,
140
  "learning_rate": 1.6513761467889912e-05,
141
+ "loss": 1.0083,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.08767881864328565,
146
+ "grad_norm": 2.243076459165097,
147
  "learning_rate": 1.743119266055046e-05,
148
+ "loss": 1.0175,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.09229349330872173,
153
+ "grad_norm": 1.953405447633714,
154
  "learning_rate": 1.834862385321101e-05,
155
+ "loss": 1.025,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.09229349330872173,
160
+ "eval_loss": 1.0240174531936646,
161
+ "eval_runtime": 714.5925,
162
+ "eval_samples_per_second": 21.481,
163
+ "eval_steps_per_second": 0.168,
164
  "step": 100
165
  },
166
  {
167
  "epoch": 0.09690816797415783,
168
+ "grad_norm": 2.244939327354232,
169
  "learning_rate": 1.9266055045871563e-05,
170
+ "loss": 1.0494,
171
  "step": 105
172
  },
173
  {
174
  "epoch": 0.10152284263959391,
175
+ "grad_norm": 2.0565138965118406,
176
  "learning_rate": 1.9999947982262415e-05,
177
+ "loss": 1.0345,
178
  "step": 110
179
  },
180
  {
181
  "epoch": 0.10613751730503,
182
+ "grad_norm": 2.0332956026689386,
183
  "learning_rate": 1.9998127418269004e-05,
184
+ "loss": 1.0454,
185
  "step": 115
186
  },
187
  {
188
  "epoch": 0.11075219197046608,
189
+ "grad_norm": 2.1387249854354566,
190
  "learning_rate": 1.9993706508539968e-05,
191
+ "loss": 1.0255,
192
  "step": 120
193
  },
194
  {
195
  "epoch": 0.11536686663590216,
196
+ "grad_norm": 2.458364994524856,
197
  "learning_rate": 1.998668640288e-05,
198
+ "loss": 1.0518,
199
  "step": 125
200
  },
201
  {
202
  "epoch": 0.11998154130133826,
203
+ "grad_norm": 1.8099694655556404,
204
  "learning_rate": 1.997706892710117e-05,
205
+ "loss": 1.0224,
206
  "step": 130
207
  },
208
  {
209
  "epoch": 0.12459621596677434,
210
+ "grad_norm": 2.202808282950665,
211
  "learning_rate": 1.9964856582548094e-05,
212
+ "loss": 1.0552,
213
  "step": 135
214
  },
215
  {
216
  "epoch": 0.12921089063221042,
217
+ "grad_norm": 1.7439069720540679,
218
  "learning_rate": 1.9950052545447354e-05,
219
+ "loss": 1.0509,
220
  "step": 140
221
  },
222
  {
223
  "epoch": 0.13382556529764653,
224
+ "grad_norm": 1.8051398779741403,
225
  "learning_rate": 1.993266066608142e-05,
226
+ "loss": 1.0298,
227
  "step": 145
228
  },
229
  {
230
  "epoch": 0.1384402399630826,
231
+ "grad_norm": 1.949872455091144,
232
  "learning_rate": 1.991268546778726e-05,
233
+ "loss": 1.0152,
234
  "step": 150
235
  },
236
  {
237
  "epoch": 0.1430549146285187,
238
+ "grad_norm": 2.004864424785268,
239
  "learning_rate": 1.9890132145779885e-05,
240
+ "loss": 1.0682,
241
  "step": 155
242
  },
243
  {
244
  "epoch": 0.14766958929395477,
245
+ "grad_norm": 1.9881342422206065,
246
  "learning_rate": 1.986500656580118e-05,
247
+ "loss": 1.0275,
248
  "step": 160
249
  },
250
  {
251
  "epoch": 0.15228426395939088,
252
+ "grad_norm": 1.895358789872697,
253
  "learning_rate": 1.9837315262594307e-05,
254
+ "loss": 1.0341,
255
  "step": 165
256
  },
257
  {
258
  "epoch": 0.15689893862482696,
259
+ "grad_norm": 2.089231797232654,
260
  "learning_rate": 1.980706543820412e-05,
261
  "loss": 1.0367,
262
  "step": 170
263
  },
264
  {
265
  "epoch": 0.16151361329026304,
266
+ "grad_norm": 1.7637365585658213,
267
  "learning_rate": 1.9774264960104056e-05,
268
+ "loss": 1.0223,
269
  "step": 175
270
  },
271
  {
272
  "epoch": 0.16612828795569912,
273
+ "grad_norm": 1.7824319082067301,
274
  "learning_rate": 1.9738922359149927e-05,
275
+ "loss": 1.0352,
276
  "step": 180
277
  },
278
  {
279
  "epoch": 0.1707429626211352,
280
+ "grad_norm": 1.982313038360383,
281
  "learning_rate": 1.9701046827361175e-05,
282
+ "loss": 1.0386,
283
  "step": 185
284
  },
285
  {
286
  "epoch": 0.1753576372865713,
287
+ "grad_norm": 1.6545978849182734,
288
  "learning_rate": 1.9660648215530207e-05,
289
+ "loss": 1.0247,
290
  "step": 190
291
  },
292
  {
293
  "epoch": 0.17997231195200739,
294
+ "grad_norm": 1.782636456859102,
295
  "learning_rate": 1.9617737030660338e-05,
296
+ "loss": 1.0305,
297
  "step": 195
298
  },
299
  {
300
  "epoch": 0.18458698661744347,
301
+ "grad_norm": 1.8378678572944849,
302
  "learning_rate": 1.9572324433233122e-05,
303
+ "loss": 1.033,
304
  "step": 200
305
  },
306
  {
307
  "epoch": 0.18458698661744347,
308
+ "eval_loss": 1.0464073419570923,
309
+ "eval_runtime": 647.0101,
310
+ "eval_samples_per_second": 23.725,
311
+ "eval_steps_per_second": 0.185,
312
  "step": 200
313
  },
314
  {
315
  "epoch": 0.18920166128287955,
316
+ "grad_norm": 1.6648522003545267,
317
  "learning_rate": 1.9524422234305677e-05,
318
+ "loss": 1.0268,
319
  "step": 205
320
  },
321
  {
322
  "epoch": 0.19381633594831565,
323
+ "grad_norm": 1.7206083361405007,
324
  "learning_rate": 1.9474042892438848e-05,
325
+ "loss": 1.0104,
326
  "step": 210
327
  },
328
  {
329
  "epoch": 0.19843101061375173,
330
+ "grad_norm": 2.7587017854007194,
331
  "learning_rate": 1.942119951045692e-05,
332
+ "loss": 1.0338,
333
  "step": 215
334
  },
335
  {
336
  "epoch": 0.20304568527918782,
337
+ "grad_norm": 1.7930536986404009,
338
  "learning_rate": 1.9365905832039814e-05,
339
+ "loss": 1.0614,
340
  "step": 220
341
  },
342
  {
343
  "epoch": 0.2076603599446239,
344
+ "grad_norm": 1.7259048611678551,
345
  "learning_rate": 1.9308176238148565e-05,
346
+ "loss": 1.051,
347
  "step": 225
348
  },
349
  {
350
  "epoch": 0.21227503461006,
351
+ "grad_norm": 1.9820066538391885,
352
  "learning_rate": 1.924802574328509e-05,
353
+ "loss": 1.0259,
354
  "step": 230
355
  },
356
  {
357
  "epoch": 0.21688970927549608,
358
+ "grad_norm": 1.9752532611378077,
359
  "learning_rate": 1.9185469991587166e-05,
360
+ "loss": 1.045,
361
  "step": 235
362
  },
363
  {
364
  "epoch": 0.22150438394093216,
365
+ "grad_norm": 1.6132930572572803,
366
  "learning_rate": 1.912052525275965e-05,
367
+ "loss": 1.0343,
368
  "step": 240
369
  },
370
  {
371
  "epoch": 0.22611905860636825,
372
+ "grad_norm": 1.7584131810475476,
373
  "learning_rate": 1.905320841784298e-05,
374
+ "loss": 1.0341,
375
  "step": 245
376
  },
377
  {
378
  "epoch": 0.23073373327180433,
379
+ "grad_norm": 1.70806063018753,
380
  "learning_rate": 1.898353699482014e-05,
381
+ "loss": 1.0335,
382
  "step": 250
383
  },
384
  {
385
  "epoch": 0.23534840793724043,
386
+ "grad_norm": 1.6516040151613625,
387
  "learning_rate": 1.8911529104063093e-05,
388
+ "loss": 1.0388,
389
  "step": 255
390
  },
391
  {
392
  "epoch": 0.23996308260267651,
393
+ "grad_norm": 1.6090722840992302,
394
  "learning_rate": 1.8837203473619978e-05,
395
+ "loss": 1.0423,
396
  "step": 260
397
  },
398
  {
399
  "epoch": 0.2445777572681126,
400
+ "grad_norm": 1.7713270990326546,
401
  "learning_rate": 1.8760579434344283e-05,
402
+ "loss": 1.0475,
403
  "step": 265
404
  },
405
  {
406
  "epoch": 0.24919243193354867,
407
+ "grad_norm": 1.658315636443391,
408
  "learning_rate": 1.8681676914867176e-05,
409
+ "loss": 1.0484,
410
  "step": 270
411
  },
412
  {
413
  "epoch": 0.25380710659898476,
414
+ "grad_norm": 1.7916964421225479,
415
  "learning_rate": 1.860051643641443e-05,
416
+ "loss": 1.0312,
417
  "step": 275
418
  },
419
  {
420
  "epoch": 0.25842178126442084,
421
+ "grad_norm": 1.6115818403548052,
422
  "learning_rate": 1.8517119107469194e-05,
423
+ "loss": 1.0406,
424
  "step": 280
425
  },
426
  {
427
  "epoch": 0.26303645592985697,
428
+ "grad_norm": 1.5352737986672527,
429
  "learning_rate": 1.8431506618282e-05,
430
+ "loss": 1.0308,
431
  "step": 285
432
  },
433
  {
434
  "epoch": 0.26765113059529305,
435
+ "grad_norm": 1.6588566683503214,
436
  "learning_rate": 1.834370123522954e-05,
437
+ "loss": 1.0427,
438
  "step": 290
439
  },
440
  {
441
  "epoch": 0.27226580526072913,
442
+ "grad_norm": 1.5272038076819447,
443
  "learning_rate": 1.8253725795023504e-05,
444
+ "loss": 1.0309,
445
  "step": 295
446
  },
447
  {
448
  "epoch": 0.2768804799261652,
449
+ "grad_norm": 1.5715430487703328,
450
  "learning_rate": 1.816160369877117e-05,
451
+ "loss": 1.037,
452
  "step": 300
453
  },
454
  {
455
  "epoch": 0.2768804799261652,
456
+ "eval_loss": 1.0424165725708008,
457
+ "eval_runtime": 645.7635,
458
+ "eval_samples_per_second": 23.77,
459
+ "eval_steps_per_second": 0.186,
460
  "step": 300
461
  },
462
  {
463
  "epoch": 0.2814951545916013,
464
+ "grad_norm": 1.7174412533746373,
465
  "learning_rate": 1.8067358905889148e-05,
466
+ "loss": 1.0107,
467
  "step": 305
468
  },
469
  {
470
  "epoch": 0.2861098292570374,
471
+ "grad_norm": 1.6294029724129888,
472
  "learning_rate": 1.797101592787194e-05,
473
+ "loss": 1.0333,
474
  "step": 310
475
  },
476
  {
477
  "epoch": 0.29072450392247345,
478
+ "grad_norm": 1.5569394040938476,
479
  "learning_rate": 1.7872599821916922e-05,
480
+ "loss": 1.0253,
481
  "step": 315
482
  },
483
  {
484
  "epoch": 0.29533917858790953,
485
+ "grad_norm": 1.5576353282044078,
486
  "learning_rate": 1.7772136184407367e-05,
487
+ "loss": 1.0258,
488
  "step": 320
489
  },
490
  {
491
  "epoch": 0.2999538532533456,
492
+ "grad_norm": 1.6327760193096779,
493
  "learning_rate": 1.7669651144255265e-05,
494
+ "loss": 1.0354,
495
  "step": 325
496
  },
497
  {
498
  "epoch": 0.30456852791878175,
499
+ "grad_norm": 1.5902185774151916,
500
  "learning_rate": 1.7565171356105627e-05,
501
+ "loss": 1.0473,
502
  "step": 330
503
  },
504
  {
505
  "epoch": 0.30918320258421783,
506
+ "grad_norm": 1.6853713064054245,
507
  "learning_rate": 1.7458723993404065e-05,
508
+ "loss": 1.0423,
509
  "step": 335
510
  },
511
  {
512
  "epoch": 0.3137978772496539,
513
+ "grad_norm": 1.6156935982005598,
514
  "learning_rate": 1.7350336741329413e-05,
515
+ "loss": 1.032,
516
  "step": 340
517
  },
518
  {
519
  "epoch": 0.31841255191509,
520
+ "grad_norm": 1.6752591408238855,
521
  "learning_rate": 1.7240037789593307e-05,
522
+ "loss": 1.0409,
523
  "step": 345
524
  },
525
  {
526
  "epoch": 0.3230272265805261,
527
+ "grad_norm": 1.6216674961461026,
528
  "learning_rate": 1.712785582510848e-05,
529
+ "loss": 1.0146,
530
  "step": 350
531
  },
532
  {
533
  "epoch": 0.32764190124596215,
534
+ "grad_norm": 1.5384588472944032,
535
  "learning_rate": 1.70138200245278e-05,
536
+ "loss": 1.0257,
537
  "step": 355
538
  },
539
  {
540
  "epoch": 0.33225657591139823,
541
+ "grad_norm": 2.288338756325292,
542
  "learning_rate": 1.6897960046655886e-05,
543
+ "loss": 1.033,
544
  "step": 360
545
  },
546
  {
547
  "epoch": 0.3368712505768343,
548
+ "grad_norm": 1.7385161337142583,
549
  "learning_rate": 1.6780306024735384e-05,
550
+ "loss": 1.0213,
551
  "step": 365
552
  },
553
  {
554
  "epoch": 0.3414859252422704,
555
+ "grad_norm": 2.2001962662713885,
556
  "learning_rate": 1.6660888558609774e-05,
557
+ "loss": 1.0451,
558
  "step": 370
559
  },
560
  {
561
  "epoch": 0.34610059990770653,
562
+ "grad_norm": 1.5300219579349443,
563
  "learning_rate": 1.6539738706764895e-05,
564
+ "loss": 1.0282,
565
  "step": 375
566
  },
567
  {
568
  "epoch": 0.3507152745731426,
569
+ "grad_norm": 1.4444114105238783,
570
  "learning_rate": 1.6416887978251134e-05,
571
+ "loss": 1.0203,
572
  "step": 380
573
  },
574
  {
575
  "epoch": 0.3553299492385787,
576
+ "grad_norm": 1.5804824265139066,
577
  "learning_rate": 1.6292368324488462e-05,
578
+ "loss": 1.0012,
579
  "step": 385
580
  },
581
  {
582
  "epoch": 0.35994462390401477,
583
+ "grad_norm": 1.4610394973013912,
584
  "learning_rate": 1.6166212130956383e-05,
585
+ "loss": 1.0116,
586
  "step": 390
587
  },
588
  {
589
  "epoch": 0.36455929856945085,
590
+ "grad_norm": 1.5120140583271204,
591
  "learning_rate": 1.6038452208771037e-05,
592
+ "loss": 1.0319,
593
  "step": 395
594
  },
595
  {
596
  "epoch": 0.36917397323488693,
597
+ "grad_norm": 1.516281881967303,
598
  "learning_rate": 1.590912178615157e-05,
599
+ "loss": 1.0136,
600
  "step": 400
601
  },
602
  {
603
  "epoch": 0.36917397323488693,
604
+ "eval_loss": 1.0295383930206299,
605
+ "eval_runtime": 651.8755,
606
+ "eval_samples_per_second": 23.547,
607
+ "eval_steps_per_second": 0.184,
608
  "step": 400
609
  },
610
  {
611
  "epoch": 0.373788647900323,
612
+ "grad_norm": 1.7544165660209403,
613
  "learning_rate": 1.5778254499778006e-05,
614
+ "loss": 1.0196,
615
  "step": 405
616
  },
617
  {
618
  "epoch": 0.3784033225657591,
619
+ "grad_norm": 1.5692875836635374,
620
  "learning_rate": 1.564588438604296e-05,
621
+ "loss": 1.0094,
622
  "step": 410
623
  },
624
  {
625
  "epoch": 0.3830179972311952,
626
+ "grad_norm": 1.5588154190139185,
627
  "learning_rate": 1.551204587219928e-05,
628
+ "loss": 0.9973,
629
  "step": 415
630
  },
631
  {
632
  "epoch": 0.3876326718966313,
633
+ "grad_norm": 1.773673892309899,
634
  "learning_rate": 1.5376773767406142e-05,
635
+ "loss": 1.0388,
636
  "step": 420
637
  },
638
  {
639
  "epoch": 0.3922473465620674,
640
+ "grad_norm": 1.5489285794659653,
641
  "learning_rate": 1.5240103253675756e-05,
642
+ "loss": 1.0087,
643
  "step": 425
644
  },
645
  {
646
  "epoch": 0.39686202122750347,
647
+ "grad_norm": 1.6551129777825688,
648
  "learning_rate": 1.51020698767231e-05,
649
+ "loss": 1.0164,
650
  "step": 430
651
  },
652
  {
653
  "epoch": 0.40147669589293955,
654
+ "grad_norm": 1.4090672242535114,
655
  "learning_rate": 1.4962709536721087e-05,
656
+ "loss": 0.997,
657
  "step": 435
658
  },
659
  {
660
  "epoch": 0.40609137055837563,
661
+ "grad_norm": 1.5171777358410203,
662
  "learning_rate": 1.4822058478963532e-05,
663
+ "loss": 1.0132,
664
  "step": 440
665
  },
666
  {
667
  "epoch": 0.4107060452238117,
668
+ "grad_norm": 1.4969201768034885,
669
  "learning_rate": 1.4680153284438345e-05,
670
+ "loss": 1.0119,
671
  "step": 445
672
  },
673
  {
674
  "epoch": 0.4153207198892478,
675
+ "grad_norm": 1.5363202791746906,
676
  "learning_rate": 1.4537030860313443e-05,
677
+ "loss": 1.0188,
678
  "step": 450
679
  },
680
  {
681
  "epoch": 0.41993539455468387,
682
+ "grad_norm": 1.4963468746071473,
683
  "learning_rate": 1.4392728430337801e-05,
684
+ "loss": 0.9952,
685
  "step": 455
686
  },
687
  {
688
  "epoch": 0.42455006922012,
689
+ "grad_norm": 1.500810806740765,
690
  "learning_rate": 1.4247283525160178e-05,
691
+ "loss": 0.9973,
692
  "step": 460
693
  },
694
  {
695
  "epoch": 0.4291647438855561,
696
+ "grad_norm": 1.5508662694848825,
697
  "learning_rate": 1.4100733972568038e-05,
698
+ "loss": 1.0085,
699
  "step": 465
700
  },
701
  {
702
  "epoch": 0.43377941855099217,
703
+ "grad_norm": 1.5955354844051932,
704
  "learning_rate": 1.3953117887649153e-05,
705
+ "loss": 1.0215,
706
  "step": 470
707
  },
708
  {
709
  "epoch": 0.43839409321642825,
710
+ "grad_norm": 1.4682043182906732,
711
  "learning_rate": 1.3804473662878519e-05,
712
+ "loss": 1.0143,
713
  "step": 475
714
  },
715
  {
716
  "epoch": 0.44300876788186433,
717
+ "grad_norm": 1.4980307846950924,
718
  "learning_rate": 1.3654839958133118e-05,
719
+ "loss": 1.0026,
720
  "step": 480
721
  },
722
  {
723
  "epoch": 0.4476234425473004,
724
+ "grad_norm": 1.5233835818444807,
725
  "learning_rate": 1.3504255690637122e-05,
726
+ "loss": 1.0205,
727
  "step": 485
728
  },
729
  {
730
  "epoch": 0.4522381172127365,
731
+ "grad_norm": 1.4814525071349245,
732
  "learning_rate": 1.3352760024840174e-05,
733
+ "loss": 0.9967,
734
  "step": 490
735
  },
736
  {
737
  "epoch": 0.45685279187817257,
738
+ "grad_norm": 1.4499082430240968,
739
  "learning_rate": 1.3200392362231385e-05,
740
+ "loss": 0.9842,
741
  "step": 495
742
  },
743
  {
744
  "epoch": 0.46146746654360865,
745
+ "grad_norm": 1.5561735389313882,
746
  "learning_rate": 1.3047192331091636e-05,
747
+ "loss": 1.0229,
748
  "step": 500
749
  },
750
  {
751
  "epoch": 0.46146746654360865,
752
+ "eval_loss": 1.015141248703003,
753
+ "eval_runtime": 633.8229,
754
+ "eval_samples_per_second": 24.218,
755
+ "eval_steps_per_second": 0.189,
756
  "step": 500
757
  },
758
  {
759
  "epoch": 0.4660821412090448,
760
+ "grad_norm": 1.4233232765996602,
761
  "learning_rate": 1.2893199776186957e-05,
762
+ "loss": 0.9936,
763
  "step": 505
764
  },
765
  {
766
  "epoch": 0.47069681587448087,
767
+ "grad_norm": 1.54900822797248,
768
  "learning_rate": 1.2738454748405552e-05,
769
+ "loss": 1.0102,
770
  "step": 510
771
  },
772
  {
773
  "epoch": 0.47531149053991695,
774
+ "grad_norm": 1.4249354749013639,
775
  "learning_rate": 1.258299749434123e-05,
776
+ "loss": 1.013,
777
  "step": 515
778
  },
779
  {
780
  "epoch": 0.47992616520535303,
781
+ "grad_norm": 1.468956411146474,
782
  "learning_rate": 1.2426868445825955e-05,
783
+ "loss": 1.0027,
784
  "step": 520
785
  },
786
  {
787
  "epoch": 0.4845408398707891,
788
+ "grad_norm": 1.515134495058657,
789
  "learning_rate": 1.2270108209414186e-05,
790
+ "loss": 0.9825,
791
  "step": 525
792
  },
793
  {
794
  "epoch": 0.4891555145362252,
795
+ "grad_norm": 1.49493206284371,
796
  "learning_rate": 1.2112757555821796e-05,
797
+ "loss": 0.9968,
798
  "step": 530
799
  },
800
  {
801
  "epoch": 0.49377018920166127,
802
+ "grad_norm": 1.494232964423619,
803
  "learning_rate": 1.1954857409322302e-05,
804
+ "loss": 0.9808,
805
  "step": 535
806
  },
807
  {
808
  "epoch": 0.49838486386709735,
809
+ "grad_norm": 1.5895499778471747,
810
  "learning_rate": 1.179644883710313e-05,
811
+ "loss": 0.996,
812
  "step": 540
813
  },
814
  {
815
  "epoch": 0.5029995385325334,
816
+ "grad_norm": 1.575516689496947,
817
  "learning_rate": 1.1637573038584729e-05,
818
+ "loss": 0.9843,
819
  "step": 545
820
  },
821
  {
822
  "epoch": 0.5076142131979695,
823
+ "grad_norm": 1.5289310135121519,
824
  "learning_rate": 1.1478271334705302e-05,
825
+ "loss": 0.9897,
826
  "step": 550
827
  },
828
  {
829
  "epoch": 0.5122288878634056,
830
+ "grad_norm": 1.487892885517731,
831
  "learning_rate": 1.1318585157173913e-05,
832
+ "loss": 0.9965,
833
  "step": 555
834
  },
835
  {
836
  "epoch": 0.5168435625288417,
837
+ "grad_norm": 1.504695649448808,
838
  "learning_rate": 1.115855603769479e-05,
839
+ "loss": 0.9864,
840
  "step": 560
841
  },
842
  {
843
  "epoch": 0.5214582371942778,
844
+ "grad_norm": 1.444258657078223,
845
  "learning_rate": 1.0998225597165628e-05,
846
+ "loss": 0.9824,
847
  "step": 565
848
  },
849
  {
850
  "epoch": 0.5260729118597139,
851
+ "grad_norm": 1.452291205660523,
852
  "learning_rate": 1.0837635534852687e-05,
853
+ "loss": 0.9806,
854
  "step": 570
855
  },
856
  {
857
  "epoch": 0.53068758652515,
858
+ "grad_norm": 1.4809970617721466,
859
  "learning_rate": 1.0676827617545511e-05,
860
+ "loss": 0.98,
861
  "step": 575
862
  },
863
  {
864
  "epoch": 0.5353022611905861,
865
+ "grad_norm": 1.4688234901022226,
866
  "learning_rate": 1.0515843668694087e-05,
867
+ "loss": 0.9785,
868
  "step": 580
869
  },
870
  {
871
  "epoch": 0.5399169358560222,
872
+ "grad_norm": 1.4825659064745627,
873
  "learning_rate": 1.0354725557531258e-05,
874
+ "loss": 0.9776,
875
  "step": 585
876
  },
877
  {
878
  "epoch": 0.5445316105214583,
879
+ "grad_norm": 1.3801777122885093,
880
  "learning_rate": 1.0193515188183246e-05,
881
+ "loss": 0.9687,
882
  "step": 590
883
  },
884
  {
885
  "epoch": 0.5491462851868943,
886
+ "grad_norm": 1.421023225061784,
887
  "learning_rate": 1.003225448877108e-05,
888
+ "loss": 0.9964,
889
  "step": 595
890
  },
891
  {
892
  "epoch": 0.5537609598523304,
893
+ "grad_norm": 1.3889284539657671,
894
  "learning_rate": 9.870985400505805e-06,
895
+ "loss": 0.9745,
896
  "step": 600
897
  },
898
  {
899
  "epoch": 0.5537609598523304,
900
+ "eval_loss": 0.9945215582847595,
901
+ "eval_runtime": 661.3891,
902
+ "eval_samples_per_second": 23.209,
903
+ "eval_steps_per_second": 0.181,
904
  "step": 600
905
  },
906
  {
907
  "epoch": 0.5583756345177665,
908
+ "grad_norm": 1.4429569586116144,
909
  "learning_rate": 9.709749866780248e-06,
910
+ "loss": 0.9805,
911
  "step": 605
912
  },
913
  {
914
  "epoch": 0.5629903091832026,
915
+ "grad_norm": 1.4656449742761994,
916
  "learning_rate": 9.548589822260281e-06,
917
+ "loss": 0.9895,
918
  "step": 610
919
  },
920
  {
921
  "epoch": 0.5676049838486387,
922
+ "grad_norm": 1.3965932035586004,
923
  "learning_rate": 9.387547181978291e-06,
924
+ "loss": 0.9744,
925
  "step": 615
926
  },
927
  {
928
  "epoch": 0.5722196585140747,
929
+ "grad_norm": 1.359374657149616,
930
  "learning_rate": 9.226663830431777e-06,
931
+ "loss": 0.9824,
932
  "step": 620
933
  },
934
  {
935
  "epoch": 0.5768343331795108,
936
+ "grad_norm": 1.3668229629199753,
937
  "learning_rate": 9.065981610689915e-06,
938
+ "loss": 0.9706,
939
  "step": 625
940
  },
941
  {
942
  "epoch": 0.5814490078449469,
943
+ "grad_norm": 1.373500531171451,
944
  "learning_rate": 8.905542313510846e-06,
945
+ "loss": 0.9796,
946
  "step": 630
947
  },
948
  {
949
  "epoch": 0.586063682510383,
950
+ "grad_norm": 1.4067124446675243,
951
  "learning_rate": 8.745387666472639e-06,
952
+ "loss": 0.9879,
953
  "step": 635
954
  },
955
  {
956
  "epoch": 0.5906783571758191,
957
+ "grad_norm": 1.436245514521079,
958
  "learning_rate": 8.58555932312059e-06,
959
+ "loss": 0.9894,
960
  "step": 640
961
  },
962
  {
963
  "epoch": 0.5952930318412551,
964
+ "grad_norm": 1.429504715827128,
965
  "learning_rate": 8.426098852133892e-06,
966
+ "loss": 0.9643,
967
  "step": 645
968
  },
969
  {
970
  "epoch": 0.5999077065066912,
971
+ "grad_norm": 1.3728127558164411,
972
  "learning_rate": 8.267047726514278e-06,
973
+ "loss": 0.9813,
974
  "step": 650
975
  },
976
  {
977
  "epoch": 0.6045223811721273,
978
+ "grad_norm": 1.3422366968784711,
979
  "learning_rate": 8.108447312799588e-06,
980
+ "loss": 0.972,
981
  "step": 655
982
  },
983
  {
984
  "epoch": 0.6091370558375635,
985
+ "grad_norm": 1.4348417465202754,
986
  "learning_rate": 7.950338860305049e-06,
987
+ "loss": 0.9638,
988
  "step": 660
989
  },
990
  {
991
  "epoch": 0.6137517305029996,
992
+ "grad_norm": 1.3342023162033965,
993
  "learning_rate": 7.792763490394983e-06,
994
+ "loss": 0.9733,
995
  "step": 665
996
  },
997
  {
998
  "epoch": 0.6183664051684357,
999
+ "grad_norm": 1.361475388045652,
1000
  "learning_rate": 7.635762185787868e-06,
1001
+ "loss": 0.9773,
1002
  "step": 670
1003
  },
1004
  {
1005
  "epoch": 0.6229810798338717,
1006
+ "grad_norm": 1.3634924688905254,
1007
  "learning_rate": 7.479375779897379e-06,
1008
+ "loss": 0.9747,
1009
  "step": 675
1010
  },
1011
  {
1012
  "epoch": 0.6275957544993078,
1013
+ "grad_norm": 1.3732265984949414,
1014
  "learning_rate": 7.3236449462123315e-06,
1015
+ "loss": 0.9678,
1016
  "step": 680
1017
  },
1018
  {
1019
  "epoch": 0.6322104291647439,
1020
+ "grad_norm": 1.4464461120602612,
1021
  "learning_rate": 7.168610187718164e-06,
1022
+ "loss": 0.9662,
1023
  "step": 685
1024
  },
1025
  {
1026
  "epoch": 0.63682510383018,
1027
+ "grad_norm": 1.3931117990795983,
1028
  "learning_rate": 7.014311826362804e-06,
1029
+ "loss": 0.9641,
1030
  "step": 690
1031
  },
1032
  {
1033
  "epoch": 0.6414397784956161,
1034
+ "grad_norm": 1.366546097704984,
1035
  "learning_rate": 6.860789992569601e-06,
1036
+ "loss": 0.9787,
1037
  "step": 695
1038
  },
1039
  {
1040
  "epoch": 0.6460544531610521,
1041
+ "grad_norm": 1.3945778923545584,
1042
  "learning_rate": 6.708084614800065e-06,
1043
+ "loss": 0.9441,
1044
  "step": 700
1045
  },
1046
  {
1047
  "epoch": 0.6460544531610521,
1048
+ "eval_loss": 0.9769104719161987,
1049
+ "eval_runtime": 633.9092,
1050
+ "eval_samples_per_second": 24.215,
1051
+ "eval_steps_per_second": 0.189,
1052
  "step": 700
1053
  },
1054
  {
1055
  "epoch": 0.6506691278264882,
1056
+ "grad_norm": 1.364918482537208,
1057
  "learning_rate": 6.556235409169154e-06,
1058
+ "loss": 0.9437,
1059
  "step": 705
1060
  },
1061
  {
1062
  "epoch": 0.6552838024919243,
1063
+ "grad_norm": 1.3330832614943129,
1064
  "learning_rate": 6.405281869115768e-06,
1065
+ "loss": 0.9482,
1066
  "step": 710
1067
  },
1068
  {
1069
  "epoch": 0.6598984771573604,
1070
+ "grad_norm": 1.3291401175998692,
1071
  "learning_rate": 6.255263255131172e-06,
1072
+ "loss": 0.9646,
1073
  "step": 715
1074
  },
1075
  {
1076
  "epoch": 0.6645131518227965,
1077
+ "grad_norm": 1.3661394031338707,
1078
  "learning_rate": 6.106218584547992e-06,
1079
+ "loss": 0.9649,
1080
  "step": 720
1081
  },
1082
  {
1083
  "epoch": 0.6691278264882325,
1084
+ "grad_norm": 1.3117340443959773,
1085
  "learning_rate": 5.9581866213924656e-06,
1086
+ "loss": 0.9525,
1087
  "step": 725
1088
  },
1089
  {
1090
  "epoch": 0.6737425011536686,
1091
+ "grad_norm": 1.364658394013176,
1092
  "learning_rate": 5.811205866302571e-06,
1093
+ "loss": 0.9516,
1094
  "step": 730
1095
  },
1096
  {
1097
  "epoch": 0.6783571758191047,
1098
+ "grad_norm": 1.331994492768848,
1099
  "learning_rate": 5.665314546514633e-06,
1100
+ "loss": 0.954,
1101
  "step": 735
1102
  },
1103
  {
1104
  "epoch": 0.6829718504845408,
1105
+ "grad_norm": 1.3743467262940992,
1106
  "learning_rate": 5.520550605921091e-06,
1107
+ "loss": 0.9554,
1108
  "step": 740
1109
  },
1110
  {
1111
  "epoch": 0.687586525149977,
1112
+ "grad_norm": 1.3312291076208118,
1113
  "learning_rate": 5.376951695201894e-06,
1114
+ "loss": 0.9565,
1115
  "step": 745
1116
  },
1117
  {
1118
  "epoch": 0.6922011998154131,
1119
+ "grad_norm": 1.3832998972367352,
1120
  "learning_rate": 5.234555162032221e-06,
1121
+ "loss": 0.9475,
1122
  "step": 750
1123
  },
1124
  {
1125
  "epoch": 0.6968158744808491,
1126
+ "grad_norm": 1.389562727942595,
1127
  "learning_rate": 5.093398041368942e-06,
1128
+ "loss": 0.9574,
1129
  "step": 755
1130
  },
1131
  {
1132
  "epoch": 0.7014305491462852,
1133
+ "grad_norm": 1.3714452844531986,
1134
  "learning_rate": 4.9535170458184735e-06,
1135
+ "loss": 0.9581,
1136
  "step": 760
1137
  },
1138
  {
1139
  "epoch": 0.7060452238117213,
1140
+ "grad_norm": 1.3477889477630838,
1141
  "learning_rate": 4.81494855608843e-06,
1142
+ "loss": 0.9561,
1143
  "step": 765
1144
  },
1145
  {
1146
  "epoch": 0.7106598984771574,
1147
+ "grad_norm": 1.4491931180376743,
1148
  "learning_rate": 4.677728611525605e-06,
1149
+ "loss": 0.9512,
1150
  "step": 770
1151
  },
1152
  {
1153
  "epoch": 0.7152745731425935,
1154
+ "grad_norm": 1.3241497550464327,
1155
  "learning_rate": 4.541892900742757e-06,
1156
+ "loss": 0.9422,
1157
  "step": 775
1158
  },
1159
  {
1160
  "epoch": 0.7198892478080295,
1161
+ "grad_norm": 1.314421280157553,
1162
  "learning_rate": 4.407476752336575e-06,
1163
+ "loss": 0.943,
1164
  "step": 780
1165
  },
1166
  {
1167
  "epoch": 0.7245039224734656,
1168
+ "grad_norm": 1.2755970945876594,
1169
  "learning_rate": 4.2745151256993325e-06,
1170
+ "loss": 0.9426,
1171
  "step": 785
1172
  },
1173
  {
1174
  "epoch": 0.7291185971389017,
1175
+ "grad_norm": 1.332124542587031,
1176
  "learning_rate": 4.143042601926492e-06,
1177
+ "loss": 0.9533,
1178
  "step": 790
1179
  },
1180
  {
1181
  "epoch": 0.7337332718043378,
1182
+ "grad_norm": 1.3708413423330084,
1183
  "learning_rate": 4.013093374822789e-06,
1184
+ "loss": 0.9374,
1185
  "step": 795
1186
  },
1187
  {
1188
  "epoch": 0.7383479464697739,
1189
+ "grad_norm": 1.27203160584856,
1190
  "learning_rate": 3.884701242008949e-06,
1191
+ "loss": 0.9277,
1192
  "step": 800
1193
  },
1194
  {
1195
  "epoch": 0.7383479464697739,
1196
+ "eval_loss": 0.9612703323364258,
1197
+ "eval_runtime": 651.9955,
1198
+ "eval_samples_per_second": 23.543,
1199
+ "eval_steps_per_second": 0.184,
1200
  "step": 800
1201
  },
1202
  {
1203
  "epoch": 0.7429626211352099,
1204
+ "grad_norm": 1.31377359076841,
1205
  "learning_rate": 3.757899596131529e-06,
1206
+ "loss": 0.9611,
1207
  "step": 805
1208
  },
1209
  {
1210
  "epoch": 0.747577295800646,
1211
+ "grad_norm": 1.2975998004112579,
1212
  "learning_rate": 3.6327214161780287e-06,
1213
+ "loss": 0.9798,
1214
  "step": 810
1215
  },
1216
  {
1217
  "epoch": 0.7521919704660821,
1218
+ "grad_norm": 1.3894464184722144,
1219
  "learning_rate": 3.5091992588996026e-06,
1220
+ "loss": 0.9567,
1221
  "step": 815
1222
  },
1223
  {
1224
  "epoch": 0.7568066451315182,
1225
+ "grad_norm": 1.282473554343769,
1226
  "learning_rate": 3.387365250343615e-06,
1227
+ "loss": 0.954,
1228
  "step": 820
1229
  },
1230
  {
1231
  "epoch": 0.7614213197969543,
1232
+ "grad_norm": 1.3647805901178591,
1233
  "learning_rate": 3.2672510774981692e-06,
1234
+ "loss": 0.9361,
1235
  "step": 825
1236
  },
1237
  {
1238
  "epoch": 0.7660359944623903,
1239
+ "grad_norm": 1.347888877348781,
1240
  "learning_rate": 3.148887980050872e-06,
1241
+ "loss": 0.9432,
1242
  "step": 830
1243
  },
1244
  {
1245
  "epoch": 0.7706506691278265,
1246
+ "grad_norm": 1.377090549096584,
1247
  "learning_rate": 3.032306742263891e-06,
1248
+ "loss": 0.9519,
1249
  "step": 835
1250
  },
1251
  {
1252
  "epoch": 0.7752653437932626,
1253
+ "grad_norm": 1.287471382339416,
1254
  "learning_rate": 2.9175376849675076e-06,
1255
+ "loss": 0.9607,
1256
  "step": 840
1257
  },
1258
  {
1259
  "epoch": 0.7798800184586987,
1260
+ "grad_norm": 1.4070979616878627,
1261
  "learning_rate": 2.8046106576741605e-06,
1262
+ "loss": 0.929,
1263
  "step": 845
1264
  },
1265
  {
1266
  "epoch": 0.7844946931241348,
1267
+ "grad_norm": 1.296812382608952,
1268
  "learning_rate": 2.693555030815085e-06,
1269
+ "loss": 0.9383,
1270
  "step": 850
1271
  },
1272
  {
1273
  "epoch": 0.7891093677895709,
1274
+ "grad_norm": 1.345957217872087,
1275
  "learning_rate": 2.5843996881015676e-06,
1276
+ "loss": 0.9378,
1277
  "step": 855
1278
  },
1279
  {
1280
  "epoch": 0.7937240424550069,
1281
+ "grad_norm": 1.3218142306826084,
1282
  "learning_rate": 2.4771730190127616e-06,
1283
+ "loss": 0.9353,
1284
  "step": 860
1285
  },
1286
  {
1287
  "epoch": 0.798338717120443,
1288
+ "grad_norm": 1.2932161650654428,
1289
  "learning_rate": 2.3719029114120716e-06,
1290
+ "loss": 0.9333,
1291
  "step": 865
1292
  },
1293
  {
1294
  "epoch": 0.8029533917858791,
1295
+ "grad_norm": 1.36407173107211,
1296
  "learning_rate": 2.2686167442939733e-06,
1297
+ "loss": 0.9401,
1298
  "step": 870
1299
  },
1300
  {
1301
  "epoch": 0.8075680664513152,
1302
+ "grad_norm": 1.302064737279862,
1303
  "learning_rate": 2.1673413806632104e-06,
1304
+ "loss": 0.939,
1305
  "step": 875
1306
  },
1307
  {
1308
  "epoch": 0.8121827411167513,
1309
+ "grad_norm": 1.3229653318762729,
1310
  "learning_rate": 2.0681031605481563e-06,
1311
+ "loss": 0.9355,
1312
  "step": 880
1313
  },
1314
  {
1315
  "epoch": 0.8167974157821873,
1316
+ "grad_norm": 1.3252223231469167,
1317
  "learning_rate": 1.9709278941502363e-06,
1318
+ "loss": 0.9344,
1319
  "step": 885
1320
  },
1321
  {
1322
  "epoch": 0.8214120904476234,
1323
+ "grad_norm": 1.3270338774644677,
1324
  "learning_rate": 1.8758408551311048e-06,
1325
+ "loss": 0.9321,
1326
  "step": 890
1327
  },
1328
  {
1329
  "epoch": 0.8260267651130595,
1330
+ "grad_norm": 1.3116552675081934,
1331
  "learning_rate": 1.7828667740394045e-06,
1332
+ "loss": 0.9513,
1333
  "step": 895
1334
  },
1335
  {
1336
  "epoch": 0.8306414397784956,
1337
+ "grad_norm": 1.2617601634166857,
1338
  "learning_rate": 1.6920298318787532e-06,
1339
+ "loss": 0.9384,
1340
  "step": 900
1341
  },
1342
  {
1343
  "epoch": 0.8306414397784956,
1344
+ "eval_loss": 0.9501336812973022,
1345
+ "eval_runtime": 633.7004,
1346
+ "eval_samples_per_second": 24.223,
1347
+ "eval_steps_per_second": 0.189,
1348
  "step": 900
1349
  },
1350
  {
1351
  "epoch": 0.8352561144439317,
1352
+ "grad_norm": 1.3036810903046379,
1353
  "learning_rate": 1.6033536538186778e-06,
1354
+ "loss": 0.9363,
1355
  "step": 905
1356
  },
1357
  {
1358
  "epoch": 0.8398707891093677,
1359
+ "grad_norm": 1.3184782817538288,
1360
  "learning_rate": 1.5168613030500922e-06,
1361
+ "loss": 0.9254,
1362
  "step": 910
1363
  },
1364
  {
1365
  "epoch": 0.8444854637748038,
1366
+ "grad_norm": 1.3311174144545366,
1367
  "learning_rate": 1.4325752747869626e-06,
1368
+ "loss": 0.9401,
1369
  "step": 915
1370
  },
1371
  {
1372
  "epoch": 0.84910013844024,
1373
+ "grad_norm": 1.2485403971692124,
1374
  "learning_rate": 1.3505174904156593e-06,
1375
+ "loss": 0.9305,
1376
  "step": 920
1377
  },
1378
  {
1379
  "epoch": 0.8537148131056761,
1380
+ "grad_norm": 1.302635001110673,
1381
  "learning_rate": 1.2707092917935914e-06,
1382
+ "loss": 0.9393,
1383
  "step": 925
1384
  },
1385
  {
1386
  "epoch": 0.8583294877711122,
1387
+ "grad_norm": 1.3633354249041523,
1388
  "learning_rate": 1.1931714356985257e-06,
1389
+ "loss": 0.9312,
1390
  "step": 930
1391
  },
1392
  {
1393
  "epoch": 0.8629441624365483,
1394
+ "grad_norm": 1.2815769915508204,
1395
  "learning_rate": 1.1179240884301158e-06,
1396
+ "loss": 0.9217,
1397
  "step": 935
1398
  },
1399
  {
1400
  "epoch": 0.8675588371019843,
1401
+ "grad_norm": 1.3419818473322924,
1402
  "learning_rate": 1.0449868205649648e-06,
1403
+ "loss": 0.9168,
1404
  "step": 940
1405
  },
1406
  {
1407
  "epoch": 0.8721735117674204,
1408
+ "grad_norm": 1.3006753146842553,
1409
  "learning_rate": 9.74378601866669e-07,
1410
+ "loss": 0.9413,
1411
  "step": 945
1412
  },
1413
  {
1414
  "epoch": 0.8767881864328565,
1415
+ "grad_norm": 1.275285958470618,
1416
  "learning_rate": 9.061177963520751e-07,
1417
+ "loss": 0.9446,
1418
  "step": 950
1419
  },
1420
  {
1421
  "epoch": 0.8814028610982926,
1422
+ "grad_norm": 1.2680303522716787,
1423
  "learning_rate": 8.402221575151238e-07,
1424
+ "loss": 0.9161,
1425
  "step": 955
1426
  },
1427
  {
1428
  "epoch": 0.8860175357637287,
1429
+ "grad_norm": 1.2495043326934117,
1430
  "learning_rate": 7.767088237094578e-07,
1431
+ "loss": 0.9333,
1432
  "step": 960
1433
  },
1434
  {
1435
  "epoch": 0.8906322104291647,
1436
+ "grad_norm": 1.3345940160548069,
1437
  "learning_rate": 7.155943136910193e-07,
1438
+ "loss": 0.9353,
1439
  "step": 965
1440
  },
1441
  {
1442
  "epoch": 0.8952468850946008,
1443
+ "grad_norm": 1.3252439901574087,
1444
  "learning_rate": 6.568945223218048e-07,
1445
+ "loss": 0.9381,
1446
  "step": 970
1447
  },
1448
  {
1449
  "epoch": 0.8998615597600369,
1450
+ "grad_norm": 1.2795828639710098,
1451
  "learning_rate": 6.00624716435868e-07,
1452
+ "loss": 0.9199,
1453
  "step": 975
1454
  },
1455
  {
1456
  "epoch": 0.904476234425473,
1457
+ "grad_norm": 1.3413550832303935,
1458
  "learning_rate": 5.467995308686813e-07,
1459
+ "loss": 0.94,
1460
  "step": 980
1461
  },
1462
  {
1463
  "epoch": 0.9090909090909091,
1464
+ "grad_norm": 1.3436022079621268,
1465
  "learning_rate": 4.954329646508505e-07,
1466
+ "loss": 0.9313,
1467
  "step": 985
1468
  },
1469
  {
1470
  "epoch": 0.9137055837563451,
1471
+ "grad_norm": 1.3709638383836422,
1472
  "learning_rate": 4.4653837736721273e-07,
1473
+ "loss": 0.9346,
1474
  "step": 990
1475
  },
1476
  {
1477
  "epoch": 0.9183202584217812,
1478
+ "grad_norm": 1.2668090185362362,
1479
  "learning_rate": 4.001284856822174e-07,
1480
+ "loss": 0.9408,
1481
  "step": 995
1482
  },
1483
  {
1484
  "epoch": 0.9229349330872173,
1485
+ "grad_norm": 1.2697339769613498,
1486
  "learning_rate": 3.562153600325491e-07,
1487
+ "loss": 0.9216,
1488
  "step": 1000
1489
  },
1490
  {
1491
  "epoch": 0.9229349330872173,
1492
+ "eval_loss": 0.9448357224464417,
1493
+ "eval_runtime": 634.4549,
1494
+ "eval_samples_per_second": 24.194,
1495
+ "eval_steps_per_second": 0.189,
1496
  "step": 1000
1497
  },
1498
  {
1499
  "epoch": 0.9275496077526535,
1500
+ "grad_norm": 1.281655332093928,
1501
  "learning_rate": 3.1481042148779674e-07,
1502
+ "loss": 0.9399,
1503
  "step": 1005
1504
  },
1505
  {
1506
  "epoch": 0.9321642824180896,
1507
+ "grad_norm": 1.3600956214897377,
1508
  "learning_rate": 2.7592443878003196e-07,
1509
+ "loss": 0.9408,
1510
  "step": 1010
1511
  },
1512
  {
1513
  "epoch": 0.9367789570835257,
1514
+ "grad_norm": 1.3158687412306498,
1515
  "learning_rate": 2.395675255030383e-07,
1516
+ "loss": 0.9227,
1517
  "step": 1015
1518
  },
1519
  {
1520
  "epoch": 0.9413936317489617,
1521
+ "grad_norm": 1.321042213237677,
1522
  "learning_rate": 2.057491374819365e-07,
1523
+ "loss": 0.9251,
1524
  "step": 1020
1525
  },
1526
  {
1527
  "epoch": 0.9460083064143978,
1528
+ "grad_norm": 1.2706676798657595,
1529
  "learning_rate": 1.7447807031388264e-07,
1530
+ "loss": 0.9332,
1531
  "step": 1025
1532
  },
1533
  {
1534
  "epoch": 0.9506229810798339,
1535
+ "grad_norm": 1.3025080688365438,
1536
  "learning_rate": 1.457624570804772e-07,
1537
+ "loss": 0.9171,
1538
  "step": 1030
1539
  },
1540
  {
1541
  "epoch": 0.95523765574527,
1542
+ "grad_norm": 1.3183940856356338,
1543
  "learning_rate": 1.196097662324902e-07,
1544
+ "loss": 0.9394,
1545
  "step": 1035
1546
  },
1547
  {
1548
  "epoch": 0.9598523304107061,
1549
+ "grad_norm": 1.2959502551091049,
1550
  "learning_rate": 9.602679964744288e-08,
1551
+ "loss": 0.9171,
1552
  "step": 1040
1553
  },
1554
  {
1555
  "epoch": 0.9644670050761421,
1556
+ "grad_norm": 1.3183746243018815,
1557
  "learning_rate": 7.501969086054717e-08,
1558
+ "loss": 0.9328,
1559
  "step": 1045
1560
  },
1561
  {
1562
  "epoch": 0.9690816797415782,
1563
+ "grad_norm": 1.3307943534831945,
1564
  "learning_rate": 5.659390346948179e-08,
1565
+ "loss": 0.9424,
1566
  "step": 1050
1567
  },
1568
  {
1569
  "epoch": 0.9736963544070143,
1570
+ "grad_norm": 1.2730525780758684,
1571
  "learning_rate": 4.075422971340115e-08,
1572
+ "loss": 0.9402,
1573
  "step": 1055
1574
  },
1575
  {
1576
  "epoch": 0.9783110290724504,
1577
+ "grad_norm": 1.3093625889696177,
1578
  "learning_rate": 2.7504789226548977e-08,
1579
+ "loss": 0.9275,
1580
  "step": 1060
1581
  },
1582
  {
1583
  "epoch": 0.9829257037378865,
1584
+ "grad_norm": 1.2514296608879,
1585
  "learning_rate": 1.6849027966816535e-08,
1586
+ "loss": 0.9269,
1587
  "step": 1065
1588
  },
1589
  {
1590
  "epoch": 0.9875403784033225,
1591
+ "grad_norm": 1.273379135333167,
1592
  "learning_rate": 8.789717319505065e-09,
1593
+ "loss": 0.9362,
1594
  "step": 1070
1595
  },
1596
  {
1597
  "epoch": 0.9921550530687586,
1598
+ "grad_norm": 1.2811332538414983,
1599
  "learning_rate": 3.328953376530164e-09,
1600
+ "loss": 0.9313,
1601
  "step": 1075
1602
  },
1603
  {
1604
  "epoch": 0.9967697277341947,
1605
+ "grad_norm": 1.3117234277097758,
1606
  "learning_rate": 4.681563912700693e-10,
1607
+ "loss": 0.9204,
1608
  "step": 1080
1609
  },
1610
  {
1611
  "epoch": 0.9995385325334564,
1612
  "step": 1083,
1613
  "total_flos": 453306954547200.0,
1614
+ "train_loss": 0.986508995762382,
1615
+ "train_runtime": 33955.1767,
1616
+ "train_samples_per_second": 4.084,
1617
+ "train_steps_per_second": 0.032
1618
  }
1619
  ],
1620
  "logging_steps": 5,