jerseyjerry commited on
Commit
105b17c
·
verified ·
1 Parent(s): af26ccc

Delete trainer_state.json

Browse files
Files changed (1) hide show
  1. trainer_state.json +0 -3693
trainer_state.json DELETED
@@ -1,3693 +0,0 @@
1
- {
2
- "best_metric": null,
3
- "best_model_checkpoint": null,
4
- "epoch": 1.00100150225338,
5
- "eval_steps": 50,
6
- "global_step": 1000,
7
- "is_hyper_param_search": false,
8
- "is_local_process_zero": true,
9
- "is_world_process_zero": true,
10
- "log_history": [
11
- {
12
- "epoch": 0.00200300450676014,
13
- "grad_norm": 1.1490445137023926,
14
- "learning_rate": 6.666666666666667e-07,
15
- "loss": 1.2042,
16
- "step": 2
17
- },
18
- {
19
- "epoch": 0.00400600901352028,
20
- "grad_norm": 1.0771784782409668,
21
- "learning_rate": 1.3333333333333334e-06,
22
- "loss": 1.307,
23
- "step": 4
24
- },
25
- {
26
- "epoch": 0.006009013520280421,
27
- "grad_norm": 1.569467306137085,
28
- "learning_rate": 2.0000000000000003e-06,
29
- "loss": 1.3554,
30
- "step": 6
31
- },
32
- {
33
- "epoch": 0.00801201802704056,
34
- "grad_norm": 1.3319200277328491,
35
- "learning_rate": 2.666666666666667e-06,
36
- "loss": 1.293,
37
- "step": 8
38
- },
39
- {
40
- "epoch": 0.010015022533800702,
41
- "grad_norm": 1.0480939149856567,
42
- "learning_rate": 3.3333333333333333e-06,
43
- "loss": 1.2224,
44
- "step": 10
45
- },
46
- {
47
- "epoch": 0.012018027040560842,
48
- "grad_norm": 1.1433207988739014,
49
- "learning_rate": 4.000000000000001e-06,
50
- "loss": 1.2897,
51
- "step": 12
52
- },
53
- {
54
- "epoch": 0.014021031547320982,
55
- "grad_norm": 1.073263168334961,
56
- "learning_rate": 4.666666666666667e-06,
57
- "loss": 1.2234,
58
- "step": 14
59
- },
60
- {
61
- "epoch": 0.01602403605408112,
62
- "grad_norm": 1.4199696779251099,
63
- "learning_rate": 5.333333333333334e-06,
64
- "loss": 1.2185,
65
- "step": 16
66
- },
67
- {
68
- "epoch": 0.018027040560841263,
69
- "grad_norm": 1.0688554048538208,
70
- "learning_rate": 6e-06,
71
- "loss": 1.1962,
72
- "step": 18
73
- },
74
- {
75
- "epoch": 0.020030045067601403,
76
- "grad_norm": 0.7635518312454224,
77
- "learning_rate": 6.666666666666667e-06,
78
- "loss": 1.2015,
79
- "step": 20
80
- },
81
- {
82
- "epoch": 0.022033049574361543,
83
- "grad_norm": 0.898193895816803,
84
- "learning_rate": 7.333333333333334e-06,
85
- "loss": 1.2538,
86
- "step": 22
87
- },
88
- {
89
- "epoch": 0.024036054081121683,
90
- "grad_norm": 0.5049487948417664,
91
- "learning_rate": 8.000000000000001e-06,
92
- "loss": 1.1567,
93
- "step": 24
94
- },
95
- {
96
- "epoch": 0.026039058587881823,
97
- "grad_norm": 0.5218433141708374,
98
- "learning_rate": 8.666666666666668e-06,
99
- "loss": 1.1553,
100
- "step": 26
101
- },
102
- {
103
- "epoch": 0.028042063094641963,
104
- "grad_norm": 0.6712301969528198,
105
- "learning_rate": 9.333333333333334e-06,
106
- "loss": 1.2026,
107
- "step": 28
108
- },
109
- {
110
- "epoch": 0.030045067601402103,
111
- "grad_norm": 0.5391427278518677,
112
- "learning_rate": 1e-05,
113
- "loss": 1.2703,
114
- "step": 30
115
- },
116
- {
117
- "epoch": 0.03204807210816224,
118
- "grad_norm": 0.462812602519989,
119
- "learning_rate": 1.0666666666666667e-05,
120
- "loss": 1.1924,
121
- "step": 32
122
- },
123
- {
124
- "epoch": 0.03405107661492238,
125
- "grad_norm": 0.6354833245277405,
126
- "learning_rate": 1.1333333333333334e-05,
127
- "loss": 1.1828,
128
- "step": 34
129
- },
130
- {
131
- "epoch": 0.03605408112168253,
132
- "grad_norm": 0.41658806800842285,
133
- "learning_rate": 1.2e-05,
134
- "loss": 1.0911,
135
- "step": 36
136
- },
137
- {
138
- "epoch": 0.03805708562844266,
139
- "grad_norm": 0.40210819244384766,
140
- "learning_rate": 1.2666666666666668e-05,
141
- "loss": 1.1088,
142
- "step": 38
143
- },
144
- {
145
- "epoch": 0.04006009013520281,
146
- "grad_norm": 0.4195331931114197,
147
- "learning_rate": 1.3333333333333333e-05,
148
- "loss": 1.1107,
149
- "step": 40
150
- },
151
- {
152
- "epoch": 0.04206309464196294,
153
- "grad_norm": 0.4773981273174286,
154
- "learning_rate": 1.4000000000000001e-05,
155
- "loss": 1.0895,
156
- "step": 42
157
- },
158
- {
159
- "epoch": 0.04406609914872309,
160
- "grad_norm": 0.44729089736938477,
161
- "learning_rate": 1.4666666666666668e-05,
162
- "loss": 1.1064,
163
- "step": 44
164
- },
165
- {
166
- "epoch": 0.04606910365548322,
167
- "grad_norm": 0.4262336492538452,
168
- "learning_rate": 1.5333333333333334e-05,
169
- "loss": 1.1339,
170
- "step": 46
171
- },
172
- {
173
- "epoch": 0.04807210816224337,
174
- "grad_norm": 0.48148858547210693,
175
- "learning_rate": 1.6000000000000003e-05,
176
- "loss": 1.0981,
177
- "step": 48
178
- },
179
- {
180
- "epoch": 0.0500751126690035,
181
- "grad_norm": 0.39283275604248047,
182
- "learning_rate": 1.6666666666666667e-05,
183
- "loss": 1.093,
184
- "step": 50
185
- },
186
- {
187
- "epoch": 0.0500751126690035,
188
- "eval_loss": 1.2894173860549927,
189
- "eval_runtime": 3.763,
190
- "eval_samples_per_second": 15.413,
191
- "eval_steps_per_second": 7.707,
192
- "step": 50
193
- },
194
- {
195
- "epoch": 0.05207811717576365,
196
- "grad_norm": 0.37658512592315674,
197
- "learning_rate": 1.7333333333333336e-05,
198
- "loss": 1.139,
199
- "step": 52
200
- },
201
- {
202
- "epoch": 0.05408112168252378,
203
- "grad_norm": 0.5392587184906006,
204
- "learning_rate": 1.8e-05,
205
- "loss": 1.125,
206
- "step": 54
207
- },
208
- {
209
- "epoch": 0.05608412618928393,
210
- "grad_norm": 0.4281522333621979,
211
- "learning_rate": 1.866666666666667e-05,
212
- "loss": 1.142,
213
- "step": 56
214
- },
215
- {
216
- "epoch": 0.05808713069604406,
217
- "grad_norm": 0.3900790512561798,
218
- "learning_rate": 1.9333333333333333e-05,
219
- "loss": 1.0687,
220
- "step": 58
221
- },
222
- {
223
- "epoch": 0.06009013520280421,
224
- "grad_norm": 0.43412598967552185,
225
- "learning_rate": 2e-05,
226
- "loss": 1.0266,
227
- "step": 60
228
- },
229
- {
230
- "epoch": 0.06209313970956434,
231
- "grad_norm": 0.35002750158309937,
232
- "learning_rate": 2.0666666666666666e-05,
233
- "loss": 1.0595,
234
- "step": 62
235
- },
236
- {
237
- "epoch": 0.06409614421632448,
238
- "grad_norm": 0.4777143597602844,
239
- "learning_rate": 2.1333333333333335e-05,
240
- "loss": 1.1136,
241
- "step": 64
242
- },
243
- {
244
- "epoch": 0.06609914872308463,
245
- "grad_norm": 0.49310263991355896,
246
- "learning_rate": 2.2000000000000003e-05,
247
- "loss": 1.0185,
248
- "step": 66
249
- },
250
- {
251
- "epoch": 0.06810215322984477,
252
- "grad_norm": 0.449856698513031,
253
- "learning_rate": 2.2666666666666668e-05,
254
- "loss": 1.1279,
255
- "step": 68
256
- },
257
- {
258
- "epoch": 0.0701051577366049,
259
- "grad_norm": 0.38826239109039307,
260
- "learning_rate": 2.3333333333333336e-05,
261
- "loss": 1.0885,
262
- "step": 70
263
- },
264
- {
265
- "epoch": 0.07210816224336505,
266
- "grad_norm": 0.4807354509830475,
267
- "learning_rate": 2.4e-05,
268
- "loss": 1.0903,
269
- "step": 72
270
- },
271
- {
272
- "epoch": 0.07411116675012519,
273
- "grad_norm": 0.4949500262737274,
274
- "learning_rate": 2.466666666666667e-05,
275
- "loss": 1.127,
276
- "step": 74
277
- },
278
- {
279
- "epoch": 0.07611417125688533,
280
- "grad_norm": 0.3626649081707001,
281
- "learning_rate": 2.5333333333333337e-05,
282
- "loss": 1.0255,
283
- "step": 76
284
- },
285
- {
286
- "epoch": 0.07811717576364546,
287
- "grad_norm": 0.5750380754470825,
288
- "learning_rate": 2.6000000000000002e-05,
289
- "loss": 1.1275,
290
- "step": 78
291
- },
292
- {
293
- "epoch": 0.08012018027040561,
294
- "grad_norm": 0.39814862608909607,
295
- "learning_rate": 2.6666666666666667e-05,
296
- "loss": 1.0341,
297
- "step": 80
298
- },
299
- {
300
- "epoch": 0.08212318477716575,
301
- "grad_norm": 0.4639066457748413,
302
- "learning_rate": 2.733333333333333e-05,
303
- "loss": 1.0229,
304
- "step": 82
305
- },
306
- {
307
- "epoch": 0.08412618928392589,
308
- "grad_norm": 0.4696304500102997,
309
- "learning_rate": 2.8000000000000003e-05,
310
- "loss": 0.9657,
311
- "step": 84
312
- },
313
- {
314
- "epoch": 0.08612919379068602,
315
- "grad_norm": 0.4721640646457672,
316
- "learning_rate": 2.8666666666666668e-05,
317
- "loss": 1.0449,
318
- "step": 86
319
- },
320
- {
321
- "epoch": 0.08813219829744617,
322
- "grad_norm": 0.538497805595398,
323
- "learning_rate": 2.9333333333333336e-05,
324
- "loss": 1.0298,
325
- "step": 88
326
- },
327
- {
328
- "epoch": 0.09013520280420631,
329
- "grad_norm": 0.4559970498085022,
330
- "learning_rate": 3e-05,
331
- "loss": 1.1037,
332
- "step": 90
333
- },
334
- {
335
- "epoch": 0.09213820731096645,
336
- "grad_norm": 0.5490939617156982,
337
- "learning_rate": 3.066666666666667e-05,
338
- "loss": 1.0027,
339
- "step": 92
340
- },
341
- {
342
- "epoch": 0.09414121181772658,
343
- "grad_norm": 0.45646870136260986,
344
- "learning_rate": 3.1333333333333334e-05,
345
- "loss": 0.9897,
346
- "step": 94
347
- },
348
- {
349
- "epoch": 0.09614421632448673,
350
- "grad_norm": 0.43321868777275085,
351
- "learning_rate": 3.2000000000000005e-05,
352
- "loss": 1.0761,
353
- "step": 96
354
- },
355
- {
356
- "epoch": 0.09814722083124687,
357
- "grad_norm": 0.5118622183799744,
358
- "learning_rate": 3.266666666666667e-05,
359
- "loss": 1.02,
360
- "step": 98
361
- },
362
- {
363
- "epoch": 0.100150225338007,
364
- "grad_norm": 0.496593177318573,
365
- "learning_rate": 3.3333333333333335e-05,
366
- "loss": 1.0625,
367
- "step": 100
368
- },
369
- {
370
- "epoch": 0.100150225338007,
371
- "eval_loss": 1.240967035293579,
372
- "eval_runtime": 3.786,
373
- "eval_samples_per_second": 15.32,
374
- "eval_steps_per_second": 7.66,
375
- "step": 100
376
- },
377
- {
378
- "epoch": 0.10215322984476716,
379
- "grad_norm": 0.4841687083244324,
380
- "learning_rate": 3.4000000000000007e-05,
381
- "loss": 1.0166,
382
- "step": 102
383
- },
384
- {
385
- "epoch": 0.1041562343515273,
386
- "grad_norm": 0.5562867522239685,
387
- "learning_rate": 3.466666666666667e-05,
388
- "loss": 1.0716,
389
- "step": 104
390
- },
391
- {
392
- "epoch": 0.10615923885828743,
393
- "grad_norm": 0.5093795657157898,
394
- "learning_rate": 3.5333333333333336e-05,
395
- "loss": 1.0912,
396
- "step": 106
397
- },
398
- {
399
- "epoch": 0.10816224336504757,
400
- "grad_norm": 0.4446066915988922,
401
- "learning_rate": 3.6e-05,
402
- "loss": 1.0152,
403
- "step": 108
404
- },
405
- {
406
- "epoch": 0.11016524787180772,
407
- "grad_norm": 0.518335223197937,
408
- "learning_rate": 3.6666666666666666e-05,
409
- "loss": 1.0098,
410
- "step": 110
411
- },
412
- {
413
- "epoch": 0.11216825237856785,
414
- "grad_norm": 0.47020334005355835,
415
- "learning_rate": 3.733333333333334e-05,
416
- "loss": 1.0347,
417
- "step": 112
418
- },
419
- {
420
- "epoch": 0.11417125688532799,
421
- "grad_norm": 0.5809981226921082,
422
- "learning_rate": 3.8e-05,
423
- "loss": 1.0242,
424
- "step": 114
425
- },
426
- {
427
- "epoch": 0.11617426139208813,
428
- "grad_norm": 0.49666646122932434,
429
- "learning_rate": 3.866666666666667e-05,
430
- "loss": 1.053,
431
- "step": 116
432
- },
433
- {
434
- "epoch": 0.11817726589884828,
435
- "grad_norm": 0.47094520926475525,
436
- "learning_rate": 3.933333333333333e-05,
437
- "loss": 1.0258,
438
- "step": 118
439
- },
440
- {
441
- "epoch": 0.12018027040560841,
442
- "grad_norm": 0.5577300786972046,
443
- "learning_rate": 4e-05,
444
- "loss": 1.0197,
445
- "step": 120
446
- },
447
- {
448
- "epoch": 0.12218327491236855,
449
- "grad_norm": 0.5453508496284485,
450
- "learning_rate": 4.066666666666667e-05,
451
- "loss": 0.9842,
452
- "step": 122
453
- },
454
- {
455
- "epoch": 0.12418627941912869,
456
- "grad_norm": 0.5353218913078308,
457
- "learning_rate": 4.133333333333333e-05,
458
- "loss": 1.1579,
459
- "step": 124
460
- },
461
- {
462
- "epoch": 0.12618928392588882,
463
- "grad_norm": 0.617546021938324,
464
- "learning_rate": 4.2e-05,
465
- "loss": 1.0052,
466
- "step": 126
467
- },
468
- {
469
- "epoch": 0.12819228843264896,
470
- "grad_norm": 0.48849716782569885,
471
- "learning_rate": 4.266666666666667e-05,
472
- "loss": 1.0416,
473
- "step": 128
474
- },
475
- {
476
- "epoch": 0.13019529293940912,
477
- "grad_norm": 0.5549625754356384,
478
- "learning_rate": 4.3333333333333334e-05,
479
- "loss": 1.0562,
480
- "step": 130
481
- },
482
- {
483
- "epoch": 0.13219829744616926,
484
- "grad_norm": 0.6010375618934631,
485
- "learning_rate": 4.4000000000000006e-05,
486
- "loss": 1.046,
487
- "step": 132
488
- },
489
- {
490
- "epoch": 0.1342013019529294,
491
- "grad_norm": 0.481374591588974,
492
- "learning_rate": 4.466666666666667e-05,
493
- "loss": 1.0136,
494
- "step": 134
495
- },
496
- {
497
- "epoch": 0.13620430645968953,
498
- "grad_norm": 0.4886944591999054,
499
- "learning_rate": 4.5333333333333335e-05,
500
- "loss": 0.9658,
501
- "step": 136
502
- },
503
- {
504
- "epoch": 0.13820731096644967,
505
- "grad_norm": 0.6117609739303589,
506
- "learning_rate": 4.600000000000001e-05,
507
- "loss": 1.0545,
508
- "step": 138
509
- },
510
- {
511
- "epoch": 0.1402103154732098,
512
- "grad_norm": 0.5340180397033691,
513
- "learning_rate": 4.666666666666667e-05,
514
- "loss": 0.9826,
515
- "step": 140
516
- },
517
- {
518
- "epoch": 0.14221331997996994,
519
- "grad_norm": 0.5061513781547546,
520
- "learning_rate": 4.7333333333333336e-05,
521
- "loss": 0.9832,
522
- "step": 142
523
- },
524
- {
525
- "epoch": 0.1442163244867301,
526
- "grad_norm": 0.5090388059616089,
527
- "learning_rate": 4.8e-05,
528
- "loss": 0.9733,
529
- "step": 144
530
- },
531
- {
532
- "epoch": 0.14621932899349024,
533
- "grad_norm": 0.5136658549308777,
534
- "learning_rate": 4.866666666666667e-05,
535
- "loss": 0.9673,
536
- "step": 146
537
- },
538
- {
539
- "epoch": 0.14822233350025038,
540
- "grad_norm": 0.5653979778289795,
541
- "learning_rate": 4.933333333333334e-05,
542
- "loss": 0.9908,
543
- "step": 148
544
- },
545
- {
546
- "epoch": 0.15022533800701052,
547
- "grad_norm": 0.5377776026725769,
548
- "learning_rate": 5e-05,
549
- "loss": 0.9428,
550
- "step": 150
551
- },
552
- {
553
- "epoch": 0.15022533800701052,
554
- "eval_loss": 1.2466219663619995,
555
- "eval_runtime": 3.7908,
556
- "eval_samples_per_second": 15.3,
557
- "eval_steps_per_second": 7.65,
558
- "step": 150
559
- },
560
- {
561
- "epoch": 0.15222834251377065,
562
- "grad_norm": 0.5484976768493652,
563
- "learning_rate": 4.9999728022003156e-05,
564
- "loss": 0.9523,
565
- "step": 152
566
- },
567
- {
568
- "epoch": 0.1542313470205308,
569
- "grad_norm": 0.4431094229221344,
570
- "learning_rate": 4.999891209393037e-05,
571
- "loss": 0.9937,
572
- "step": 154
573
- },
574
- {
575
- "epoch": 0.15623435152729093,
576
- "grad_norm": 0.6066553592681885,
577
- "learning_rate": 4.999755223353482e-05,
578
- "loss": 1.0431,
579
- "step": 156
580
- },
581
- {
582
- "epoch": 0.15823735603405106,
583
- "grad_norm": 0.6024964451789856,
584
- "learning_rate": 4.9995648470404664e-05,
585
- "loss": 0.9671,
586
- "step": 158
587
- },
588
- {
589
- "epoch": 0.16024036054081123,
590
- "grad_norm": 0.486589640378952,
591
- "learning_rate": 4.9993200845962434e-05,
592
- "loss": 0.949,
593
- "step": 160
594
- },
595
- {
596
- "epoch": 0.16224336504757136,
597
- "grad_norm": 0.505987823009491,
598
- "learning_rate": 4.9990209413464136e-05,
599
- "loss": 1.0444,
600
- "step": 162
601
- },
602
- {
603
- "epoch": 0.1642463695543315,
604
- "grad_norm": 0.46255800127983093,
605
- "learning_rate": 4.998667423799807e-05,
606
- "loss": 0.971,
607
- "step": 164
608
- },
609
- {
610
- "epoch": 0.16624937406109164,
611
- "grad_norm": 0.5792336463928223,
612
- "learning_rate": 4.9982595396483435e-05,
613
- "loss": 0.9869,
614
- "step": 166
615
- },
616
- {
617
- "epoch": 0.16825237856785177,
618
- "grad_norm": 0.48192256689071655,
619
- "learning_rate": 4.997797297766864e-05,
620
- "loss": 1.0234,
621
- "step": 168
622
- },
623
- {
624
- "epoch": 0.1702553830746119,
625
- "grad_norm": 0.5370559692382812,
626
- "learning_rate": 4.997280708212939e-05,
627
- "loss": 0.9721,
628
- "step": 170
629
- },
630
- {
631
- "epoch": 0.17225838758137205,
632
- "grad_norm": 0.4554755985736847,
633
- "learning_rate": 4.996709782226646e-05,
634
- "loss": 0.9292,
635
- "step": 172
636
- },
637
- {
638
- "epoch": 0.1742613920881322,
639
- "grad_norm": 0.4883841872215271,
640
- "learning_rate": 4.9960845322303315e-05,
641
- "loss": 0.9815,
642
- "step": 174
643
- },
644
- {
645
- "epoch": 0.17626439659489235,
646
- "grad_norm": 0.5221249461174011,
647
- "learning_rate": 4.995404971828333e-05,
648
- "loss": 1.0216,
649
- "step": 176
650
- },
651
- {
652
- "epoch": 0.17826740110165248,
653
- "grad_norm": 0.5130178332328796,
654
- "learning_rate": 4.994671115806691e-05,
655
- "loss": 0.9855,
656
- "step": 178
657
- },
658
- {
659
- "epoch": 0.18027040560841262,
660
- "grad_norm": 0.46121644973754883,
661
- "learning_rate": 4.993882980132819e-05,
662
- "loss": 0.9196,
663
- "step": 180
664
- },
665
- {
666
- "epoch": 0.18227341011517276,
667
- "grad_norm": 0.49680426716804504,
668
- "learning_rate": 4.9930405819551627e-05,
669
- "loss": 1.0151,
670
- "step": 182
671
- },
672
- {
673
- "epoch": 0.1842764146219329,
674
- "grad_norm": 0.48591047525405884,
675
- "learning_rate": 4.992143939602823e-05,
676
- "loss": 0.9901,
677
- "step": 184
678
- },
679
- {
680
- "epoch": 0.18627941912869303,
681
- "grad_norm": 0.47092878818511963,
682
- "learning_rate": 4.9911930725851583e-05,
683
- "loss": 0.9632,
684
- "step": 186
685
- },
686
- {
687
- "epoch": 0.18828242363545317,
688
- "grad_norm": 0.40838295221328735,
689
- "learning_rate": 4.990188001591363e-05,
690
- "loss": 0.9469,
691
- "step": 188
692
- },
693
- {
694
- "epoch": 0.19028542814221333,
695
- "grad_norm": 0.4728156626224518,
696
- "learning_rate": 4.9891287484900124e-05,
697
- "loss": 0.9667,
698
- "step": 190
699
- },
700
- {
701
- "epoch": 0.19228843264897347,
702
- "grad_norm": 0.534322202205658,
703
- "learning_rate": 4.988015336328589e-05,
704
- "loss": 0.9982,
705
- "step": 192
706
- },
707
- {
708
- "epoch": 0.1942914371557336,
709
- "grad_norm": 0.43927860260009766,
710
- "learning_rate": 4.986847789332981e-05,
711
- "loss": 0.9898,
712
- "step": 194
713
- },
714
- {
715
- "epoch": 0.19629444166249374,
716
- "grad_norm": 0.40531125664711,
717
- "learning_rate": 4.985626132906957e-05,
718
- "loss": 0.9442,
719
- "step": 196
720
- },
721
- {
722
- "epoch": 0.19829744616925388,
723
- "grad_norm": 0.5949648022651672,
724
- "learning_rate": 4.9843503936316095e-05,
725
- "loss": 1.0381,
726
- "step": 198
727
- },
728
- {
729
- "epoch": 0.200300450676014,
730
- "grad_norm": 0.43230050802230835,
731
- "learning_rate": 4.983020599264781e-05,
732
- "loss": 1.0166,
733
- "step": 200
734
- },
735
- {
736
- "epoch": 0.200300450676014,
737
- "eval_loss": 1.2184284925460815,
738
- "eval_runtime": 3.8028,
739
- "eval_samples_per_second": 15.252,
740
- "eval_steps_per_second": 7.626,
741
- "step": 200
742
- },
743
- {
744
- "epoch": 0.20230345518277415,
745
- "grad_norm": 0.4429769814014435,
746
- "learning_rate": 4.9816367787404534e-05,
747
- "loss": 0.9594,
748
- "step": 202
749
- },
750
- {
751
- "epoch": 0.20430645968953431,
752
- "grad_norm": 0.5523216724395752,
753
- "learning_rate": 4.980198962168128e-05,
754
- "loss": 1.0446,
755
- "step": 204
756
- },
757
- {
758
- "epoch": 0.20630946419629445,
759
- "grad_norm": 0.4551699459552765,
760
- "learning_rate": 4.978707180832161e-05,
761
- "loss": 0.9913,
762
- "step": 206
763
- },
764
- {
765
- "epoch": 0.2083124687030546,
766
- "grad_norm": 0.41649895906448364,
767
- "learning_rate": 4.977161467191089e-05,
768
- "loss": 0.9163,
769
- "step": 208
770
- },
771
- {
772
- "epoch": 0.21031547320981472,
773
- "grad_norm": 0.4184020459651947,
774
- "learning_rate": 4.97556185487692e-05,
775
- "loss": 0.9463,
776
- "step": 210
777
- },
778
- {
779
- "epoch": 0.21231847771657486,
780
- "grad_norm": 0.6365268230438232,
781
- "learning_rate": 4.9739083786944016e-05,
782
- "loss": 0.9992,
783
- "step": 212
784
- },
785
- {
786
- "epoch": 0.214321482223335,
787
- "grad_norm": 0.5223124027252197,
788
- "learning_rate": 4.9722010746202664e-05,
789
- "loss": 0.923,
790
- "step": 214
791
- },
792
- {
793
- "epoch": 0.21632448673009513,
794
- "grad_norm": 0.42879560589790344,
795
- "learning_rate": 4.970439979802445e-05,
796
- "loss": 0.9788,
797
- "step": 216
798
- },
799
- {
800
- "epoch": 0.21832749123685527,
801
- "grad_norm": 0.4171353578567505,
802
- "learning_rate": 4.96862513255926e-05,
803
- "loss": 1.0101,
804
- "step": 218
805
- },
806
- {
807
- "epoch": 0.22033049574361543,
808
- "grad_norm": 0.42286214232444763,
809
- "learning_rate": 4.966756572378593e-05,
810
- "loss": 0.981,
811
- "step": 220
812
- },
813
- {
814
- "epoch": 0.22233350025037557,
815
- "grad_norm": 0.6001223921775818,
816
- "learning_rate": 4.964834339917025e-05,
817
- "loss": 1.0276,
818
- "step": 222
819
- },
820
- {
821
- "epoch": 0.2243365047571357,
822
- "grad_norm": 0.6153950095176697,
823
- "learning_rate": 4.9628584769989504e-05,
824
- "loss": 1.0437,
825
- "step": 224
826
- },
827
- {
828
- "epoch": 0.22633950926389584,
829
- "grad_norm": 0.419117271900177,
830
- "learning_rate": 4.9608290266156695e-05,
831
- "loss": 1.0168,
832
- "step": 226
833
- },
834
- {
835
- "epoch": 0.22834251377065598,
836
- "grad_norm": 0.40286022424697876,
837
- "learning_rate": 4.958746032924448e-05,
838
- "loss": 0.988,
839
- "step": 228
840
- },
841
- {
842
- "epoch": 0.23034551827741612,
843
- "grad_norm": 0.5287054181098938,
844
- "learning_rate": 4.9566095412475636e-05,
845
- "loss": 1.019,
846
- "step": 230
847
- },
848
- {
849
- "epoch": 0.23234852278417625,
850
- "grad_norm": 0.43865758180618286,
851
- "learning_rate": 4.9544195980713136e-05,
852
- "loss": 0.9563,
853
- "step": 232
854
- },
855
- {
856
- "epoch": 0.23435152729093642,
857
- "grad_norm": 0.5529116988182068,
858
- "learning_rate": 4.952176251045008e-05,
859
- "loss": 0.9288,
860
- "step": 234
861
- },
862
- {
863
- "epoch": 0.23635453179769655,
864
- "grad_norm": 0.5552803874015808,
865
- "learning_rate": 4.9498795489799276e-05,
866
- "loss": 0.8924,
867
- "step": 236
868
- },
869
- {
870
- "epoch": 0.2383575363044567,
871
- "grad_norm": 0.722111165523529,
872
- "learning_rate": 4.947529541848268e-05,
873
- "loss": 0.9598,
874
- "step": 238
875
- },
876
- {
877
- "epoch": 0.24036054081121683,
878
- "grad_norm": 0.4804269075393677,
879
- "learning_rate": 4.9451262807820466e-05,
880
- "loss": 0.9757,
881
- "step": 240
882
- },
883
- {
884
- "epoch": 0.24236354531797696,
885
- "grad_norm": 0.5181965231895447,
886
- "learning_rate": 4.942669818071994e-05,
887
- "loss": 1.0138,
888
- "step": 242
889
- },
890
- {
891
- "epoch": 0.2443665498247371,
892
- "grad_norm": 0.43212518095970154,
893
- "learning_rate": 4.9401602071664155e-05,
894
- "loss": 0.9027,
895
- "step": 244
896
- },
897
- {
898
- "epoch": 0.24636955433149724,
899
- "grad_norm": 0.5169520974159241,
900
- "learning_rate": 4.937597502670027e-05,
901
- "loss": 0.9668,
902
- "step": 246
903
- },
904
- {
905
- "epoch": 0.24837255883825737,
906
- "grad_norm": 0.4116087555885315,
907
- "learning_rate": 4.934981760342766e-05,
908
- "loss": 0.9634,
909
- "step": 248
910
- },
911
- {
912
- "epoch": 0.25037556334501754,
913
- "grad_norm": 0.5354374647140503,
914
- "learning_rate": 4.932313037098582e-05,
915
- "loss": 0.9993,
916
- "step": 250
917
- },
918
- {
919
- "epoch": 0.25037556334501754,
920
- "eval_loss": 1.207343339920044,
921
- "eval_runtime": 3.8101,
922
- "eval_samples_per_second": 15.223,
923
- "eval_steps_per_second": 7.611,
924
- "step": 250
925
- },
926
- {
927
- "epoch": 0.25237856785177765,
928
- "grad_norm": 0.5648212432861328,
929
- "learning_rate": 4.929591391004196e-05,
930
- "loss": 1.0219,
931
- "step": 252
932
- },
933
- {
934
- "epoch": 0.2543815723585378,
935
- "grad_norm": 0.6550512909889221,
936
- "learning_rate": 4.926816881277834e-05,
937
- "loss": 0.9505,
938
- "step": 254
939
- },
940
- {
941
- "epoch": 0.2563845768652979,
942
- "grad_norm": 0.4034920334815979,
943
- "learning_rate": 4.923989568287946e-05,
944
- "loss": 0.929,
945
- "step": 256
946
- },
947
- {
948
- "epoch": 0.2583875813720581,
949
- "grad_norm": 0.475777804851532,
950
- "learning_rate": 4.921109513551885e-05,
951
- "loss": 0.9811,
952
- "step": 258
953
- },
954
- {
955
- "epoch": 0.26039058587881825,
956
- "grad_norm": 0.47418224811553955,
957
- "learning_rate": 4.9181767797345724e-05,
958
- "loss": 1.0354,
959
- "step": 260
960
- },
961
- {
962
- "epoch": 0.26239359038557836,
963
- "grad_norm": 0.5102671384811401,
964
- "learning_rate": 4.9151914306471345e-05,
965
- "loss": 1.0212,
966
- "step": 262
967
- },
968
- {
969
- "epoch": 0.2643965948923385,
970
- "grad_norm": 0.4163782298564911,
971
- "learning_rate": 4.912153531245511e-05,
972
- "loss": 0.9191,
973
- "step": 264
974
- },
975
- {
976
- "epoch": 0.26639959939909863,
977
- "grad_norm": 0.5019692182540894,
978
- "learning_rate": 4.909063147629046e-05,
979
- "loss": 0.9337,
980
- "step": 266
981
- },
982
- {
983
- "epoch": 0.2684026039058588,
984
- "grad_norm": 0.5193113088607788,
985
- "learning_rate": 4.905920347039048e-05,
986
- "loss": 0.9746,
987
- "step": 268
988
- },
989
- {
990
- "epoch": 0.2704056084126189,
991
- "grad_norm": 0.4991247355937958,
992
- "learning_rate": 4.9027251978573244e-05,
993
- "loss": 0.9568,
994
- "step": 270
995
- },
996
- {
997
- "epoch": 0.27240861291937907,
998
- "grad_norm": 0.3833785951137543,
999
- "learning_rate": 4.8994777696046984e-05,
1000
- "loss": 0.9621,
1001
- "step": 272
1002
- },
1003
- {
1004
- "epoch": 0.27441161742613923,
1005
- "grad_norm": 0.5187920331954956,
1006
- "learning_rate": 4.8961781329394915e-05,
1007
- "loss": 0.9393,
1008
- "step": 274
1009
- },
1010
- {
1011
- "epoch": 0.27641462193289934,
1012
- "grad_norm": 0.6128193736076355,
1013
- "learning_rate": 4.89282635965599e-05,
1014
- "loss": 0.9734,
1015
- "step": 276
1016
- },
1017
- {
1018
- "epoch": 0.2784176264396595,
1019
- "grad_norm": 0.47504886984825134,
1020
- "learning_rate": 4.8894225226828795e-05,
1021
- "loss": 0.9592,
1022
- "step": 278
1023
- },
1024
- {
1025
- "epoch": 0.2804206309464196,
1026
- "grad_norm": 0.44938042759895325,
1027
- "learning_rate": 4.885966696081663e-05,
1028
- "loss": 0.9999,
1029
- "step": 280
1030
- },
1031
- {
1032
- "epoch": 0.2824236354531798,
1033
- "grad_norm": 0.48498111963272095,
1034
- "learning_rate": 4.8824589550450415e-05,
1035
- "loss": 0.9597,
1036
- "step": 282
1037
- },
1038
- {
1039
- "epoch": 0.2844266399599399,
1040
- "grad_norm": 0.582253098487854,
1041
- "learning_rate": 4.8788993758952875e-05,
1042
- "loss": 0.9322,
1043
- "step": 284
1044
- },
1045
- {
1046
- "epoch": 0.28642964446670005,
1047
- "grad_norm": 0.5211949944496155,
1048
- "learning_rate": 4.875288036082577e-05,
1049
- "loss": 0.9913,
1050
- "step": 286
1051
- },
1052
- {
1053
- "epoch": 0.2884326489734602,
1054
- "grad_norm": 0.5122332572937012,
1055
- "learning_rate": 4.8716250141833075e-05,
1056
- "loss": 0.92,
1057
- "step": 288
1058
- },
1059
- {
1060
- "epoch": 0.2904356534802203,
1061
- "grad_norm": 0.509671151638031,
1062
- "learning_rate": 4.867910389898387e-05,
1063
- "loss": 0.9686,
1064
- "step": 290
1065
- },
1066
- {
1067
- "epoch": 0.2924386579869805,
1068
- "grad_norm": 0.42992913722991943,
1069
- "learning_rate": 4.864144244051503e-05,
1070
- "loss": 0.8937,
1071
- "step": 292
1072
- },
1073
- {
1074
- "epoch": 0.2944416624937406,
1075
- "grad_norm": 0.558230996131897,
1076
- "learning_rate": 4.860326658587358e-05,
1077
- "loss": 1.005,
1078
- "step": 294
1079
- },
1080
- {
1081
- "epoch": 0.29644466700050076,
1082
- "grad_norm": 0.3904726505279541,
1083
- "learning_rate": 4.856457716569891e-05,
1084
- "loss": 0.9927,
1085
- "step": 296
1086
- },
1087
- {
1088
- "epoch": 0.29844767150726087,
1089
- "grad_norm": 0.377273827791214,
1090
- "learning_rate": 4.852537502180473e-05,
1091
- "loss": 0.9042,
1092
- "step": 298
1093
- },
1094
- {
1095
- "epoch": 0.30045067601402103,
1096
- "grad_norm": 0.4523603320121765,
1097
- "learning_rate": 4.848566100716066e-05,
1098
- "loss": 0.978,
1099
- "step": 300
1100
- },
1101
- {
1102
- "epoch": 0.30045067601402103,
1103
- "eval_loss": 1.191455602645874,
1104
- "eval_runtime": 3.8019,
1105
- "eval_samples_per_second": 15.256,
1106
- "eval_steps_per_second": 7.628,
1107
- "step": 300
1108
- },
1109
- {
1110
- "epoch": 0.3024536805207812,
1111
- "grad_norm": 0.39940956234931946,
1112
- "learning_rate": 4.8445435985873775e-05,
1113
- "loss": 1.0145,
1114
- "step": 302
1115
- },
1116
- {
1117
- "epoch": 0.3044566850275413,
1118
- "grad_norm": 0.42715466022491455,
1119
- "learning_rate": 4.84047008331697e-05,
1120
- "loss": 0.9933,
1121
- "step": 304
1122
- },
1123
- {
1124
- "epoch": 0.30645968953430147,
1125
- "grad_norm": 0.5550795793533325,
1126
- "learning_rate": 4.8363456435373686e-05,
1127
- "loss": 0.8994,
1128
- "step": 306
1129
- },
1130
- {
1131
- "epoch": 0.3084626940410616,
1132
- "grad_norm": 0.50642329454422,
1133
- "learning_rate": 4.832170368989121e-05,
1134
- "loss": 0.9708,
1135
- "step": 308
1136
- },
1137
- {
1138
- "epoch": 0.31046569854782174,
1139
- "grad_norm": 0.4395250976085663,
1140
- "learning_rate": 4.827944350518852e-05,
1141
- "loss": 1.055,
1142
- "step": 310
1143
- },
1144
- {
1145
- "epoch": 0.31246870305458185,
1146
- "grad_norm": 0.40183037519454956,
1147
- "learning_rate": 4.8236676800772845e-05,
1148
- "loss": 0.9564,
1149
- "step": 312
1150
- },
1151
- {
1152
- "epoch": 0.314471707561342,
1153
- "grad_norm": 0.4325483441352844,
1154
- "learning_rate": 4.8193404507172405e-05,
1155
- "loss": 0.9437,
1156
- "step": 314
1157
- },
1158
- {
1159
- "epoch": 0.3164747120681021,
1160
- "grad_norm": 0.5079526305198669,
1161
- "learning_rate": 4.814962756591612e-05,
1162
- "loss": 0.9426,
1163
- "step": 316
1164
- },
1165
- {
1166
- "epoch": 0.3184777165748623,
1167
- "grad_norm": 0.6221234202384949,
1168
- "learning_rate": 4.8105346929513195e-05,
1169
- "loss": 0.9674,
1170
- "step": 318
1171
- },
1172
- {
1173
- "epoch": 0.32048072108162245,
1174
- "grad_norm": 0.5088761448860168,
1175
- "learning_rate": 4.8060563561432313e-05,
1176
- "loss": 0.953,
1177
- "step": 320
1178
- },
1179
- {
1180
- "epoch": 0.32248372558838256,
1181
- "grad_norm": 0.4460401237010956,
1182
- "learning_rate": 4.801527843608075e-05,
1183
- "loss": 0.935,
1184
- "step": 322
1185
- },
1186
- {
1187
- "epoch": 0.32448673009514273,
1188
- "grad_norm": 0.39005428552627563,
1189
- "learning_rate": 4.796949253878311e-05,
1190
- "loss": 0.9204,
1191
- "step": 324
1192
- },
1193
- {
1194
- "epoch": 0.32648973460190284,
1195
- "grad_norm": 0.4077945351600647,
1196
- "learning_rate": 4.792320686575993e-05,
1197
- "loss": 1.0509,
1198
- "step": 326
1199
- },
1200
- {
1201
- "epoch": 0.328492739108663,
1202
- "grad_norm": 0.4249040186405182,
1203
- "learning_rate": 4.787642242410597e-05,
1204
- "loss": 0.9549,
1205
- "step": 328
1206
- },
1207
- {
1208
- "epoch": 0.3304957436154231,
1209
- "grad_norm": 0.4203990697860718,
1210
- "learning_rate": 4.7829140231768335e-05,
1211
- "loss": 0.9996,
1212
- "step": 330
1213
- },
1214
- {
1215
- "epoch": 0.3324987481221833,
1216
- "grad_norm": 0.4657137095928192,
1217
- "learning_rate": 4.778136131752431e-05,
1218
- "loss": 1.0336,
1219
- "step": 332
1220
- },
1221
- {
1222
- "epoch": 0.33450175262894344,
1223
- "grad_norm": 0.4463610053062439,
1224
- "learning_rate": 4.773308672095895e-05,
1225
- "loss": 0.936,
1226
- "step": 334
1227
- },
1228
- {
1229
- "epoch": 0.33650475713570355,
1230
- "grad_norm": 0.46322551369667053,
1231
- "learning_rate": 4.768431749244251e-05,
1232
- "loss": 0.8727,
1233
- "step": 336
1234
- },
1235
- {
1236
- "epoch": 0.3385077616424637,
1237
- "grad_norm": 0.4579392671585083,
1238
- "learning_rate": 4.7635054693107553e-05,
1239
- "loss": 0.9551,
1240
- "step": 338
1241
- },
1242
- {
1243
- "epoch": 0.3405107661492238,
1244
- "grad_norm": 0.40763622522354126,
1245
- "learning_rate": 4.758529939482588e-05,
1246
- "loss": 0.8965,
1247
- "step": 340
1248
- },
1249
- {
1250
- "epoch": 0.342513770655984,
1251
- "grad_norm": 0.5640069246292114,
1252
- "learning_rate": 4.75350526801852e-05,
1253
- "loss": 1.019,
1254
- "step": 342
1255
- },
1256
- {
1257
- "epoch": 0.3445167751627441,
1258
- "grad_norm": 0.378750741481781,
1259
- "learning_rate": 4.748431564246557e-05,
1260
- "loss": 0.974,
1261
- "step": 344
1262
- },
1263
- {
1264
- "epoch": 0.34651977966950426,
1265
- "grad_norm": 0.5434790849685669,
1266
- "learning_rate": 4.7433089385615634e-05,
1267
- "loss": 0.9863,
1268
- "step": 346
1269
- },
1270
- {
1271
- "epoch": 0.3485227841762644,
1272
- "grad_norm": 0.5737304091453552,
1273
- "learning_rate": 4.7381375024228556e-05,
1274
- "loss": 0.9044,
1275
- "step": 348
1276
- },
1277
- {
1278
- "epoch": 0.35052578868302453,
1279
- "grad_norm": 0.5187863707542419,
1280
- "learning_rate": 4.7329173683517825e-05,
1281
- "loss": 0.8692,
1282
- "step": 350
1283
- },
1284
- {
1285
- "epoch": 0.35052578868302453,
1286
- "eval_loss": 1.1893218755722046,
1287
- "eval_runtime": 3.7963,
1288
- "eval_samples_per_second": 15.278,
1289
- "eval_steps_per_second": 7.639,
1290
- "step": 350
1291
- },
1292
- {
1293
- "epoch": 0.3525287931897847,
1294
- "grad_norm": 0.417603462934494,
1295
- "learning_rate": 4.727648649929271e-05,
1296
- "loss": 0.9013,
1297
- "step": 352
1298
- },
1299
- {
1300
- "epoch": 0.3545317976965448,
1301
- "grad_norm": 0.5028386116027832,
1302
- "learning_rate": 4.7223314617933605e-05,
1303
- "loss": 0.9508,
1304
- "step": 354
1305
- },
1306
- {
1307
- "epoch": 0.35653480220330497,
1308
- "grad_norm": 0.3822748064994812,
1309
- "learning_rate": 4.7169659196367056e-05,
1310
- "loss": 0.9452,
1311
- "step": 356
1312
- },
1313
- {
1314
- "epoch": 0.3585378067100651,
1315
- "grad_norm": 0.44049903750419617,
1316
- "learning_rate": 4.711552140204059e-05,
1317
- "loss": 0.9455,
1318
- "step": 358
1319
- },
1320
- {
1321
- "epoch": 0.36054081121682524,
1322
- "grad_norm": 0.45998480916023254,
1323
- "learning_rate": 4.7060902412897304e-05,
1324
- "loss": 0.9731,
1325
- "step": 360
1326
- },
1327
- {
1328
- "epoch": 0.3625438157235854,
1329
- "grad_norm": 0.5747750401496887,
1330
- "learning_rate": 4.700580341735026e-05,
1331
- "loss": 0.9197,
1332
- "step": 362
1333
- },
1334
- {
1335
- "epoch": 0.3645468202303455,
1336
- "grad_norm": 0.39996007084846497,
1337
- "learning_rate": 4.695022561425663e-05,
1338
- "loss": 0.9464,
1339
- "step": 364
1340
- },
1341
- {
1342
- "epoch": 0.3665498247371057,
1343
- "grad_norm": 0.4300011396408081,
1344
- "learning_rate": 4.689417021289157e-05,
1345
- "loss": 0.8947,
1346
- "step": 366
1347
- },
1348
- {
1349
- "epoch": 0.3685528292438658,
1350
- "grad_norm": 0.38185784220695496,
1351
- "learning_rate": 4.6837638432921925e-05,
1352
- "loss": 0.9521,
1353
- "step": 368
1354
- },
1355
- {
1356
- "epoch": 0.37055583375062595,
1357
- "grad_norm": 0.48808950185775757,
1358
- "learning_rate": 4.6780631504379736e-05,
1359
- "loss": 0.9326,
1360
- "step": 370
1361
- },
1362
- {
1363
- "epoch": 0.37255883825738606,
1364
- "grad_norm": 0.40927746891975403,
1365
- "learning_rate": 4.672315066763542e-05,
1366
- "loss": 0.9949,
1367
- "step": 372
1368
- },
1369
- {
1370
- "epoch": 0.3745618427641462,
1371
- "grad_norm": 0.473628968000412,
1372
- "learning_rate": 4.666519717337079e-05,
1373
- "loss": 0.9808,
1374
- "step": 374
1375
- },
1376
- {
1377
- "epoch": 0.37656484727090633,
1378
- "grad_norm": 0.45377451181411743,
1379
- "learning_rate": 4.6606772282551894e-05,
1380
- "loss": 0.9978,
1381
- "step": 376
1382
- },
1383
- {
1384
- "epoch": 0.3785678517776665,
1385
- "grad_norm": 0.5329418182373047,
1386
- "learning_rate": 4.65478772664015e-05,
1387
- "loss": 0.9531,
1388
- "step": 378
1389
- },
1390
- {
1391
- "epoch": 0.38057085628442666,
1392
- "grad_norm": 0.4209918677806854,
1393
- "learning_rate": 4.648851340637147e-05,
1394
- "loss": 0.914,
1395
- "step": 380
1396
- },
1397
- {
1398
- "epoch": 0.38257386079118677,
1399
- "grad_norm": 0.40193280577659607,
1400
- "learning_rate": 4.642868199411493e-05,
1401
- "loss": 0.8853,
1402
- "step": 382
1403
- },
1404
- {
1405
- "epoch": 0.38457686529794693,
1406
- "grad_norm": 0.39280131459236145,
1407
- "learning_rate": 4.6368384331458085e-05,
1408
- "loss": 0.8992,
1409
- "step": 384
1410
- },
1411
- {
1412
- "epoch": 0.38657986980470704,
1413
- "grad_norm": 0.44302472472190857,
1414
- "learning_rate": 4.6307621730371934e-05,
1415
- "loss": 0.9454,
1416
- "step": 386
1417
- },
1418
- {
1419
- "epoch": 0.3885828743114672,
1420
- "grad_norm": 0.4578077793121338,
1421
- "learning_rate": 4.6246395512943716e-05,
1422
- "loss": 0.957,
1423
- "step": 388
1424
- },
1425
- {
1426
- "epoch": 0.3905858788182273,
1427
- "grad_norm": 0.4635055959224701,
1428
- "learning_rate": 4.618470701134815e-05,
1429
- "loss": 0.9978,
1430
- "step": 390
1431
- },
1432
- {
1433
- "epoch": 0.3925888833249875,
1434
- "grad_norm": 0.49186405539512634,
1435
- "learning_rate": 4.612255756781845e-05,
1436
- "loss": 0.9792,
1437
- "step": 392
1438
- },
1439
- {
1440
- "epoch": 0.39459188783174765,
1441
- "grad_norm": 0.42530110478401184,
1442
- "learning_rate": 4.605994853461709e-05,
1443
- "loss": 1.0054,
1444
- "step": 394
1445
- },
1446
- {
1447
- "epoch": 0.39659489233850775,
1448
- "grad_norm": 0.4250572919845581,
1449
- "learning_rate": 4.5996881274006446e-05,
1450
- "loss": 0.8744,
1451
- "step": 396
1452
- },
1453
- {
1454
- "epoch": 0.3985978968452679,
1455
- "grad_norm": 0.4212440550327301,
1456
- "learning_rate": 4.593335715821909e-05,
1457
- "loss": 0.9451,
1458
- "step": 398
1459
- },
1460
- {
1461
- "epoch": 0.400600901352028,
1462
- "grad_norm": 0.35784921050071716,
1463
- "learning_rate": 4.586937756942796e-05,
1464
- "loss": 0.9179,
1465
- "step": 400
1466
- },
1467
- {
1468
- "epoch": 0.400600901352028,
1469
- "eval_loss": 1.1884177923202515,
1470
- "eval_runtime": 3.8058,
1471
- "eval_samples_per_second": 15.24,
1472
- "eval_steps_per_second": 7.62,
1473
- "step": 400
1474
- },
1475
- {
1476
- "epoch": 0.4026039058587882,
1477
- "grad_norm": 0.4087256193161011,
1478
- "learning_rate": 4.580494389971628e-05,
1479
- "loss": 0.8817,
1480
- "step": 402
1481
- },
1482
- {
1483
- "epoch": 0.4046069103655483,
1484
- "grad_norm": 0.40662136673927307,
1485
- "learning_rate": 4.5740057551047294e-05,
1486
- "loss": 0.9219,
1487
- "step": 404
1488
- },
1489
- {
1490
- "epoch": 0.40660991487230846,
1491
- "grad_norm": 0.4162129759788513,
1492
- "learning_rate": 4.5674719935233726e-05,
1493
- "loss": 0.8831,
1494
- "step": 406
1495
- },
1496
- {
1497
- "epoch": 0.40861291937906863,
1498
- "grad_norm": 0.40978914499282837,
1499
- "learning_rate": 4.56089324739071e-05,
1500
- "loss": 0.9601,
1501
- "step": 408
1502
- },
1503
- {
1504
- "epoch": 0.41061592388582874,
1505
- "grad_norm": 0.42754805088043213,
1506
- "learning_rate": 4.554269659848675e-05,
1507
- "loss": 0.9463,
1508
- "step": 410
1509
- },
1510
- {
1511
- "epoch": 0.4126189283925889,
1512
- "grad_norm": 0.48228365182876587,
1513
- "learning_rate": 4.547601375014875e-05,
1514
- "loss": 0.9418,
1515
- "step": 412
1516
- },
1517
- {
1518
- "epoch": 0.414621932899349,
1519
- "grad_norm": 0.4946666657924652,
1520
- "learning_rate": 4.5408885379794494e-05,
1521
- "loss": 0.9011,
1522
- "step": 414
1523
- },
1524
- {
1525
- "epoch": 0.4166249374061092,
1526
- "grad_norm": 0.4881949722766876,
1527
- "learning_rate": 4.5341312948019155e-05,
1528
- "loss": 0.9794,
1529
- "step": 416
1530
- },
1531
- {
1532
- "epoch": 0.4186279419128693,
1533
- "grad_norm": 0.39862060546875,
1534
- "learning_rate": 4.527329792507991e-05,
1535
- "loss": 0.9116,
1536
- "step": 418
1537
- },
1538
- {
1539
- "epoch": 0.42063094641962945,
1540
- "grad_norm": 0.3882657587528229,
1541
- "learning_rate": 4.520484179086394e-05,
1542
- "loss": 0.9337,
1543
- "step": 420
1544
- },
1545
- {
1546
- "epoch": 0.42263395092638956,
1547
- "grad_norm": 0.3756396770477295,
1548
- "learning_rate": 4.51359460348562e-05,
1549
- "loss": 0.9272,
1550
- "step": 422
1551
- },
1552
- {
1553
- "epoch": 0.4246369554331497,
1554
- "grad_norm": 0.451297402381897,
1555
- "learning_rate": 4.50666121561071e-05,
1556
- "loss": 0.9306,
1557
- "step": 424
1558
- },
1559
- {
1560
- "epoch": 0.4266399599399099,
1561
- "grad_norm": 0.41500887274742126,
1562
- "learning_rate": 4.499684166319978e-05,
1563
- "loss": 0.9472,
1564
- "step": 426
1565
- },
1566
- {
1567
- "epoch": 0.42864296444667,
1568
- "grad_norm": 0.4838218688964844,
1569
- "learning_rate": 4.492663607421736e-05,
1570
- "loss": 0.8738,
1571
- "step": 428
1572
- },
1573
- {
1574
- "epoch": 0.43064596895343016,
1575
- "grad_norm": 0.3867829442024231,
1576
- "learning_rate": 4.4855996916709865e-05,
1577
- "loss": 1.0112,
1578
- "step": 430
1579
- },
1580
- {
1581
- "epoch": 0.43264897346019027,
1582
- "grad_norm": 0.40715524554252625,
1583
- "learning_rate": 4.478492572766102e-05,
1584
- "loss": 0.9571,
1585
- "step": 432
1586
- },
1587
- {
1588
- "epoch": 0.43465197796695043,
1589
- "grad_norm": 0.5042704343795776,
1590
- "learning_rate": 4.47134240534548e-05,
1591
- "loss": 0.9304,
1592
- "step": 434
1593
- },
1594
- {
1595
- "epoch": 0.43665498247371054,
1596
- "grad_norm": 0.4030342400074005,
1597
- "learning_rate": 4.464149344984178e-05,
1598
- "loss": 0.9479,
1599
- "step": 436
1600
- },
1601
- {
1602
- "epoch": 0.4386579869804707,
1603
- "grad_norm": 0.3429213762283325,
1604
- "learning_rate": 4.456913548190527e-05,
1605
- "loss": 0.9511,
1606
- "step": 438
1607
- },
1608
- {
1609
- "epoch": 0.44066099148723087,
1610
- "grad_norm": 0.4278419315814972,
1611
- "learning_rate": 4.44963517240273e-05,
1612
- "loss": 1.1125,
1613
- "step": 440
1614
- },
1615
- {
1616
- "epoch": 0.442663995993991,
1617
- "grad_norm": 0.4170474708080292,
1618
- "learning_rate": 4.44231437598543e-05,
1619
- "loss": 0.9498,
1620
- "step": 442
1621
- },
1622
- {
1623
- "epoch": 0.44466700050075114,
1624
- "grad_norm": 0.39053234457969666,
1625
- "learning_rate": 4.4349513182262715e-05,
1626
- "loss": 0.9796,
1627
- "step": 444
1628
- },
1629
- {
1630
- "epoch": 0.44667000500751125,
1631
- "grad_norm": 0.5083168148994446,
1632
- "learning_rate": 4.4275461593324306e-05,
1633
- "loss": 0.9236,
1634
- "step": 446
1635
- },
1636
- {
1637
- "epoch": 0.4486730095142714,
1638
- "grad_norm": 0.3927271068096161,
1639
- "learning_rate": 4.420099060427131e-05,
1640
- "loss": 1.011,
1641
- "step": 448
1642
- },
1643
- {
1644
- "epoch": 0.4506760140210315,
1645
- "grad_norm": 0.4185622036457062,
1646
- "learning_rate": 4.4126101835461346e-05,
1647
- "loss": 0.9671,
1648
- "step": 450
1649
- },
1650
- {
1651
- "epoch": 0.4506760140210315,
1652
- "eval_loss": 1.1852179765701294,
1653
- "eval_runtime": 3.8121,
1654
- "eval_samples_per_second": 15.215,
1655
- "eval_steps_per_second": 7.607,
1656
- "step": 450
1657
- },
1658
- {
1659
- "epoch": 0.4526790185277917,
1660
- "grad_norm": 0.5305806398391724,
1661
- "learning_rate": 4.405079691634221e-05,
1662
- "loss": 0.9388,
1663
- "step": 452
1664
- },
1665
- {
1666
- "epoch": 0.45468202303455185,
1667
- "grad_norm": 0.4585268497467041,
1668
- "learning_rate": 4.3975077485416377e-05,
1669
- "loss": 0.8841,
1670
- "step": 454
1671
- },
1672
- {
1673
- "epoch": 0.45668502754131196,
1674
- "grad_norm": 0.39412179589271545,
1675
- "learning_rate": 4.3898945190205386e-05,
1676
- "loss": 0.9371,
1677
- "step": 456
1678
- },
1679
- {
1680
- "epoch": 0.4586880320480721,
1681
- "grad_norm": 0.5423275828361511,
1682
- "learning_rate": 4.382240168721396e-05,
1683
- "loss": 0.9923,
1684
- "step": 458
1685
- },
1686
- {
1687
- "epoch": 0.46069103655483223,
1688
- "grad_norm": 0.3563918471336365,
1689
- "learning_rate": 4.3745448641894e-05,
1690
- "loss": 0.9546,
1691
- "step": 460
1692
- },
1693
- {
1694
- "epoch": 0.4626940410615924,
1695
- "grad_norm": 0.7710307836532593,
1696
- "learning_rate": 4.3668087728608316e-05,
1697
- "loss": 0.9195,
1698
- "step": 462
1699
- },
1700
- {
1701
- "epoch": 0.4646970455683525,
1702
- "grad_norm": 0.4273247718811035,
1703
- "learning_rate": 4.359032063059419e-05,
1704
- "loss": 0.9674,
1705
- "step": 464
1706
- },
1707
- {
1708
- "epoch": 0.46670005007511267,
1709
- "grad_norm": 0.41480231285095215,
1710
- "learning_rate": 4.3512149039926796e-05,
1711
- "loss": 0.8851,
1712
- "step": 466
1713
- },
1714
- {
1715
- "epoch": 0.46870305458187284,
1716
- "grad_norm": 0.559946596622467,
1717
- "learning_rate": 4.343357465748235e-05,
1718
- "loss": 0.8949,
1719
- "step": 468
1720
- },
1721
- {
1722
- "epoch": 0.47070605908863294,
1723
- "grad_norm": 0.5360729098320007,
1724
- "learning_rate": 4.33545991929011e-05,
1725
- "loss": 0.9014,
1726
- "step": 470
1727
- },
1728
- {
1729
- "epoch": 0.4727090635953931,
1730
- "grad_norm": 0.5606299042701721,
1731
- "learning_rate": 4.327522436455013e-05,
1732
- "loss": 0.9091,
1733
- "step": 472
1734
- },
1735
- {
1736
- "epoch": 0.4747120681021532,
1737
- "grad_norm": 0.49291422963142395,
1738
- "learning_rate": 4.3195451899485994e-05,
1739
- "loss": 0.9076,
1740
- "step": 474
1741
- },
1742
- {
1743
- "epoch": 0.4767150726089134,
1744
- "grad_norm": 0.3711169958114624,
1745
- "learning_rate": 4.3115283533417105e-05,
1746
- "loss": 0.9644,
1747
- "step": 476
1748
- },
1749
- {
1750
- "epoch": 0.4787180771156735,
1751
- "grad_norm": 0.4362380802631378,
1752
- "learning_rate": 4.3034721010666e-05,
1753
- "loss": 0.9263,
1754
- "step": 478
1755
- },
1756
- {
1757
- "epoch": 0.48072108162243365,
1758
- "grad_norm": 0.5104102492332458,
1759
- "learning_rate": 4.295376608413137e-05,
1760
- "loss": 0.96,
1761
- "step": 480
1762
- },
1763
- {
1764
- "epoch": 0.48272408612919376,
1765
- "grad_norm": 0.4157417416572571,
1766
- "learning_rate": 4.287242051524989e-05,
1767
- "loss": 0.9594,
1768
- "step": 482
1769
- },
1770
- {
1771
- "epoch": 0.4847270906359539,
1772
- "grad_norm": 0.36849111318588257,
1773
- "learning_rate": 4.2790686073957976e-05,
1774
- "loss": 0.8976,
1775
- "step": 484
1776
- },
1777
- {
1778
- "epoch": 0.4867300951427141,
1779
- "grad_norm": 0.6290056109428406,
1780
- "learning_rate": 4.270856453865318e-05,
1781
- "loss": 0.9248,
1782
- "step": 486
1783
- },
1784
- {
1785
- "epoch": 0.4887330996494742,
1786
- "grad_norm": 0.4833918511867523,
1787
- "learning_rate": 4.262605769615557e-05,
1788
- "loss": 1.0118,
1789
- "step": 488
1790
- },
1791
- {
1792
- "epoch": 0.49073610415623437,
1793
- "grad_norm": 0.6724058985710144,
1794
- "learning_rate": 4.25431673416688e-05,
1795
- "loss": 0.8823,
1796
- "step": 490
1797
- },
1798
- {
1799
- "epoch": 0.4927391086629945,
1800
- "grad_norm": 0.45951318740844727,
1801
- "learning_rate": 4.245989527874107e-05,
1802
- "loss": 0.9822,
1803
- "step": 492
1804
- },
1805
- {
1806
- "epoch": 0.49474211316975464,
1807
- "grad_norm": 0.4734819829463959,
1808
- "learning_rate": 4.237624331922589e-05,
1809
- "loss": 0.9181,
1810
- "step": 494
1811
- },
1812
- {
1813
- "epoch": 0.49674511767651475,
1814
- "grad_norm": 0.9102823138237,
1815
- "learning_rate": 4.229221328324265e-05,
1816
- "loss": 0.8974,
1817
- "step": 496
1818
- },
1819
- {
1820
- "epoch": 0.4987481221832749,
1821
- "grad_norm": 0.35548609495162964,
1822
- "learning_rate": 4.2207806999137035e-05,
1823
- "loss": 0.9309,
1824
- "step": 498
1825
- },
1826
- {
1827
- "epoch": 0.5007511266900351,
1828
- "grad_norm": 0.46587055921554565,
1829
- "learning_rate": 4.21230263034412e-05,
1830
- "loss": 0.9114,
1831
- "step": 500
1832
- },
1833
- {
1834
- "epoch": 0.5007511266900351,
1835
- "eval_loss": 1.174551248550415,
1836
- "eval_runtime": 3.8166,
1837
- "eval_samples_per_second": 15.197,
1838
- "eval_steps_per_second": 7.598,
1839
- "step": 500
1840
- },
1841
- {
1842
- "epoch": 0.5027541311967952,
1843
- "grad_norm": 0.3687826097011566,
1844
- "learning_rate": 4.2037873040833845e-05,
1845
- "loss": 0.9322,
1846
- "step": 502
1847
- },
1848
- {
1849
- "epoch": 0.5047571357035553,
1850
- "grad_norm": 0.5049874782562256,
1851
- "learning_rate": 4.1952349064100074e-05,
1852
- "loss": 0.9975,
1853
- "step": 504
1854
- },
1855
- {
1856
- "epoch": 0.5067601402103155,
1857
- "grad_norm": 0.4126236140727997,
1858
- "learning_rate": 4.1866456234091076e-05,
1859
- "loss": 0.929,
1860
- "step": 506
1861
- },
1862
- {
1863
- "epoch": 0.5087631447170756,
1864
- "grad_norm": 0.44455772638320923,
1865
- "learning_rate": 4.178019641968364e-05,
1866
- "loss": 0.9345,
1867
- "step": 508
1868
- },
1869
- {
1870
- "epoch": 0.5107661492238358,
1871
- "grad_norm": 0.4278281033039093,
1872
- "learning_rate": 4.1693571497739495e-05,
1873
- "loss": 0.8941,
1874
- "step": 510
1875
- },
1876
- {
1877
- "epoch": 0.5127691537305958,
1878
- "grad_norm": 0.3606776297092438,
1879
- "learning_rate": 4.160658335306446e-05,
1880
- "loss": 0.9442,
1881
- "step": 512
1882
- },
1883
- {
1884
- "epoch": 0.514772158237356,
1885
- "grad_norm": 0.5303627848625183,
1886
- "learning_rate": 4.1519233878367424e-05,
1887
- "loss": 0.8712,
1888
- "step": 514
1889
- },
1890
- {
1891
- "epoch": 0.5167751627441162,
1892
- "grad_norm": 0.3978877067565918,
1893
- "learning_rate": 4.143152497421922e-05,
1894
- "loss": 0.8558,
1895
- "step": 516
1896
- },
1897
- {
1898
- "epoch": 0.5187781672508763,
1899
- "grad_norm": 0.68426513671875,
1900
- "learning_rate": 4.134345854901121e-05,
1901
- "loss": 0.9229,
1902
- "step": 518
1903
- },
1904
- {
1905
- "epoch": 0.5207811717576365,
1906
- "grad_norm": 0.5070856809616089,
1907
- "learning_rate": 4.125503651891377e-05,
1908
- "loss": 0.8383,
1909
- "step": 520
1910
- },
1911
- {
1912
- "epoch": 0.5227841762643965,
1913
- "grad_norm": 0.5237690806388855,
1914
- "learning_rate": 4.1166260807834644e-05,
1915
- "loss": 0.8836,
1916
- "step": 522
1917
- },
1918
- {
1919
- "epoch": 0.5247871807711567,
1920
- "grad_norm": 0.38217777013778687,
1921
- "learning_rate": 4.107713334737704e-05,
1922
- "loss": 0.953,
1923
- "step": 524
1924
- },
1925
- {
1926
- "epoch": 0.5267901852779169,
1927
- "grad_norm": 0.4001261591911316,
1928
- "learning_rate": 4.098765607679761e-05,
1929
- "loss": 0.9681,
1930
- "step": 526
1931
- },
1932
- {
1933
- "epoch": 0.528793189784677,
1934
- "grad_norm": 0.4185451567173004,
1935
- "learning_rate": 4.0897830942964255e-05,
1936
- "loss": 0.9023,
1937
- "step": 528
1938
- },
1939
- {
1940
- "epoch": 0.5307961942914372,
1941
- "grad_norm": 0.4268343150615692,
1942
- "learning_rate": 4.080765990031377e-05,
1943
- "loss": 0.9154,
1944
- "step": 530
1945
- },
1946
- {
1947
- "epoch": 0.5327991987981973,
1948
- "grad_norm": 0.46939241886138916,
1949
- "learning_rate": 4.071714491080932e-05,
1950
- "loss": 0.9013,
1951
- "step": 532
1952
- },
1953
- {
1954
- "epoch": 0.5348022033049574,
1955
- "grad_norm": 0.3804875910282135,
1956
- "learning_rate": 4.0626287943897764e-05,
1957
- "loss": 0.9091,
1958
- "step": 534
1959
- },
1960
- {
1961
- "epoch": 0.5368052078117176,
1962
- "grad_norm": 0.5679438710212708,
1963
- "learning_rate": 4.053509097646674e-05,
1964
- "loss": 0.9361,
1965
- "step": 536
1966
- },
1967
- {
1968
- "epoch": 0.5388082123184778,
1969
- "grad_norm": 0.47385266423225403,
1970
- "learning_rate": 4.044355599280175e-05,
1971
- "loss": 0.9549,
1972
- "step": 538
1973
- },
1974
- {
1975
- "epoch": 0.5408112168252378,
1976
- "grad_norm": 0.48675286769866943,
1977
- "learning_rate": 4.035168498454292e-05,
1978
- "loss": 0.8835,
1979
- "step": 540
1980
- },
1981
- {
1982
- "epoch": 0.542814221331998,
1983
- "grad_norm": 0.46679016947746277,
1984
- "learning_rate": 4.025947995064166e-05,
1985
- "loss": 0.9377,
1986
- "step": 542
1987
- },
1988
- {
1989
- "epoch": 0.5448172258387581,
1990
- "grad_norm": 0.4926673471927643,
1991
- "learning_rate": 4.0166942897317205e-05,
1992
- "loss": 0.9036,
1993
- "step": 544
1994
- },
1995
- {
1996
- "epoch": 0.5468202303455183,
1997
- "grad_norm": 0.38182321190834045,
1998
- "learning_rate": 4.007407583801295e-05,
1999
- "loss": 0.9616,
2000
- "step": 546
2001
- },
2002
- {
2003
- "epoch": 0.5488232348522785,
2004
- "grad_norm": 0.45545268058776855,
2005
- "learning_rate": 3.9980880793352635e-05,
2006
- "loss": 0.9747,
2007
- "step": 548
2008
- },
2009
- {
2010
- "epoch": 0.5508262393590385,
2011
- "grad_norm": 0.47782036662101746,
2012
- "learning_rate": 3.988735979109638e-05,
2013
- "loss": 0.8995,
2014
- "step": 550
2015
- },
2016
- {
2017
- "epoch": 0.5508262393590385,
2018
- "eval_loss": 1.1662517786026,
2019
- "eval_runtime": 3.8013,
2020
- "eval_samples_per_second": 15.258,
2021
- "eval_steps_per_second": 7.629,
2022
- "step": 550
2023
- },
2024
- {
2025
- "epoch": 0.5528292438657987,
2026
- "grad_norm": 0.5856130123138428,
2027
- "learning_rate": 3.979351486609658e-05,
2028
- "loss": 0.8887,
2029
- "step": 552
2030
- },
2031
- {
2032
- "epoch": 0.5548322483725588,
2033
- "grad_norm": 0.3920418620109558,
2034
- "learning_rate": 3.969934806025361e-05,
2035
- "loss": 0.8773,
2036
- "step": 554
2037
- },
2038
- {
2039
- "epoch": 0.556835252879319,
2040
- "grad_norm": 0.43775448203086853,
2041
- "learning_rate": 3.960486142247142e-05,
2042
- "loss": 0.8969,
2043
- "step": 556
2044
- },
2045
- {
2046
- "epoch": 0.5588382573860792,
2047
- "grad_norm": 0.42693212628364563,
2048
- "learning_rate": 3.951005700861291e-05,
2049
- "loss": 0.9114,
2050
- "step": 558
2051
- },
2052
- {
2053
- "epoch": 0.5608412618928392,
2054
- "grad_norm": 0.45931047201156616,
2055
- "learning_rate": 3.9414936881455254e-05,
2056
- "loss": 0.9111,
2057
- "step": 560
2058
- },
2059
- {
2060
- "epoch": 0.5628442663995994,
2061
- "grad_norm": 0.5036295652389526,
2062
- "learning_rate": 3.931950311064498e-05,
2063
- "loss": 0.9606,
2064
- "step": 562
2065
- },
2066
- {
2067
- "epoch": 0.5648472709063596,
2068
- "grad_norm": 0.5762202143669128,
2069
- "learning_rate": 3.9223757772652956e-05,
2070
- "loss": 0.8566,
2071
- "step": 564
2072
- },
2073
- {
2074
- "epoch": 0.5668502754131197,
2075
- "grad_norm": 0.40658578276634216,
2076
- "learning_rate": 3.91277029507292e-05,
2077
- "loss": 0.9485,
2078
- "step": 566
2079
- },
2080
- {
2081
- "epoch": 0.5688532799198798,
2082
- "grad_norm": 0.3851291835308075,
2083
- "learning_rate": 3.903134073485756e-05,
2084
- "loss": 0.8902,
2085
- "step": 568
2086
- },
2087
- {
2088
- "epoch": 0.5708562844266399,
2089
- "grad_norm": 0.3543303906917572,
2090
- "learning_rate": 3.8934673221710215e-05,
2091
- "loss": 0.9411,
2092
- "step": 570
2093
- },
2094
- {
2095
- "epoch": 0.5728592889334001,
2096
- "grad_norm": 0.3977811336517334,
2097
- "learning_rate": 3.883770251460212e-05,
2098
- "loss": 0.9258,
2099
- "step": 572
2100
- },
2101
- {
2102
- "epoch": 0.5748622934401603,
2103
- "grad_norm": 0.4081217050552368,
2104
- "learning_rate": 3.8740430723445156e-05,
2105
- "loss": 0.9201,
2106
- "step": 574
2107
- },
2108
- {
2109
- "epoch": 0.5768652979469204,
2110
- "grad_norm": 0.4058239459991455,
2111
- "learning_rate": 3.864285996470226e-05,
2112
- "loss": 0.9428,
2113
- "step": 576
2114
- },
2115
- {
2116
- "epoch": 0.5788683024536805,
2117
- "grad_norm": 0.40673911571502686,
2118
- "learning_rate": 3.854499236134141e-05,
2119
- "loss": 0.985,
2120
- "step": 578
2121
- },
2122
- {
2123
- "epoch": 0.5808713069604406,
2124
- "grad_norm": 0.4199845790863037,
2125
- "learning_rate": 3.844683004278939e-05,
2126
- "loss": 0.9476,
2127
- "step": 580
2128
- },
2129
- {
2130
- "epoch": 0.5828743114672008,
2131
- "grad_norm": 0.4016932547092438,
2132
- "learning_rate": 3.834837514488544e-05,
2133
- "loss": 0.9464,
2134
- "step": 582
2135
- },
2136
- {
2137
- "epoch": 0.584877315973961,
2138
- "grad_norm": 0.41921266913414,
2139
- "learning_rate": 3.8249629809834845e-05,
2140
- "loss": 0.9651,
2141
- "step": 584
2142
- },
2143
- {
2144
- "epoch": 0.586880320480721,
2145
- "grad_norm": 0.4465863108634949,
2146
- "learning_rate": 3.8150596186162286e-05,
2147
- "loss": 0.8847,
2148
- "step": 586
2149
- },
2150
- {
2151
- "epoch": 0.5888833249874812,
2152
- "grad_norm": 0.4515509009361267,
2153
- "learning_rate": 3.805127642866507e-05,
2154
- "loss": 0.951,
2155
- "step": 588
2156
- },
2157
- {
2158
- "epoch": 0.5908863294942414,
2159
- "grad_norm": 0.44146063923835754,
2160
- "learning_rate": 3.795167269836631e-05,
2161
- "loss": 0.8924,
2162
- "step": 590
2163
- },
2164
- {
2165
- "epoch": 0.5928893340010015,
2166
- "grad_norm": 0.538754940032959,
2167
- "learning_rate": 3.785178716246786e-05,
2168
- "loss": 0.9536,
2169
- "step": 592
2170
- },
2171
- {
2172
- "epoch": 0.5948923385077617,
2173
- "grad_norm": 0.3271295130252838,
2174
- "learning_rate": 3.775162199430312e-05,
2175
- "loss": 0.8724,
2176
- "step": 594
2177
- },
2178
- {
2179
- "epoch": 0.5968953430145217,
2180
- "grad_norm": 0.4394945800304413,
2181
- "learning_rate": 3.765117937328986e-05,
2182
- "loss": 0.9133,
2183
- "step": 596
2184
- },
2185
- {
2186
- "epoch": 0.5988983475212819,
2187
- "grad_norm": 0.40261757373809814,
2188
- "learning_rate": 3.75504614848827e-05,
2189
- "loss": 0.9253,
2190
- "step": 598
2191
- },
2192
- {
2193
- "epoch": 0.6009013520280421,
2194
- "grad_norm": 0.4515800178050995,
2195
- "learning_rate": 3.744947052052562e-05,
2196
- "loss": 0.918,
2197
- "step": 600
2198
- },
2199
- {
2200
- "epoch": 0.6009013520280421,
2201
- "eval_loss": 1.1564297676086426,
2202
- "eval_runtime": 3.8109,
2203
- "eval_samples_per_second": 15.219,
2204
- "eval_steps_per_second": 7.61,
2205
- "step": 600
2206
- },
2207
- {
2208
- "epoch": 0.6029043565348022,
2209
- "grad_norm": 0.4420590400695801,
2210
- "learning_rate": 3.734820867760421e-05,
2211
- "loss": 0.8758,
2212
- "step": 602
2213
- },
2214
- {
2215
- "epoch": 0.6049073610415624,
2216
- "grad_norm": 0.41104549169540405,
2217
- "learning_rate": 3.724667815939794e-05,
2218
- "loss": 1.0595,
2219
- "step": 604
2220
- },
2221
- {
2222
- "epoch": 0.6069103655483225,
2223
- "grad_norm": 0.4642109274864197,
2224
- "learning_rate": 3.7144881175032174e-05,
2225
- "loss": 0.9576,
2226
- "step": 606
2227
- },
2228
- {
2229
- "epoch": 0.6089133700550826,
2230
- "grad_norm": 0.4654678404331207,
2231
- "learning_rate": 3.704281993943008e-05,
2232
- "loss": 0.9196,
2233
- "step": 608
2234
- },
2235
- {
2236
- "epoch": 0.6109163745618428,
2237
- "grad_norm": 0.44470739364624023,
2238
- "learning_rate": 3.694049667326451e-05,
2239
- "loss": 0.9326,
2240
- "step": 610
2241
- },
2242
- {
2243
- "epoch": 0.6129193790686029,
2244
- "grad_norm": 0.4389815330505371,
2245
- "learning_rate": 3.683791360290961e-05,
2246
- "loss": 0.9633,
2247
- "step": 612
2248
- },
2249
- {
2250
- "epoch": 0.614922383575363,
2251
- "grad_norm": 0.366268515586853,
2252
- "learning_rate": 3.673507296039243e-05,
2253
- "loss": 0.9876,
2254
- "step": 614
2255
- },
2256
- {
2257
- "epoch": 0.6169253880821232,
2258
- "grad_norm": 0.40563082695007324,
2259
- "learning_rate": 3.663197698334432e-05,
2260
- "loss": 0.8903,
2261
- "step": 616
2262
- },
2263
- {
2264
- "epoch": 0.6189283925888833,
2265
- "grad_norm": 0.35876786708831787,
2266
- "learning_rate": 3.6528627914952266e-05,
2267
- "loss": 0.9025,
2268
- "step": 618
2269
- },
2270
- {
2271
- "epoch": 0.6209313970956435,
2272
- "grad_norm": 0.44777098298072815,
2273
- "learning_rate": 3.6425028003910074e-05,
2274
- "loss": 0.9048,
2275
- "step": 620
2276
- },
2277
- {
2278
- "epoch": 0.6229344016024037,
2279
- "grad_norm": 0.40352246165275574,
2280
- "learning_rate": 3.6321179504369444e-05,
2281
- "loss": 0.9176,
2282
- "step": 622
2283
- },
2284
- {
2285
- "epoch": 0.6249374061091637,
2286
- "grad_norm": 0.4628620445728302,
2287
- "learning_rate": 3.6217084675890935e-05,
2288
- "loss": 0.9208,
2289
- "step": 624
2290
- },
2291
- {
2292
- "epoch": 0.6269404106159239,
2293
- "grad_norm": 0.45699334144592285,
2294
- "learning_rate": 3.611274578339477e-05,
2295
- "loss": 0.9284,
2296
- "step": 626
2297
- },
2298
- {
2299
- "epoch": 0.628943415122684,
2300
- "grad_norm": 0.45050838589668274,
2301
- "learning_rate": 3.60081650971116e-05,
2302
- "loss": 0.9417,
2303
- "step": 628
2304
- },
2305
- {
2306
- "epoch": 0.6309464196294442,
2307
- "grad_norm": 0.4145865738391876,
2308
- "learning_rate": 3.590334489253306e-05,
2309
- "loss": 0.9526,
2310
- "step": 630
2311
- },
2312
- {
2313
- "epoch": 0.6329494241362043,
2314
- "grad_norm": 0.4078468084335327,
2315
- "learning_rate": 3.5798287450362306e-05,
2316
- "loss": 0.8913,
2317
- "step": 632
2318
- },
2319
- {
2320
- "epoch": 0.6349524286429644,
2321
- "grad_norm": 0.49246945977211,
2322
- "learning_rate": 3.569299505646433e-05,
2323
- "loss": 0.862,
2324
- "step": 634
2325
- },
2326
- {
2327
- "epoch": 0.6369554331497246,
2328
- "grad_norm": 0.4269583523273468,
2329
- "learning_rate": 3.55874700018163e-05,
2330
- "loss": 0.9608,
2331
- "step": 636
2332
- },
2333
- {
2334
- "epoch": 0.6389584376564847,
2335
- "grad_norm": 0.4796135723590851,
2336
- "learning_rate": 3.548171458245765e-05,
2337
- "loss": 0.9123,
2338
- "step": 638
2339
- },
2340
- {
2341
- "epoch": 0.6409614421632449,
2342
- "grad_norm": 0.41421452164649963,
2343
- "learning_rate": 3.5375731099440135e-05,
2344
- "loss": 0.9702,
2345
- "step": 640
2346
- },
2347
- {
2348
- "epoch": 0.642964446670005,
2349
- "grad_norm": 0.4892091751098633,
2350
- "learning_rate": 3.526952185877781e-05,
2351
- "loss": 0.877,
2352
- "step": 642
2353
- },
2354
- {
2355
- "epoch": 0.6449674511767651,
2356
- "grad_norm": 0.39520540833473206,
2357
- "learning_rate": 3.516308917139678e-05,
2358
- "loss": 0.9643,
2359
- "step": 644
2360
- },
2361
- {
2362
- "epoch": 0.6469704556835253,
2363
- "grad_norm": 0.5455682873725891,
2364
- "learning_rate": 3.505643535308499e-05,
2365
- "loss": 0.9473,
2366
- "step": 646
2367
- },
2368
- {
2369
- "epoch": 0.6489734601902855,
2370
- "grad_norm": 0.40943270921707153,
2371
- "learning_rate": 3.494956272444177e-05,
2372
- "loss": 0.9506,
2373
- "step": 648
2374
- },
2375
- {
2376
- "epoch": 0.6509764646970456,
2377
- "grad_norm": 0.3957885503768921,
2378
- "learning_rate": 3.484247361082741e-05,
2379
- "loss": 0.8854,
2380
- "step": 650
2381
- },
2382
- {
2383
- "epoch": 0.6509764646970456,
2384
- "eval_loss": 1.1660796403884888,
2385
- "eval_runtime": 3.827,
2386
- "eval_samples_per_second": 15.155,
2387
- "eval_steps_per_second": 7.578,
2388
- "step": 650
2389
- },
2390
- {
2391
- "epoch": 0.6529794692038057,
2392
- "grad_norm": 0.4576199948787689,
2393
- "learning_rate": 3.473517034231251e-05,
2394
- "loss": 0.8848,
2395
- "step": 652
2396
- },
2397
- {
2398
- "epoch": 0.6549824737105658,
2399
- "grad_norm": 0.45555633306503296,
2400
- "learning_rate": 3.4627655253627323e-05,
2401
- "loss": 0.954,
2402
- "step": 654
2403
- },
2404
- {
2405
- "epoch": 0.656985478217326,
2406
- "grad_norm": 0.45799553394317627,
2407
- "learning_rate": 3.451993068411092e-05,
2408
- "loss": 0.9766,
2409
- "step": 656
2410
- },
2411
- {
2412
- "epoch": 0.6589884827240862,
2413
- "grad_norm": 0.44451501965522766,
2414
- "learning_rate": 3.441199897766031e-05,
2415
- "loss": 0.9934,
2416
- "step": 658
2417
- },
2418
- {
2419
- "epoch": 0.6609914872308462,
2420
- "grad_norm": 0.43687155842781067,
2421
- "learning_rate": 3.430386248267943e-05,
2422
- "loss": 0.8342,
2423
- "step": 660
2424
- },
2425
- {
2426
- "epoch": 0.6629944917376064,
2427
- "grad_norm": 0.385002076625824,
2428
- "learning_rate": 3.419552355202807e-05,
2429
- "loss": 0.9195,
2430
- "step": 662
2431
- },
2432
- {
2433
- "epoch": 0.6649974962443665,
2434
- "grad_norm": 0.4921188950538635,
2435
- "learning_rate": 3.408698454297067e-05,
2436
- "loss": 0.894,
2437
- "step": 664
2438
- },
2439
- {
2440
- "epoch": 0.6670005007511267,
2441
- "grad_norm": 0.45717331767082214,
2442
- "learning_rate": 3.397824781712499e-05,
2443
- "loss": 0.9223,
2444
- "step": 666
2445
- },
2446
- {
2447
- "epoch": 0.6690035052578869,
2448
- "grad_norm": 0.6077693700790405,
2449
- "learning_rate": 3.386931574041079e-05,
2450
- "loss": 0.8307,
2451
- "step": 668
2452
- },
2453
- {
2454
- "epoch": 0.6710065097646469,
2455
- "grad_norm": 0.5416433215141296,
2456
- "learning_rate": 3.376019068299832e-05,
2457
- "loss": 0.9084,
2458
- "step": 670
2459
- },
2460
- {
2461
- "epoch": 0.6730095142714071,
2462
- "grad_norm": 0.48100745677948,
2463
- "learning_rate": 3.365087501925673e-05,
2464
- "loss": 0.8687,
2465
- "step": 672
2466
- },
2467
- {
2468
- "epoch": 0.6750125187781673,
2469
- "grad_norm": 0.4744812846183777,
2470
- "learning_rate": 3.354137112770244e-05,
2471
- "loss": 0.9819,
2472
- "step": 674
2473
- },
2474
- {
2475
- "epoch": 0.6770155232849274,
2476
- "grad_norm": 0.5188727378845215,
2477
- "learning_rate": 3.343168139094738e-05,
2478
- "loss": 0.8702,
2479
- "step": 676
2480
- },
2481
- {
2482
- "epoch": 0.6790185277916875,
2483
- "grad_norm": 0.42871081829071045,
2484
- "learning_rate": 3.332180819564714e-05,
2485
- "loss": 0.9244,
2486
- "step": 678
2487
- },
2488
- {
2489
- "epoch": 0.6810215322984476,
2490
- "grad_norm": 0.3858610689640045,
2491
- "learning_rate": 3.321175393244904e-05,
2492
- "loss": 0.8371,
2493
- "step": 680
2494
- },
2495
- {
2496
- "epoch": 0.6830245368052078,
2497
- "grad_norm": 0.459778368473053,
2498
- "learning_rate": 3.310152099594013e-05,
2499
- "loss": 0.9146,
2500
- "step": 682
2501
- },
2502
- {
2503
- "epoch": 0.685027541311968,
2504
- "grad_norm": 0.36012330651283264,
2505
- "learning_rate": 3.299111178459507e-05,
2506
- "loss": 0.9806,
2507
- "step": 684
2508
- },
2509
- {
2510
- "epoch": 0.6870305458187281,
2511
- "grad_norm": 0.4208768606185913,
2512
- "learning_rate": 3.288052870072395e-05,
2513
- "loss": 0.8729,
2514
- "step": 686
2515
- },
2516
- {
2517
- "epoch": 0.6890335503254882,
2518
- "grad_norm": 0.4012265205383301,
2519
- "learning_rate": 3.2769774150420015e-05,
2520
- "loss": 0.8586,
2521
- "step": 688
2522
- },
2523
- {
2524
- "epoch": 0.6910365548322484,
2525
- "grad_norm": 0.442624032497406,
2526
- "learning_rate": 3.2658850543507334e-05,
2527
- "loss": 0.931,
2528
- "step": 690
2529
- },
2530
- {
2531
- "epoch": 0.6930395593390085,
2532
- "grad_norm": 0.3907168209552765,
2533
- "learning_rate": 3.2547760293488335e-05,
2534
- "loss": 0.9246,
2535
- "step": 692
2536
- },
2537
- {
2538
- "epoch": 0.6950425638457687,
2539
- "grad_norm": 0.4578626751899719,
2540
- "learning_rate": 3.2436505817491305e-05,
2541
- "loss": 0.9339,
2542
- "step": 694
2543
- },
2544
- {
2545
- "epoch": 0.6970455683525288,
2546
- "grad_norm": 0.49979129433631897,
2547
- "learning_rate": 3.2325089536217815e-05,
2548
- "loss": 0.9637,
2549
- "step": 696
2550
- },
2551
- {
2552
- "epoch": 0.6990485728592889,
2553
- "grad_norm": 0.41651976108551025,
2554
- "learning_rate": 3.2213513873890026e-05,
2555
- "loss": 0.9365,
2556
- "step": 698
2557
- },
2558
- {
2559
- "epoch": 0.7010515773660491,
2560
- "grad_norm": 0.4993303120136261,
2561
- "learning_rate": 3.210178125819795e-05,
2562
- "loss": 0.8978,
2563
- "step": 700
2564
- },
2565
- {
2566
- "epoch": 0.7010515773660491,
2567
- "eval_loss": 1.1489382982254028,
2568
- "eval_runtime": 3.8105,
2569
- "eval_samples_per_second": 15.221,
2570
- "eval_steps_per_second": 7.611,
2571
- "step": 700
2572
- },
2573
- {
2574
- "epoch": 0.7030545818728092,
2575
- "grad_norm": 0.5267933011054993,
2576
- "learning_rate": 3.1989894120246614e-05,
2577
- "loss": 0.8641,
2578
- "step": 702
2579
- },
2580
- {
2581
- "epoch": 0.7050575863795694,
2582
- "grad_norm": 0.5193835496902466,
2583
- "learning_rate": 3.1877854894503204e-05,
2584
- "loss": 0.9497,
2585
- "step": 704
2586
- },
2587
- {
2588
- "epoch": 0.7070605908863294,
2589
- "grad_norm": 0.43787896633148193,
2590
- "learning_rate": 3.1765666018744046e-05,
2591
- "loss": 0.8907,
2592
- "step": 706
2593
- },
2594
- {
2595
- "epoch": 0.7090635953930896,
2596
- "grad_norm": 0.418584406375885,
2597
- "learning_rate": 3.1653329934001584e-05,
2598
- "loss": 0.9517,
2599
- "step": 708
2600
- },
2601
- {
2602
- "epoch": 0.7110665998998498,
2603
- "grad_norm": 0.6064937114715576,
2604
- "learning_rate": 3.154084908451131e-05,
2605
- "loss": 0.8603,
2606
- "step": 710
2607
- },
2608
- {
2609
- "epoch": 0.7130696044066099,
2610
- "grad_norm": 0.37019243836402893,
2611
- "learning_rate": 3.142822591765851e-05,
2612
- "loss": 0.8974,
2613
- "step": 712
2614
- },
2615
- {
2616
- "epoch": 0.7150726089133701,
2617
- "grad_norm": 0.38166865706443787,
2618
- "learning_rate": 3.1315462883925025e-05,
2619
- "loss": 0.9558,
2620
- "step": 714
2621
- },
2622
- {
2623
- "epoch": 0.7170756134201302,
2624
- "grad_norm": 0.45281273126602173,
2625
- "learning_rate": 3.1202562436836e-05,
2626
- "loss": 0.9325,
2627
- "step": 716
2628
- },
2629
- {
2630
- "epoch": 0.7190786179268903,
2631
- "grad_norm": 0.4501991868019104,
2632
- "learning_rate": 3.1089527032906425e-05,
2633
- "loss": 0.9862,
2634
- "step": 718
2635
- },
2636
- {
2637
- "epoch": 0.7210816224336505,
2638
- "grad_norm": 0.43729260563850403,
2639
- "learning_rate": 3.097635913158772e-05,
2640
- "loss": 0.9339,
2641
- "step": 720
2642
- },
2643
- {
2644
- "epoch": 0.7230846269404106,
2645
- "grad_norm": 0.5757997632026672,
2646
- "learning_rate": 3.08630611952142e-05,
2647
- "loss": 0.8904,
2648
- "step": 722
2649
- },
2650
- {
2651
- "epoch": 0.7250876314471708,
2652
- "grad_norm": 0.4715934991836548,
2653
- "learning_rate": 3.0749635688949545e-05,
2654
- "loss": 0.8899,
2655
- "step": 724
2656
- },
2657
- {
2658
- "epoch": 0.7270906359539309,
2659
- "grad_norm": 0.5050368905067444,
2660
- "learning_rate": 3.063608508073311e-05,
2661
- "loss": 0.9324,
2662
- "step": 726
2663
- },
2664
- {
2665
- "epoch": 0.729093640460691,
2666
- "grad_norm": 0.6013456583023071,
2667
- "learning_rate": 3.052241184122625e-05,
2668
- "loss": 0.9626,
2669
- "step": 728
2670
- },
2671
- {
2672
- "epoch": 0.7310966449674512,
2673
- "grad_norm": 0.45164185762405396,
2674
- "learning_rate": 3.0408618443758557e-05,
2675
- "loss": 0.8899,
2676
- "step": 730
2677
- },
2678
- {
2679
- "epoch": 0.7330996494742114,
2680
- "grad_norm": 0.4240935444831848,
2681
- "learning_rate": 3.0294707364274067e-05,
2682
- "loss": 0.9151,
2683
- "step": 732
2684
- },
2685
- {
2686
- "epoch": 0.7351026539809714,
2687
- "grad_norm": 0.548370361328125,
2688
- "learning_rate": 3.018068108127735e-05,
2689
- "loss": 0.8976,
2690
- "step": 734
2691
- },
2692
- {
2693
- "epoch": 0.7371056584877316,
2694
- "grad_norm": 0.4141191840171814,
2695
- "learning_rate": 3.0066542075779602e-05,
2696
- "loss": 0.9035,
2697
- "step": 736
2698
- },
2699
- {
2700
- "epoch": 0.7391086629944917,
2701
- "grad_norm": 0.4236369729042053,
2702
- "learning_rate": 2.9952292831244676e-05,
2703
- "loss": 0.8906,
2704
- "step": 738
2705
- },
2706
- {
2707
- "epoch": 0.7411116675012519,
2708
- "grad_norm": 0.3607020974159241,
2709
- "learning_rate": 2.9837935833535037e-05,
2710
- "loss": 0.9423,
2711
- "step": 740
2712
- },
2713
- {
2714
- "epoch": 0.7431146720080121,
2715
- "grad_norm": 0.4230390191078186,
2716
- "learning_rate": 2.9723473570857642e-05,
2717
- "loss": 0.9092,
2718
- "step": 742
2719
- },
2720
- {
2721
- "epoch": 0.7451176765147721,
2722
- "grad_norm": 0.3703189492225647,
2723
- "learning_rate": 2.960890853370985e-05,
2724
- "loss": 0.8663,
2725
- "step": 744
2726
- },
2727
- {
2728
- "epoch": 0.7471206810215323,
2729
- "grad_norm": 0.49546095728874207,
2730
- "learning_rate": 2.9494243214825208e-05,
2731
- "loss": 0.8875,
2732
- "step": 746
2733
- },
2734
- {
2735
- "epoch": 0.7491236855282924,
2736
- "grad_norm": 0.44254347681999207,
2737
- "learning_rate": 2.9379480109119213e-05,
2738
- "loss": 0.923,
2739
- "step": 748
2740
- },
2741
- {
2742
- "epoch": 0.7511266900350526,
2743
- "grad_norm": 0.4102881848812103,
2744
- "learning_rate": 2.9264621713635028e-05,
2745
- "loss": 0.9357,
2746
- "step": 750
2747
- },
2748
- {
2749
- "epoch": 0.7511266900350526,
2750
- "eval_loss": 1.1563700437545776,
2751
- "eval_runtime": 3.8041,
2752
- "eval_samples_per_second": 15.247,
2753
- "eval_steps_per_second": 7.623,
2754
- "step": 750
2755
- },
2756
- {
2757
- "epoch": 0.7531296945418127,
2758
- "grad_norm": 0.42651745676994324,
2759
- "learning_rate": 2.914967052748917e-05,
2760
- "loss": 0.9277,
2761
- "step": 752
2762
- },
2763
- {
2764
- "epoch": 0.7551326990485728,
2765
- "grad_norm": 0.37917560338974,
2766
- "learning_rate": 2.9034629051817096e-05,
2767
- "loss": 0.9717,
2768
- "step": 754
2769
- },
2770
- {
2771
- "epoch": 0.757135703555333,
2772
- "grad_norm": 0.4591340720653534,
2773
- "learning_rate": 2.891949978971883e-05,
2774
- "loss": 0.9336,
2775
- "step": 756
2776
- },
2777
- {
2778
- "epoch": 0.7591387080620932,
2779
- "grad_norm": 0.5880463719367981,
2780
- "learning_rate": 2.8804285246204438e-05,
2781
- "loss": 0.9098,
2782
- "step": 758
2783
- },
2784
- {
2785
- "epoch": 0.7611417125688533,
2786
- "grad_norm": 0.39928752183914185,
2787
- "learning_rate": 2.8688987928139588e-05,
2788
- "loss": 0.8258,
2789
- "step": 760
2790
- },
2791
- {
2792
- "epoch": 0.7631447170756134,
2793
- "grad_norm": 0.5559530258178711,
2794
- "learning_rate": 2.8573610344190975e-05,
2795
- "loss": 0.8728,
2796
- "step": 762
2797
- },
2798
- {
2799
- "epoch": 0.7651477215823735,
2800
- "grad_norm": 0.49999016523361206,
2801
- "learning_rate": 2.8458155004771724e-05,
2802
- "loss": 1.0135,
2803
- "step": 764
2804
- },
2805
- {
2806
- "epoch": 0.7671507260891337,
2807
- "grad_norm": 0.35017403960227966,
2808
- "learning_rate": 2.8342624421986797e-05,
2809
- "loss": 0.8929,
2810
- "step": 766
2811
- },
2812
- {
2813
- "epoch": 0.7691537305958939,
2814
- "grad_norm": 0.48860040307044983,
2815
- "learning_rate": 2.822702110957831e-05,
2816
- "loss": 0.8784,
2817
- "step": 768
2818
- },
2819
- {
2820
- "epoch": 0.771156735102654,
2821
- "grad_norm": 0.4092211425304413,
2822
- "learning_rate": 2.811134758287085e-05,
2823
- "loss": 0.8643,
2824
- "step": 770
2825
- },
2826
- {
2827
- "epoch": 0.7731597396094141,
2828
- "grad_norm": 0.517197847366333,
2829
- "learning_rate": 2.799560635871675e-05,
2830
- "loss": 0.9033,
2831
- "step": 772
2832
- },
2833
- {
2834
- "epoch": 0.7751627441161743,
2835
- "grad_norm": 0.40133723616600037,
2836
- "learning_rate": 2.78797999554413e-05,
2837
- "loss": 0.9308,
2838
- "step": 774
2839
- },
2840
- {
2841
- "epoch": 0.7771657486229344,
2842
- "grad_norm": 0.4061048626899719,
2843
- "learning_rate": 2.7763930892787992e-05,
2844
- "loss": 0.9076,
2845
- "step": 776
2846
- },
2847
- {
2848
- "epoch": 0.7791687531296946,
2849
- "grad_norm": 0.5977723002433777,
2850
- "learning_rate": 2.7648001691863673e-05,
2851
- "loss": 0.8699,
2852
- "step": 778
2853
- },
2854
- {
2855
- "epoch": 0.7811717576364546,
2856
- "grad_norm": 0.3865041136741638,
2857
- "learning_rate": 2.753201487508369e-05,
2858
- "loss": 0.9565,
2859
- "step": 780
2860
- },
2861
- {
2862
- "epoch": 0.7831747621432148,
2863
- "grad_norm": 0.49114081263542175,
2864
- "learning_rate": 2.7415972966117014e-05,
2865
- "loss": 0.8533,
2866
- "step": 782
2867
- },
2868
- {
2869
- "epoch": 0.785177766649975,
2870
- "grad_norm": 0.3852551281452179,
2871
- "learning_rate": 2.7299878489831316e-05,
2872
- "loss": 0.8556,
2873
- "step": 784
2874
- },
2875
- {
2876
- "epoch": 0.7871807711567351,
2877
- "grad_norm": 0.4888080060482025,
2878
- "learning_rate": 2.718373397223804e-05,
2879
- "loss": 0.8734,
2880
- "step": 786
2881
- },
2882
- {
2883
- "epoch": 0.7891837756634953,
2884
- "grad_norm": 0.4077546298503876,
2885
- "learning_rate": 2.706754194043746e-05,
2886
- "loss": 0.9392,
2887
- "step": 788
2888
- },
2889
- {
2890
- "epoch": 0.7911867801702553,
2891
- "grad_norm": 0.408587247133255,
2892
- "learning_rate": 2.6951304922563642e-05,
2893
- "loss": 0.8565,
2894
- "step": 790
2895
- },
2896
- {
2897
- "epoch": 0.7931897846770155,
2898
- "grad_norm": 0.45802196860313416,
2899
- "learning_rate": 2.6835025447729495e-05,
2900
- "loss": 0.9535,
2901
- "step": 792
2902
- },
2903
- {
2904
- "epoch": 0.7951927891837757,
2905
- "grad_norm": 0.4353581964969635,
2906
- "learning_rate": 2.6718706045971726e-05,
2907
- "loss": 0.8428,
2908
- "step": 794
2909
- },
2910
- {
2911
- "epoch": 0.7971957936905358,
2912
- "grad_norm": 0.4018676280975342,
2913
- "learning_rate": 2.6602349248195746e-05,
2914
- "loss": 0.8754,
2915
- "step": 796
2916
- },
2917
- {
2918
- "epoch": 0.7991987981972959,
2919
- "grad_norm": 0.4653930068016052,
2920
- "learning_rate": 2.6485957586120663e-05,
2921
- "loss": 0.7725,
2922
- "step": 798
2923
- },
2924
- {
2925
- "epoch": 0.801201802704056,
2926
- "grad_norm": 0.5806179642677307,
2927
- "learning_rate": 2.6369533592224172e-05,
2928
- "loss": 0.8955,
2929
- "step": 800
2930
- },
2931
- {
2932
- "epoch": 0.801201802704056,
2933
- "eval_loss": 1.1470181941986084,
2934
- "eval_runtime": 3.7967,
2935
- "eval_samples_per_second": 15.277,
2936
- "eval_steps_per_second": 7.638,
2937
- "step": 800
2938
- },
2939
- {
2940
- "epoch": 0.8032048072108162,
2941
- "grad_norm": 0.4590522348880768,
2942
- "learning_rate": 2.6253079799687435e-05,
2943
- "loss": 0.9738,
2944
- "step": 802
2945
- },
2946
- {
2947
- "epoch": 0.8052078117175764,
2948
- "grad_norm": 0.5188782811164856,
2949
- "learning_rate": 2.613659874233999e-05,
2950
- "loss": 0.9573,
2951
- "step": 804
2952
- },
2953
- {
2954
- "epoch": 0.8072108162243365,
2955
- "grad_norm": 0.4585997760295868,
2956
- "learning_rate": 2.6020092954604614e-05,
2957
- "loss": 0.948,
2958
- "step": 806
2959
- },
2960
- {
2961
- "epoch": 0.8092138207310966,
2962
- "grad_norm": 0.39974266290664673,
2963
- "learning_rate": 2.5903564971442167e-05,
2964
- "loss": 1.0123,
2965
- "step": 808
2966
- },
2967
- {
2968
- "epoch": 0.8112168252378568,
2969
- "grad_norm": 0.4484356641769409,
2970
- "learning_rate": 2.5787017328296447e-05,
2971
- "loss": 0.8262,
2972
- "step": 810
2973
- },
2974
- {
2975
- "epoch": 0.8132198297446169,
2976
- "grad_norm": 0.4441506862640381,
2977
- "learning_rate": 2.5670452561039004e-05,
2978
- "loss": 0.8683,
2979
- "step": 812
2980
- },
2981
- {
2982
- "epoch": 0.8152228342513771,
2983
- "grad_norm": 0.6077110171318054,
2984
- "learning_rate": 2.555387320591401e-05,
2985
- "loss": 0.8657,
2986
- "step": 814
2987
- },
2988
- {
2989
- "epoch": 0.8172258387581373,
2990
- "grad_norm": 0.3740634322166443,
2991
- "learning_rate": 2.5437281799483004e-05,
2992
- "loss": 0.9215,
2993
- "step": 816
2994
- },
2995
- {
2996
- "epoch": 0.8192288432648973,
2997
- "grad_norm": 0.516426682472229,
2998
- "learning_rate": 2.5320680878569768e-05,
2999
- "loss": 0.8907,
3000
- "step": 818
3001
- },
3002
- {
3003
- "epoch": 0.8212318477716575,
3004
- "grad_norm": 0.42550894618034363,
3005
- "learning_rate": 2.5204072980205092e-05,
3006
- "loss": 0.9188,
3007
- "step": 820
3008
- },
3009
- {
3010
- "epoch": 0.8232348522784176,
3011
- "grad_norm": 0.5615983605384827,
3012
- "learning_rate": 2.508746064157159e-05,
3013
- "loss": 1.0489,
3014
- "step": 822
3015
- },
3016
- {
3017
- "epoch": 0.8252378567851778,
3018
- "grad_norm": 0.4470774233341217,
3019
- "learning_rate": 2.4970846399948487e-05,
3020
- "loss": 0.8668,
3021
- "step": 824
3022
- },
3023
- {
3024
- "epoch": 0.8272408612919379,
3025
- "grad_norm": 0.440336138010025,
3026
- "learning_rate": 2.4854232792656394e-05,
3027
- "loss": 0.8658,
3028
- "step": 826
3029
- },
3030
- {
3031
- "epoch": 0.829243865798698,
3032
- "grad_norm": 0.41719090938568115,
3033
- "learning_rate": 2.473762235700214e-05,
3034
- "loss": 0.9103,
3035
- "step": 828
3036
- },
3037
- {
3038
- "epoch": 0.8312468703054582,
3039
- "grad_norm": 0.4663768410682678,
3040
- "learning_rate": 2.462101763022356e-05,
3041
- "loss": 0.8621,
3042
- "step": 830
3043
- },
3044
- {
3045
- "epoch": 0.8332498748122183,
3046
- "grad_norm": 0.4149011969566345,
3047
- "learning_rate": 2.4504421149434233e-05,
3048
- "loss": 0.82,
3049
- "step": 832
3050
- },
3051
- {
3052
- "epoch": 0.8352528793189785,
3053
- "grad_norm": 0.4140399992465973,
3054
- "learning_rate": 2.4387835451568355e-05,
3055
- "loss": 0.9775,
3056
- "step": 834
3057
- },
3058
- {
3059
- "epoch": 0.8372558838257386,
3060
- "grad_norm": 0.44181761145591736,
3061
- "learning_rate": 2.427126307332549e-05,
3062
- "loss": 0.8591,
3063
- "step": 836
3064
- },
3065
- {
3066
- "epoch": 0.8392588883324987,
3067
- "grad_norm": 0.4710381031036377,
3068
- "learning_rate": 2.4154706551115384e-05,
3069
- "loss": 0.8738,
3070
- "step": 838
3071
- },
3072
- {
3073
- "epoch": 0.8412618928392589,
3074
- "grad_norm": 0.5030112266540527,
3075
- "learning_rate": 2.4038168421002794e-05,
3076
- "loss": 0.9506,
3077
- "step": 840
3078
- },
3079
- {
3080
- "epoch": 0.8432648973460191,
3081
- "grad_norm": 0.5199030041694641,
3082
- "learning_rate": 2.3921651218652293e-05,
3083
- "loss": 0.8508,
3084
- "step": 842
3085
- },
3086
- {
3087
- "epoch": 0.8452679018527791,
3088
- "grad_norm": 0.5105124115943909,
3089
- "learning_rate": 2.380515747927312e-05,
3090
- "loss": 0.8432,
3091
- "step": 844
3092
- },
3093
- {
3094
- "epoch": 0.8472709063595393,
3095
- "grad_norm": 0.49101004004478455,
3096
- "learning_rate": 2.3688689737563967e-05,
3097
- "loss": 0.9014,
3098
- "step": 846
3099
- },
3100
- {
3101
- "epoch": 0.8492739108662994,
3102
- "grad_norm": 0.4043116569519043,
3103
- "learning_rate": 2.3572250527657895e-05,
3104
- "loss": 0.9011,
3105
- "step": 848
3106
- },
3107
- {
3108
- "epoch": 0.8512769153730596,
3109
- "grad_norm": 0.4326643645763397,
3110
- "learning_rate": 2.345584238306713e-05,
3111
- "loss": 0.8597,
3112
- "step": 850
3113
- },
3114
- {
3115
- "epoch": 0.8512769153730596,
3116
- "eval_loss": 1.1495444774627686,
3117
- "eval_runtime": 3.7962,
3118
- "eval_samples_per_second": 15.279,
3119
- "eval_steps_per_second": 7.639,
3120
- "step": 850
3121
- },
3122
- {
3123
- "epoch": 0.8532799198798198,
3124
- "grad_norm": 0.5106630325317383,
3125
- "learning_rate": 2.3339467836628017e-05,
3126
- "loss": 0.9167,
3127
- "step": 852
3128
- },
3129
- {
3130
- "epoch": 0.8552829243865798,
3131
- "grad_norm": 0.42315831780433655,
3132
- "learning_rate": 2.322312942044581e-05,
3133
- "loss": 0.9248,
3134
- "step": 854
3135
- },
3136
- {
3137
- "epoch": 0.85728592889334,
3138
- "grad_norm": 0.4706262946128845,
3139
- "learning_rate": 2.3106829665839677e-05,
3140
- "loss": 0.8772,
3141
- "step": 856
3142
- },
3143
- {
3144
- "epoch": 0.8592889334001002,
3145
- "grad_norm": 0.7145017385482788,
3146
- "learning_rate": 2.2990571103287567e-05,
3147
- "loss": 0.9167,
3148
- "step": 858
3149
- },
3150
- {
3151
- "epoch": 0.8612919379068603,
3152
- "grad_norm": 0.47455379366874695,
3153
- "learning_rate": 2.2874356262371134e-05,
3154
- "loss": 0.9008,
3155
- "step": 860
3156
- },
3157
- {
3158
- "epoch": 0.8632949424136205,
3159
- "grad_norm": 0.41509053111076355,
3160
- "learning_rate": 2.2758187671720772e-05,
3161
- "loss": 0.8976,
3162
- "step": 862
3163
- },
3164
- {
3165
- "epoch": 0.8652979469203805,
3166
- "grad_norm": 0.5434259176254272,
3167
- "learning_rate": 2.2642067858960514e-05,
3168
- "loss": 0.8593,
3169
- "step": 864
3170
- },
3171
- {
3172
- "epoch": 0.8673009514271407,
3173
- "grad_norm": 0.43615275621414185,
3174
- "learning_rate": 2.2525999350653095e-05,
3175
- "loss": 0.9305,
3176
- "step": 866
3177
- },
3178
- {
3179
- "epoch": 0.8693039559339009,
3180
- "grad_norm": 0.5843902230262756,
3181
- "learning_rate": 2.2409984672244934e-05,
3182
- "loss": 0.8521,
3183
- "step": 868
3184
- },
3185
- {
3186
- "epoch": 0.871306960440661,
3187
- "grad_norm": 0.35046350955963135,
3188
- "learning_rate": 2.2294026348011223e-05,
3189
- "loss": 0.8392,
3190
- "step": 870
3191
- },
3192
- {
3193
- "epoch": 0.8733099649474211,
3194
- "grad_norm": 0.4275960624217987,
3195
- "learning_rate": 2.2178126901000996e-05,
3196
- "loss": 0.8883,
3197
- "step": 872
3198
- },
3199
- {
3200
- "epoch": 0.8753129694541812,
3201
- "grad_norm": 1.0779649019241333,
3202
- "learning_rate": 2.2062288852982182e-05,
3203
- "loss": 0.9226,
3204
- "step": 874
3205
- },
3206
- {
3207
- "epoch": 0.8773159739609414,
3208
- "grad_norm": 0.43578073382377625,
3209
- "learning_rate": 2.1946514724386828e-05,
3210
- "loss": 0.877,
3211
- "step": 876
3212
- },
3213
- {
3214
- "epoch": 0.8793189784677016,
3215
- "grad_norm": 0.5768626928329468,
3216
- "learning_rate": 2.1830807034256154e-05,
3217
- "loss": 0.8844,
3218
- "step": 878
3219
- },
3220
- {
3221
- "epoch": 0.8813219829744617,
3222
- "grad_norm": 0.4431218206882477,
3223
- "learning_rate": 2.1715168300185848e-05,
3224
- "loss": 0.9106,
3225
- "step": 880
3226
- },
3227
- {
3228
- "epoch": 0.8833249874812218,
3229
- "grad_norm": 0.44507092237472534,
3230
- "learning_rate": 2.1599601038271186e-05,
3231
- "loss": 0.9349,
3232
- "step": 882
3233
- },
3234
- {
3235
- "epoch": 0.885327991987982,
3236
- "grad_norm": 0.42408713698387146,
3237
- "learning_rate": 2.148410776305237e-05,
3238
- "loss": 0.8704,
3239
- "step": 884
3240
- },
3241
- {
3242
- "epoch": 0.8873309964947421,
3243
- "grad_norm": 0.45474737882614136,
3244
- "learning_rate": 2.136869098745978e-05,
3245
- "loss": 0.8854,
3246
- "step": 886
3247
- },
3248
- {
3249
- "epoch": 0.8893340010015023,
3250
- "grad_norm": 0.42297935485839844,
3251
- "learning_rate": 2.125335322275928e-05,
3252
- "loss": 0.8438,
3253
- "step": 888
3254
- },
3255
- {
3256
- "epoch": 0.8913370055082624,
3257
- "grad_norm": 0.5911722779273987,
3258
- "learning_rate": 2.1138096978497617e-05,
3259
- "loss": 0.8021,
3260
- "step": 890
3261
- },
3262
- {
3263
- "epoch": 0.8933400100150225,
3264
- "grad_norm": 0.5190030336380005,
3265
- "learning_rate": 2.1022924762447767e-05,
3266
- "loss": 0.8814,
3267
- "step": 892
3268
- },
3269
- {
3270
- "epoch": 0.8953430145217827,
3271
- "grad_norm": 0.4616602957248688,
3272
- "learning_rate": 2.0907839080554443e-05,
3273
- "loss": 0.9051,
3274
- "step": 894
3275
- },
3276
- {
3277
- "epoch": 0.8973460190285428,
3278
- "grad_norm": 0.6448442935943604,
3279
- "learning_rate": 2.079284243687948e-05,
3280
- "loss": 0.8667,
3281
- "step": 896
3282
- },
3283
- {
3284
- "epoch": 0.899349023535303,
3285
- "grad_norm": 0.46473053097724915,
3286
- "learning_rate": 2.067793733354743e-05,
3287
- "loss": 0.8543,
3288
- "step": 898
3289
- },
3290
- {
3291
- "epoch": 0.901352028042063,
3292
- "grad_norm": 0.47952961921691895,
3293
- "learning_rate": 2.0563126270691097e-05,
3294
- "loss": 0.869,
3295
- "step": 900
3296
- },
3297
- {
3298
- "epoch": 0.901352028042063,
3299
- "eval_loss": 1.1418862342834473,
3300
- "eval_runtime": 3.8348,
3301
- "eval_samples_per_second": 15.125,
3302
- "eval_steps_per_second": 7.562,
3303
- "step": 900
3304
- },
3305
- {
3306
- "epoch": 0.9033550325488232,
3307
- "grad_norm": 0.4736415147781372,
3308
- "learning_rate": 2.044841174639708e-05,
3309
- "loss": 0.8937,
3310
- "step": 902
3311
- },
3312
- {
3313
- "epoch": 0.9053580370555834,
3314
- "grad_norm": 0.48480942845344543,
3315
- "learning_rate": 2.0333796256651533e-05,
3316
- "loss": 0.9146,
3317
- "step": 904
3318
- },
3319
- {
3320
- "epoch": 0.9073610415623435,
3321
- "grad_norm": 0.519432544708252,
3322
- "learning_rate": 2.0219282295285737e-05,
3323
- "loss": 0.8845,
3324
- "step": 906
3325
- },
3326
- {
3327
- "epoch": 0.9093640460691037,
3328
- "grad_norm": 0.47801777720451355,
3329
- "learning_rate": 2.0104872353921927e-05,
3330
- "loss": 0.8701,
3331
- "step": 908
3332
- },
3333
- {
3334
- "epoch": 0.9113670505758638,
3335
- "grad_norm": 0.5259170532226562,
3336
- "learning_rate": 1.999056892191904e-05,
3337
- "loss": 0.9299,
3338
- "step": 910
3339
- },
3340
- {
3341
- "epoch": 0.9133700550826239,
3342
- "grad_norm": 0.503354549407959,
3343
- "learning_rate": 1.9876374486318543e-05,
3344
- "loss": 0.8895,
3345
- "step": 912
3346
- },
3347
- {
3348
- "epoch": 0.9153730595893841,
3349
- "grad_norm": 0.5313873887062073,
3350
- "learning_rate": 1.9762291531790355e-05,
3351
- "loss": 0.8254,
3352
- "step": 914
3353
- },
3354
- {
3355
- "epoch": 0.9173760640961443,
3356
- "grad_norm": 0.5693700313568115,
3357
- "learning_rate": 1.9648322540578744e-05,
3358
- "loss": 0.8246,
3359
- "step": 916
3360
- },
3361
- {
3362
- "epoch": 0.9193790686029043,
3363
- "grad_norm": 0.5147340893745422,
3364
- "learning_rate": 1.9534469992448358e-05,
3365
- "loss": 0.8987,
3366
- "step": 918
3367
- },
3368
- {
3369
- "epoch": 0.9213820731096645,
3370
- "grad_norm": 0.718410849571228,
3371
- "learning_rate": 1.9420736364630215e-05,
3372
- "loss": 0.8385,
3373
- "step": 920
3374
- },
3375
- {
3376
- "epoch": 0.9233850776164246,
3377
- "grad_norm": 0.49588289856910706,
3378
- "learning_rate": 1.9307124131767877e-05,
3379
- "loss": 0.8652,
3380
- "step": 922
3381
- },
3382
- {
3383
- "epoch": 0.9253880821231848,
3384
- "grad_norm": 0.6265762448310852,
3385
- "learning_rate": 1.9193635765863523e-05,
3386
- "loss": 0.8964,
3387
- "step": 924
3388
- },
3389
- {
3390
- "epoch": 0.927391086629945,
3391
- "grad_norm": 0.4153289496898651,
3392
- "learning_rate": 1.9080273736224236e-05,
3393
- "loss": 0.9286,
3394
- "step": 926
3395
- },
3396
- {
3397
- "epoch": 0.929394091136705,
3398
- "grad_norm": 0.6794211864471436,
3399
- "learning_rate": 1.8967040509408253e-05,
3400
- "loss": 0.9141,
3401
- "step": 928
3402
- },
3403
- {
3404
- "epoch": 0.9313970956434652,
3405
- "grad_norm": 0.595132052898407,
3406
- "learning_rate": 1.885393854917124e-05,
3407
- "loss": 0.8353,
3408
- "step": 930
3409
- },
3410
- {
3411
- "epoch": 0.9334001001502253,
3412
- "grad_norm": 0.4146586060523987,
3413
- "learning_rate": 1.8740970316412793e-05,
3414
- "loss": 0.898,
3415
- "step": 932
3416
- },
3417
- {
3418
- "epoch": 0.9354031046569855,
3419
- "grad_norm": 0.5133841633796692,
3420
- "learning_rate": 1.8628138269122773e-05,
3421
- "loss": 0.8648,
3422
- "step": 934
3423
- },
3424
- {
3425
- "epoch": 0.9374061091637457,
3426
- "grad_norm": 0.4042494595050812,
3427
- "learning_rate": 1.8515444862327946e-05,
3428
- "loss": 0.9285,
3429
- "step": 936
3430
- },
3431
- {
3432
- "epoch": 0.9394091136705057,
3433
- "grad_norm": 0.4541870057582855,
3434
- "learning_rate": 1.8402892548038453e-05,
3435
- "loss": 0.905,
3436
- "step": 938
3437
- },
3438
- {
3439
- "epoch": 0.9414121181772659,
3440
- "grad_norm": 0.4241974949836731,
3441
- "learning_rate": 1.829048377519455e-05,
3442
- "loss": 0.9802,
3443
- "step": 940
3444
- },
3445
- {
3446
- "epoch": 0.943415122684026,
3447
- "grad_norm": 0.5843325257301331,
3448
- "learning_rate": 1.8178220989613254e-05,
3449
- "loss": 0.8694,
3450
- "step": 942
3451
- },
3452
- {
3453
- "epoch": 0.9454181271907862,
3454
- "grad_norm": 0.3579271137714386,
3455
- "learning_rate": 1.806610663393517e-05,
3456
- "loss": 0.9004,
3457
- "step": 944
3458
- },
3459
- {
3460
- "epoch": 0.9474211316975463,
3461
- "grad_norm": 0.409402459859848,
3462
- "learning_rate": 1.795414314757134e-05,
3463
- "loss": 0.9436,
3464
- "step": 946
3465
- },
3466
- {
3467
- "epoch": 0.9494241362043064,
3468
- "grad_norm": 0.40799620747566223,
3469
- "learning_rate": 1.784233296665012e-05,
3470
- "loss": 0.8883,
3471
- "step": 948
3472
- },
3473
- {
3474
- "epoch": 0.9514271407110666,
3475
- "grad_norm": 0.45501673221588135,
3476
- "learning_rate": 1.773067852396426e-05,
3477
- "loss": 0.9641,
3478
- "step": 950
3479
- },
3480
- {
3481
- "epoch": 0.9514271407110666,
3482
- "eval_loss": 1.1456818580627441,
3483
- "eval_runtime": 3.8046,
3484
- "eval_samples_per_second": 15.245,
3485
- "eval_steps_per_second": 7.622,
3486
- "step": 950
3487
- },
3488
- {
3489
- "epoch": 0.9534301452178268,
3490
- "grad_norm": 0.4748212695121765,
3491
- "learning_rate": 1.761918224891787e-05,
3492
- "loss": 0.8753,
3493
- "step": 952
3494
- },
3495
- {
3496
- "epoch": 0.9554331497245869,
3497
- "grad_norm": 0.6242424249649048,
3498
- "learning_rate": 1.7507846567473644e-05,
3499
- "loss": 0.8713,
3500
- "step": 954
3501
- },
3502
- {
3503
- "epoch": 0.957436154231347,
3504
- "grad_norm": 0.42941513657569885,
3505
- "learning_rate": 1.7396673902100035e-05,
3506
- "loss": 0.9128,
3507
- "step": 956
3508
- },
3509
- {
3510
- "epoch": 0.9594391587381071,
3511
- "grad_norm": 0.44053131341934204,
3512
- "learning_rate": 1.728566667171854e-05,
3513
- "loss": 0.8996,
3514
- "step": 958
3515
- },
3516
- {
3517
- "epoch": 0.9614421632448673,
3518
- "grad_norm": 0.6191515922546387,
3519
- "learning_rate": 1.71748272916511e-05,
3520
- "loss": 0.8114,
3521
- "step": 960
3522
- },
3523
- {
3524
- "epoch": 0.9634451677516275,
3525
- "grad_norm": 0.40307995676994324,
3526
- "learning_rate": 1.7064158173567514e-05,
3527
- "loss": 0.8587,
3528
- "step": 962
3529
- },
3530
- {
3531
- "epoch": 0.9654481722583875,
3532
- "grad_norm": 0.3541308641433716,
3533
- "learning_rate": 1.695366172543299e-05,
3534
- "loss": 0.9487,
3535
- "step": 964
3536
- },
3537
- {
3538
- "epoch": 0.9674511767651477,
3539
- "grad_norm": 0.4575124979019165,
3540
- "learning_rate": 1.6843340351455726e-05,
3541
- "loss": 0.9219,
3542
- "step": 966
3543
- },
3544
- {
3545
- "epoch": 0.9694541812719079,
3546
- "grad_norm": 0.4024929702281952,
3547
- "learning_rate": 1.6733196452034653e-05,
3548
- "loss": 0.9609,
3549
- "step": 968
3550
- },
3551
- {
3552
- "epoch": 0.971457185778668,
3553
- "grad_norm": 0.4288537800312042,
3554
- "learning_rate": 1.662323242370711e-05,
3555
- "loss": 0.9131,
3556
- "step": 970
3557
- },
3558
- {
3559
- "epoch": 0.9734601902854282,
3560
- "grad_norm": 0.3629342317581177,
3561
- "learning_rate": 1.6513450659096804e-05,
3562
- "loss": 0.8327,
3563
- "step": 972
3564
- },
3565
- {
3566
- "epoch": 0.9754631947921882,
3567
- "grad_norm": 0.40302079916000366,
3568
- "learning_rate": 1.64038535468617e-05,
3569
- "loss": 0.9035,
3570
- "step": 974
3571
- },
3572
- {
3573
- "epoch": 0.9774661992989484,
3574
- "grad_norm": 0.44683897495269775,
3575
- "learning_rate": 1.629444347164202e-05,
3576
- "loss": 0.9142,
3577
- "step": 976
3578
- },
3579
- {
3580
- "epoch": 0.9794692038057086,
3581
- "grad_norm": 0.6119024157524109,
3582
- "learning_rate": 1.6185222814008433e-05,
3583
- "loss": 0.8105,
3584
- "step": 978
3585
- },
3586
- {
3587
- "epoch": 0.9814722083124687,
3588
- "grad_norm": 0.39314714074134827,
3589
- "learning_rate": 1.6076193950410172e-05,
3590
- "loss": 0.8817,
3591
- "step": 980
3592
- },
3593
- {
3594
- "epoch": 0.9834752128192289,
3595
- "grad_norm": 0.465087354183197,
3596
- "learning_rate": 1.5967359253123403e-05,
3597
- "loss": 0.8979,
3598
- "step": 982
3599
- },
3600
- {
3601
- "epoch": 0.985478217325989,
3602
- "grad_norm": 0.5371639728546143,
3603
- "learning_rate": 1.5858721090199565e-05,
3604
- "loss": 0.9335,
3605
- "step": 984
3606
- },
3607
- {
3608
- "epoch": 0.9874812218327491,
3609
- "grad_norm": 0.5564991235733032,
3610
- "learning_rate": 1.5750281825413836e-05,
3611
- "loss": 0.9051,
3612
- "step": 986
3613
- },
3614
- {
3615
- "epoch": 0.9894842263395093,
3616
- "grad_norm": 0.40404555201530457,
3617
- "learning_rate": 1.5642043818213757e-05,
3618
- "loss": 0.9676,
3619
- "step": 988
3620
- },
3621
- {
3622
- "epoch": 0.9914872308462694,
3623
- "grad_norm": 0.4462992548942566,
3624
- "learning_rate": 1.5534009423667827e-05,
3625
- "loss": 0.8869,
3626
- "step": 990
3627
- },
3628
- {
3629
- "epoch": 0.9934902353530295,
3630
- "grad_norm": 0.4584622085094452,
3631
- "learning_rate": 1.5426180992414318e-05,
3632
- "loss": 0.9093,
3633
- "step": 992
3634
- },
3635
- {
3636
- "epoch": 0.9954932398597897,
3637
- "grad_norm": 0.48583951592445374,
3638
- "learning_rate": 1.5318560870610065e-05,
3639
- "loss": 0.8587,
3640
- "step": 994
3641
- },
3642
- {
3643
- "epoch": 0.9974962443665498,
3644
- "grad_norm": 0.5246539115905762,
3645
- "learning_rate": 1.5211151399879506e-05,
3646
- "loss": 0.8145,
3647
- "step": 996
3648
- },
3649
- {
3650
- "epoch": 0.99949924887331,
3651
- "grad_norm": 0.5616730451583862,
3652
- "learning_rate": 1.510395491726363e-05,
3653
- "loss": 0.9115,
3654
- "step": 998
3655
- },
3656
- {
3657
- "epoch": 1.00100150225338,
3658
- "grad_norm": 0.398170530796051,
3659
- "learning_rate": 1.4996973755169219e-05,
3660
- "loss": 0.674,
3661
- "step": 1000
3662
- },
3663
- {
3664
- "epoch": 1.00100150225338,
3665
- "eval_loss": 1.1421712636947632,
3666
- "eval_runtime": 3.813,
3667
- "eval_samples_per_second": 15.211,
3668
- "eval_steps_per_second": 7.606,
3669
- "step": 1000
3670
- }
3671
- ],
3672
- "logging_steps": 2,
3673
- "max_steps": 1497,
3674
- "num_input_tokens_seen": 0,
3675
- "num_train_epochs": 2,
3676
- "save_steps": 100,
3677
- "stateful_callbacks": {
3678
- "TrainerControl": {
3679
- "args": {
3680
- "should_epoch_stop": false,
3681
- "should_evaluate": false,
3682
- "should_log": false,
3683
- "should_save": true,
3684
- "should_training_stop": false
3685
- },
3686
- "attributes": {}
3687
- }
3688
- },
3689
- "total_flos": 9.382713588973568e+16,
3690
- "train_batch_size": 2,
3691
- "trial_name": null,
3692
- "trial_params": null
3693
- }