Nessii013 commited on
Commit
027bc54
·
1 Parent(s): 86452a8

Upload trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer_state.json +345 -345
trainer_state.json CHANGED
@@ -10,153 +10,153 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.008247422680412371,
13
- "grad_norm": 16.375,
14
  "learning_rate": 6.666666666666667e-06,
15
- "loss": 0.8858,
16
  "num_input_tokens_seen": 1413808,
17
  "step": 4
18
  },
19
  {
20
  "epoch": 0.016494845360824743,
21
- "grad_norm": 4.1875,
22
  "learning_rate": 1.3333333333333333e-05,
23
- "loss": 0.4088,
24
  "num_input_tokens_seen": 2866496,
25
  "step": 8
26
  },
27
  {
28
  "epoch": 0.024742268041237112,
29
- "grad_norm": 3.03125,
30
  "learning_rate": 2e-05,
31
- "loss": 0.2731,
32
  "num_input_tokens_seen": 4305104,
33
  "step": 12
34
  },
35
  {
36
  "epoch": 0.032989690721649485,
37
- "grad_norm": 1.6484375,
38
  "learning_rate": 2.6666666666666667e-05,
39
- "loss": 0.2563,
40
  "num_input_tokens_seen": 5594128,
41
  "step": 16
42
  },
43
  {
44
  "epoch": 0.041237113402061855,
45
- "grad_norm": 1.46875,
46
  "learning_rate": 3.3333333333333335e-05,
47
- "loss": 0.2536,
48
  "num_input_tokens_seen": 6683376,
49
  "step": 20
50
  },
51
  {
52
  "epoch": 0.049484536082474224,
53
- "grad_norm": 1.2421875,
54
  "learning_rate": 4e-05,
55
- "loss": 0.229,
56
  "num_input_tokens_seen": 8030336,
57
  "step": 24
58
  },
59
  {
60
  "epoch": 0.0577319587628866,
61
- "grad_norm": 1.2578125,
62
  "learning_rate": 3.9992569962849926e-05,
63
- "loss": 0.2212,
64
  "num_input_tokens_seen": 9395728,
65
  "step": 28
66
  },
67
  {
68
  "epoch": 0.06597938144329897,
69
- "grad_norm": 1.3671875,
70
  "learning_rate": 3.99702853719449e-05,
71
- "loss": 0.2275,
72
  "num_input_tokens_seen": 10689344,
73
  "step": 32
74
  },
75
  {
76
  "epoch": 0.07422680412371134,
77
- "grad_norm": 1.328125,
78
  "learning_rate": 3.9933162784818745e-05,
79
- "loss": 0.2262,
80
  "num_input_tokens_seen": 11936704,
81
  "step": 36
82
  },
83
  {
84
  "epoch": 0.08247422680412371,
85
- "grad_norm": 1.484375,
86
  "learning_rate": 3.988122978369162e-05,
87
- "loss": 0.2254,
88
  "num_input_tokens_seen": 13217248,
89
  "step": 40
90
  },
91
  {
92
  "epoch": 0.09072164948453608,
93
- "grad_norm": 1.15625,
94
  "learning_rate": 3.981452495497628e-05,
95
- "loss": 0.2186,
96
  "num_input_tokens_seen": 14587328,
97
  "step": 44
98
  },
99
  {
100
  "epoch": 0.09896907216494845,
101
- "grad_norm": 1.109375,
102
  "learning_rate": 3.973309786060829e-05,
103
- "loss": 0.1971,
104
  "num_input_tokens_seen": 15976464,
105
  "step": 48
106
  },
107
  {
108
  "epoch": 0.10721649484536082,
109
- "grad_norm": 1.125,
110
  "learning_rate": 3.963700900122124e-05,
111
- "loss": 0.2231,
112
  "num_input_tokens_seen": 17262576,
113
  "step": 52
114
  },
115
  {
116
  "epoch": 0.1154639175257732,
117
- "grad_norm": 0.9765625,
118
  "learning_rate": 3.952632977119465e-05,
119
- "loss": 0.2029,
120
  "num_input_tokens_seen": 18801264,
121
  "step": 56
122
  },
123
  {
124
  "epoch": 0.12371134020618557,
125
- "grad_norm": 1.0625,
126
  "learning_rate": 3.9401142405607594e-05,
127
- "loss": 0.2033,
128
  "num_input_tokens_seen": 20158000,
129
  "step": 60
130
  },
131
  {
132
  "epoch": 0.13195876288659794,
133
- "grad_norm": 1.09375,
134
  "learning_rate": 3.9261539919137776e-05,
135
- "loss": 0.2278,
136
  "num_input_tokens_seen": 21322240,
137
  "step": 64
138
  },
139
  {
140
  "epoch": 0.1402061855670103,
141
- "grad_norm": 1.0,
142
  "learning_rate": 3.9107626036951266e-05,
143
- "loss": 0.1998,
144
  "num_input_tokens_seen": 22631360,
145
  "step": 68
146
  },
147
  {
148
  "epoch": 0.14845360824742268,
149
- "grad_norm": 1.109375,
150
  "learning_rate": 3.8939515117634326e-05,
151
- "loss": 0.2148,
152
  "num_input_tokens_seen": 23848496,
153
  "step": 72
154
  },
155
  {
156
  "epoch": 0.15670103092783505,
157
- "grad_norm": 1.1484375,
158
  "learning_rate": 3.875733206822452e-05,
159
- "loss": 0.2246,
160
  "num_input_tokens_seen": 25148336,
161
  "step": 76
162
  },
@@ -164,842 +164,842 @@
164
  "epoch": 0.16494845360824742,
165
  "grad_norm": 1.203125,
166
  "learning_rate": 3.8561212251404406e-05,
167
- "loss": 0.2056,
168
  "num_input_tokens_seen": 26427264,
169
  "step": 80
170
  },
171
  {
172
  "epoch": 0.1731958762886598,
173
- "grad_norm": 1.1328125,
174
  "learning_rate": 3.835130138492644e-05,
175
- "loss": 0.203,
176
  "num_input_tokens_seen": 27833024,
177
  "step": 84
178
  },
179
  {
180
  "epoch": 0.18144329896907216,
181
- "grad_norm": 1.1015625,
182
  "learning_rate": 3.812775543334425e-05,
183
- "loss": 0.1912,
184
  "num_input_tokens_seen": 29273008,
185
  "step": 88
186
  },
187
  {
188
  "epoch": 0.18969072164948453,
189
- "grad_norm": 1.2265625,
190
  "learning_rate": 3.789074049213033e-05,
191
- "loss": 0.2182,
192
- "num_input_tokens_seen": 30624112,
193
  "step": 92
194
  },
195
  {
196
  "epoch": 0.1979381443298969,
197
- "grad_norm": 1.1796875,
198
  "learning_rate": 3.7640432664266514e-05,
199
- "loss": 0.216,
200
- "num_input_tokens_seen": 31857552,
201
  "step": 96
202
  },
203
  {
204
  "epoch": 0.20618556701030927,
205
- "grad_norm": 1.125,
206
  "learning_rate": 3.737701792939881e-05,
207
- "loss": 0.2065,
208
- "num_input_tokens_seen": 33116768,
209
  "step": 100
210
  },
211
  {
212
  "epoch": 0.21443298969072164,
213
- "grad_norm": 1.0,
214
  "learning_rate": 3.7100692005653796e-05,
215
- "loss": 0.206,
216
- "num_input_tokens_seen": 34461024,
217
  "step": 104
218
  },
219
  {
220
  "epoch": 0.22268041237113403,
221
- "grad_norm": 1.0703125,
222
  "learning_rate": 3.681166020421938e-05,
223
- "loss": 0.1912,
224
- "num_input_tokens_seen": 35915264,
225
  "step": 108
226
  },
227
  {
228
  "epoch": 0.2309278350515464,
229
- "grad_norm": 1.0625,
230
  "learning_rate": 3.6510137276797786e-05,
231
- "loss": 0.1952,
232
- "num_input_tokens_seen": 37264080,
233
  "step": 112
234
  },
235
  {
236
  "epoch": 0.23917525773195877,
237
- "grad_norm": 1.109375,
238
  "learning_rate": 3.6196347256044236e-05,
239
- "loss": 0.2273,
240
- "num_input_tokens_seen": 38539072,
241
  "step": 116
242
  },
243
  {
244
  "epoch": 0.24742268041237114,
245
- "grad_norm": 1.109375,
246
  "learning_rate": 3.5870523289109886e-05,
247
- "loss": 0.2041,
248
- "num_input_tokens_seen": 39930480,
249
  "step": 120
250
  },
251
  {
252
  "epoch": 0.2556701030927835,
253
- "grad_norm": 1.15625,
254
  "learning_rate": 3.553290746441261e-05,
255
- "loss": 0.2065,
256
- "num_input_tokens_seen": 41066544,
257
  "step": 124
258
  },
259
  {
260
  "epoch": 0.2639175257731959,
261
  "grad_norm": 1.0390625,
262
  "learning_rate": 3.5183750631764406e-05,
263
- "loss": 0.1979,
264
- "num_input_tokens_seen": 42372160,
265
  "step": 128
266
  },
267
  {
268
  "epoch": 0.2721649484536082,
269
- "grad_norm": 1.0703125,
270
  "learning_rate": 3.4823312215989046e-05,
271
- "loss": 0.2079,
272
- "num_input_tokens_seen": 43644832,
273
  "step": 132
274
  },
275
  {
276
  "epoch": 0.2804123711340206,
277
- "grad_norm": 1.0078125,
278
  "learning_rate": 3.445186002416849e-05,
279
- "loss": 0.2058,
280
- "num_input_tokens_seen": 44948816,
281
  "step": 136
282
  },
283
  {
284
  "epoch": 0.28865979381443296,
285
- "grad_norm": 1.09375,
286
  "learning_rate": 3.4069670046661197e-05,
287
- "loss": 0.1857,
288
- "num_input_tokens_seen": 46404048,
289
  "step": 140
290
  },
291
  {
292
  "epoch": 0.29690721649484536,
293
  "grad_norm": 1.1015625,
294
  "learning_rate": 3.3677026252040306e-05,
295
- "loss": 0.212,
296
- "num_input_tokens_seen": 47646208,
297
  "step": 144
298
  },
299
  {
300
  "epoch": 0.30515463917525776,
301
- "grad_norm": 1.0234375,
302
  "learning_rate": 3.327422037610389e-05,
303
- "loss": 0.1983,
304
- "num_input_tokens_seen": 49010928,
305
  "step": 148
306
  },
307
  {
308
  "epoch": 0.3134020618556701,
309
- "grad_norm": 0.9375,
310
  "learning_rate": 3.286155170511419e-05,
311
- "loss": 0.197,
312
- "num_input_tokens_seen": 50440128,
313
  "step": 152
314
  },
315
  {
316
  "epoch": 0.3216494845360825,
317
- "grad_norm": 1.140625,
318
  "learning_rate": 3.2439326853426824e-05,
319
- "loss": 0.2028,
320
- "num_input_tokens_seen": 51797840,
321
  "step": 156
322
  },
323
  {
324
  "epoch": 0.32989690721649484,
325
- "grad_norm": 0.984375,
326
  "learning_rate": 3.200785953567517e-05,
327
- "loss": 0.196,
328
- "num_input_tokens_seen": 53109456,
329
  "step": 160
330
  },
331
  {
332
  "epoch": 0.33814432989690724,
333
- "grad_norm": 1.0390625,
334
  "learning_rate": 3.156747033367922e-05,
335
- "loss": 0.2016,
336
- "num_input_tokens_seen": 54440768,
337
  "step": 164
338
  },
339
  {
340
  "epoch": 0.3463917525773196,
341
- "grad_norm": 0.97265625,
342
  "learning_rate": 3.1118486458252094e-05,
343
- "loss": 0.1975,
344
- "num_input_tokens_seen": 55879424,
345
  "step": 168
346
  },
347
  {
348
  "epoch": 0.354639175257732,
349
  "grad_norm": 1.0234375,
350
  "learning_rate": 3.0661241506081236e-05,
351
- "loss": 0.1965,
352
- "num_input_tokens_seen": 57154384,
353
  "step": 172
354
  },
355
  {
356
  "epoch": 0.3628865979381443,
357
- "grad_norm": 0.95703125,
358
  "learning_rate": 3.019607521186475e-05,
359
- "loss": 0.2078,
360
- "num_input_tokens_seen": 58470672,
361
  "step": 176
362
  },
363
  {
364
  "epoch": 0.3711340206185567,
365
- "grad_norm": 1.0625,
366
  "learning_rate": 2.972333319588736e-05,
367
- "loss": 0.2092,
368
- "num_input_tokens_seen": 59684416,
369
  "step": 180
370
  },
371
  {
372
  "epoch": 0.37938144329896906,
373
- "grad_norm": 0.96484375,
374
  "learning_rate": 2.9243366707223165e-05,
375
- "loss": 0.2018,
376
- "num_input_tokens_seen": 61002832,
377
  "step": 184
378
  },
379
  {
380
  "epoch": 0.38762886597938145,
381
- "grad_norm": 1.0546875,
382
  "learning_rate": 2.875653236275632e-05,
383
- "loss": 0.2072,
384
- "num_input_tokens_seen": 62262064,
385
  "step": 188
386
  },
387
  {
388
  "epoch": 0.3958762886597938,
389
- "grad_norm": 0.92578125,
390
  "learning_rate": 2.8263191882213362e-05,
391
- "loss": 0.1936,
392
- "num_input_tokens_seen": 63678896,
393
  "step": 192
394
  },
395
  {
396
  "epoch": 0.4041237113402062,
397
- "grad_norm": 0.9609375,
398
  "learning_rate": 2.7763711819404098e-05,
399
- "loss": 0.2069,
400
- "num_input_tokens_seen": 64844672,
401
  "step": 196
402
  },
403
  {
404
  "epoch": 0.41237113402061853,
405
- "grad_norm": 1.0546875,
406
  "learning_rate": 2.7258463289870764e-05,
407
- "loss": 0.1924,
408
- "num_input_tokens_seen": 66274544,
409
  "step": 200
410
  },
411
  {
412
  "epoch": 0.42061855670103093,
413
- "grad_norm": 0.88671875,
414
  "learning_rate": 2.6747821695147806e-05,
415
- "loss": 0.1949,
416
- "num_input_tokens_seen": 67683072,
417
  "step": 204
418
  },
419
  {
420
  "epoch": 0.4288659793814433,
421
- "grad_norm": 1.125,
422
  "learning_rate": 2.623216644383715e-05,
423
- "loss": 0.2092,
424
- "num_input_tokens_seen": 68860288,
425
  "step": 208
426
  },
427
  {
428
  "epoch": 0.43711340206185567,
429
- "grad_norm": 1.078125,
430
  "learning_rate": 2.5711880669706172e-05,
431
- "loss": 0.1959,
432
- "num_input_tokens_seen": 70182736,
433
  "step": 212
434
  },
435
  {
436
  "epoch": 0.44536082474226807,
437
- "grad_norm": 0.9296875,
438
  "learning_rate": 2.5187350947017918e-05,
439
- "loss": 0.2101,
440
- "num_input_tokens_seen": 71494624,
441
  "step": 216
442
  },
443
  {
444
  "epoch": 0.4536082474226804,
445
- "grad_norm": 0.86328125,
446
  "learning_rate": 2.4658967003304986e-05,
447
- "loss": 0.1925,
448
- "num_input_tokens_seen": 72877248,
449
  "step": 220
450
  },
451
  {
452
  "epoch": 0.4618556701030928,
453
- "grad_norm": 1.0078125,
454
  "learning_rate": 2.4127121429800498e-05,
455
- "loss": 0.1841,
456
- "num_input_tokens_seen": 74118560,
457
  "step": 224
458
  },
459
  {
460
  "epoch": 0.47010309278350515,
461
- "grad_norm": 0.88671875,
462
  "learning_rate": 2.3592209389741372e-05,
463
- "loss": 0.174,
464
- "num_input_tokens_seen": 75598912,
465
  "step": 228
466
  },
467
  {
468
  "epoch": 0.47835051546391755,
469
- "grad_norm": 1.0234375,
470
  "learning_rate": 2.30546283247606e-05,
471
  "loss": 0.207,
472
- "num_input_tokens_seen": 76742752,
473
  "step": 232
474
  },
475
  {
476
  "epoch": 0.4865979381443299,
477
- "grad_norm": 1.015625,
478
  "learning_rate": 2.251477765958655e-05,
479
- "loss": 0.1932,
480
- "num_input_tokens_seen": 78206256,
481
  "step": 236
482
  },
483
  {
484
  "epoch": 0.4948453608247423,
485
- "grad_norm": 1.0546875,
486
  "learning_rate": 2.1973058505269007e-05,
487
- "loss": 0.1946,
488
- "num_input_tokens_seen": 79491408,
489
  "step": 240
490
  },
491
  {
492
  "epoch": 0.5030927835051546,
493
  "grad_norm": 1.0546875,
494
  "learning_rate": 2.1429873361152124e-05,
495
- "loss": 0.1975,
496
- "num_input_tokens_seen": 80718320,
497
  "step": 244
498
  },
499
  {
500
  "epoch": 0.511340206185567,
501
- "grad_norm": 0.91015625,
502
  "learning_rate": 2.088562581581592e-05,
503
- "loss": 0.1964,
504
- "num_input_tokens_seen": 81915456,
505
  "step": 248
506
  },
507
  {
508
  "epoch": 0.5195876288659794,
509
- "grad_norm": 1.1015625,
510
  "learning_rate": 2.0340720247208447e-05,
511
- "loss": 0.191,
512
- "num_input_tokens_seen": 83180624,
513
  "step": 252
514
  },
515
  {
516
  "epoch": 0.5278350515463918,
517
- "grad_norm": 0.90234375,
518
  "learning_rate": 1.9795561522191523e-05,
519
- "loss": 0.1832,
520
- "num_input_tokens_seen": 84571536,
521
  "step": 256
522
  },
523
  {
524
  "epoch": 0.5360824742268041,
525
- "grad_norm": 1.046875,
526
  "learning_rate": 1.9250554695723107e-05,
527
- "loss": 0.1964,
528
- "num_input_tokens_seen": 85841328,
529
  "step": 260
530
  },
531
  {
532
  "epoch": 0.5443298969072164,
533
- "grad_norm": 0.953125,
534
  "learning_rate": 1.8706104709899964e-05,
535
- "loss": 0.1875,
536
- "num_input_tokens_seen": 87241616,
537
  "step": 264
538
  },
539
  {
540
  "epoch": 0.5525773195876289,
541
- "grad_norm": 0.92578125,
542
  "learning_rate": 1.816261609308419e-05,
543
- "loss": 0.1809,
544
- "num_input_tokens_seen": 88600352,
545
  "step": 268
546
  },
547
  {
548
  "epoch": 0.5608247422680412,
549
- "grad_norm": 0.87109375,
550
  "learning_rate": 1.7620492659337155e-05,
551
- "loss": 0.1793,
552
- "num_input_tokens_seen": 90051376,
553
  "step": 272
554
  },
555
  {
556
  "epoch": 0.5690721649484536,
557
- "grad_norm": 1.0390625,
558
  "learning_rate": 1.7080137208384122e-05,
559
- "loss": 0.1865,
560
- "num_input_tokens_seen": 91429472,
561
  "step": 276
562
  },
563
  {
564
  "epoch": 0.5773195876288659,
565
- "grad_norm": 0.9140625,
566
  "learning_rate": 1.6541951226332565e-05,
567
- "loss": 0.1745,
568
- "num_input_tokens_seen": 92791856,
569
  "step": 280
570
  },
571
  {
572
  "epoch": 0.5855670103092784,
573
- "grad_norm": 0.875,
574
  "learning_rate": 1.600633458736653e-05,
575
- "loss": 0.1925,
576
- "num_input_tokens_seen": 94068304,
577
  "step": 284
578
  },
579
  {
580
  "epoch": 0.5938144329896907,
581
- "grad_norm": 0.98828125,
582
  "learning_rate": 1.5473685256638572e-05,
583
- "loss": 0.1903,
584
- "num_input_tokens_seen": 95338656,
585
  "step": 288
586
  },
587
  {
588
  "epoch": 0.6020618556701031,
589
- "grad_norm": 0.90625,
590
  "learning_rate": 1.4944398994580232e-05,
591
- "loss": 0.1834,
592
- "num_input_tokens_seen": 96565872,
593
  "step": 292
594
  },
595
  {
596
  "epoch": 0.6103092783505155,
597
- "grad_norm": 1.0,
598
  "learning_rate": 1.4418869062850514e-05,
599
- "loss": 0.211,
600
- "num_input_tokens_seen": 97845776,
601
  "step": 296
602
  },
603
  {
604
  "epoch": 0.6185567010309279,
605
- "grad_norm": 0.92578125,
606
  "learning_rate": 1.3897485932141042e-05,
607
- "loss": 0.1872,
608
- "num_input_tokens_seen": 99080048,
609
  "step": 300
610
  },
611
  {
612
  "epoch": 0.6268041237113402,
613
- "grad_norm": 0.88671875,
614
  "learning_rate": 1.3380636992054878e-05,
615
- "loss": 0.17,
616
- "num_input_tokens_seen": 100563184,
617
  "step": 304
618
  },
619
  {
620
  "epoch": 0.6350515463917525,
621
- "grad_norm": 0.9375,
622
  "learning_rate": 1.2868706263274602e-05,
623
- "loss": 0.1935,
624
- "num_input_tokens_seen": 101820432,
625
  "step": 308
626
  },
627
  {
628
  "epoch": 0.643298969072165,
629
- "grad_norm": 0.88671875,
630
  "learning_rate": 1.236207411223353e-05,
631
- "loss": 0.1833,
632
- "num_input_tokens_seen": 103280736,
633
  "step": 312
634
  },
635
  {
636
  "epoch": 0.6515463917525773,
637
- "grad_norm": 0.88671875,
638
  "learning_rate": 1.1861116968502015e-05,
639
- "loss": 0.1815,
640
- "num_input_tokens_seen": 104563920,
641
  "step": 316
642
  },
643
  {
644
  "epoch": 0.6597938144329897,
645
- "grad_norm": 0.875,
646
  "learning_rate": 1.136620704509892e-05,
647
- "loss": 0.1816,
648
- "num_input_tokens_seen": 105869408,
649
  "step": 320
650
  },
651
  {
652
  "epoch": 0.668041237113402,
653
- "grad_norm": 0.9296875,
654
  "learning_rate": 1.087771206193593e-05,
655
- "loss": 0.1837,
656
- "num_input_tokens_seen": 107213792,
657
  "step": 324
658
  },
659
  {
660
  "epoch": 0.6762886597938145,
661
- "grad_norm": 0.96484375,
662
  "learning_rate": 1.0395994972600285e-05,
663
- "loss": 0.1775,
664
- "num_input_tokens_seen": 108623536,
665
  "step": 328
666
  },
667
  {
668
  "epoch": 0.6845360824742268,
669
- "grad_norm": 0.97265625,
670
  "learning_rate": 9.921413694678959e-06,
671
- "loss": 0.2035,
672
- "num_input_tokens_seen": 109750560,
673
  "step": 332
674
  },
675
  {
676
  "epoch": 0.6927835051546392,
677
- "grad_norm": 0.921875,
678
  "learning_rate": 9.454320843824512e-06,
679
- "loss": 0.1862,
680
- "num_input_tokens_seen": 111023152,
681
  "step": 336
682
  },
683
  {
684
  "epoch": 0.7010309278350515,
685
- "grad_norm": 0.95703125,
686
  "learning_rate": 8.995063471760377e-06,
687
- "loss": 0.1927,
688
- "num_input_tokens_seen": 112284320,
689
  "step": 340
690
  },
691
  {
692
  "epoch": 0.709278350515464,
693
- "grad_norm": 0.96484375,
694
  "learning_rate": 8.543982808420156e-06,
695
- "loss": 0.1856,
696
- "num_input_tokens_seen": 113630688,
697
  "step": 344
698
  },
699
  {
700
  "epoch": 0.7175257731958763,
701
- "grad_norm": 0.953125,
702
  "learning_rate": 8.101414008412469e-06,
703
- "loss": 0.1792,
704
- "num_input_tokens_seen": 114946320,
705
  "step": 348
706
  },
707
  {
708
  "epoch": 0.7257731958762886,
709
- "grad_norm": 0.94921875,
710
  "learning_rate": 7.667685901999875e-06,
711
- "loss": 0.1891,
712
- "num_input_tokens_seen": 116220208,
713
  "step": 352
714
  },
715
  {
716
  "epoch": 0.734020618556701,
717
- "grad_norm": 0.8671875,
718
  "learning_rate": 7.24312075077674e-06,
719
- "loss": 0.1891,
720
- "num_input_tokens_seen": 117614672,
721
  "step": 356
722
  },
723
  {
724
  "epoch": 0.7422680412371134,
725
- "grad_norm": 1.0859375,
726
  "learning_rate": 6.828034008227678e-06,
727
- "loss": 0.1714,
728
- "num_input_tokens_seen": 118996816,
729
  "step": 360
730
  },
731
  {
732
  "epoch": 0.7505154639175258,
733
- "grad_norm": 0.90234375,
734
  "learning_rate": 6.422734085344464e-06,
735
- "loss": 0.1871,
736
- "num_input_tokens_seen": 120229232,
737
  "step": 364
738
  },
739
  {
740
  "epoch": 0.7587628865979381,
741
- "grad_norm": 0.8203125,
742
  "learning_rate": 6.027522121475482e-06,
743
- "loss": 0.1795,
744
- "num_input_tokens_seen": 121495936,
745
  "step": 368
746
  },
747
  {
748
  "epoch": 0.7670103092783506,
749
  "grad_norm": 0.95703125,
750
  "learning_rate": 5.642691760578116e-06,
751
- "loss": 0.1833,
752
- "num_input_tokens_seen": 122787872,
753
  "step": 372
754
  },
755
  {
756
  "epoch": 0.7752577319587629,
757
- "grad_norm": 0.83984375,
758
  "learning_rate": 5.268528933040147e-06,
759
- "loss": 0.1673,
760
- "num_input_tokens_seen": 124257600,
761
  "step": 376
762
  },
763
  {
764
  "epoch": 0.7835051546391752,
765
- "grad_norm": 0.875,
766
  "learning_rate": 4.905311643232464e-06,
767
- "loss": 0.1763,
768
- "num_input_tokens_seen": 125705408,
769
  "step": 380
770
  },
771
  {
772
  "epoch": 0.7917525773195876,
773
- "grad_norm": 0.9765625,
774
  "learning_rate": 4.553309762950739e-06,
775
- "loss": 0.1877,
776
- "num_input_tokens_seen": 126862272,
777
  "step": 384
778
  },
779
  {
780
  "epoch": 0.8,
781
- "grad_norm": 0.90625,
782
  "learning_rate": 4.212784830899725e-06,
783
- "loss": 0.1795,
784
- "num_input_tokens_seen": 128153600,
785
  "step": 388
786
  },
787
  {
788
  "epoch": 0.8082474226804124,
789
- "grad_norm": 0.828125,
790
  "learning_rate": 3.8839898583689725e-06,
791
- "loss": 0.1803,
792
- "num_input_tokens_seen": 129461872,
793
  "step": 392
794
  },
795
  {
796
  "epoch": 0.8164948453608247,
797
- "grad_norm": 0.88671875,
798
  "learning_rate": 3.567169141244562e-06,
799
- "loss": 0.179,
800
- "num_input_tokens_seen": 130662064,
801
  "step": 396
802
  },
803
  {
804
  "epoch": 0.8247422680412371,
805
- "grad_norm": 0.8359375,
806
  "learning_rate": 3.262558078496301e-06,
807
- "loss": 0.1679,
808
- "num_input_tokens_seen": 131997568,
809
  "step": 400
810
  },
811
  {
812
  "epoch": 0.8329896907216495,
813
- "grad_norm": 0.94921875,
814
  "learning_rate": 2.9703829972754407e-06,
815
- "loss": 0.1974,
816
- "num_input_tokens_seen": 133415744,
817
  "step": 404
818
  },
819
  {
820
  "epoch": 0.8412371134020619,
821
- "grad_norm": 0.828125,
822
  "learning_rate": 2.69086098475277e-06,
823
- "loss": 0.1699,
824
- "num_input_tokens_seen": 134815840,
825
  "step": 408
826
  },
827
  {
828
  "epoch": 0.8494845360824742,
829
- "grad_norm": 0.9921875,
830
  "learning_rate": 2.4241997268220096e-06,
831
- "loss": 0.1815,
832
- "num_input_tokens_seen": 136262128,
833
  "step": 412
834
  },
835
  {
836
  "epoch": 0.8577319587628865,
837
- "grad_norm": 0.96484375,
838
  "learning_rate": 2.1705973537884615e-06,
839
- "loss": 0.1781,
840
- "num_input_tokens_seen": 137430160,
841
  "step": 416
842
  },
843
  {
844
  "epoch": 0.865979381443299,
845
- "grad_norm": 0.95703125,
846
  "learning_rate": 1.9302422931574183e-06,
847
- "loss": 0.1899,
848
- "num_input_tokens_seen": 138709200,
849
  "step": 420
850
  },
851
  {
852
  "epoch": 0.8742268041237113,
853
  "grad_norm": 0.83984375,
854
  "learning_rate": 1.7033131296318473e-06,
855
- "loss": 0.1795,
856
- "num_input_tokens_seen": 140033680,
857
  "step": 424
858
  },
859
  {
860
  "epoch": 0.8824742268041237,
861
- "grad_norm": 0.8203125,
862
  "learning_rate": 1.4899784724232968e-06,
863
- "loss": 0.1749,
864
- "num_input_tokens_seen": 141348848,
865
  "step": 428
866
  },
867
  {
868
  "epoch": 0.8907216494845361,
869
- "grad_norm": 0.83984375,
870
  "learning_rate": 1.2903968299746094e-06,
871
- "loss": 0.171,
872
- "num_input_tokens_seen": 142797664,
873
  "step": 432
874
  },
875
  {
876
  "epoch": 0.8989690721649485,
877
- "grad_norm": 0.89453125,
878
  "learning_rate": 1.104716492187574e-06,
879
- "loss": 0.1812,
880
- "num_input_tokens_seen": 144154208,
881
  "step": 436
882
  },
883
  {
884
  "epoch": 0.9072164948453608,
885
- "grad_norm": 1.03125,
886
  "learning_rate": 9.330754202429726e-07,
887
- "loss": 0.1891,
888
- "num_input_tokens_seen": 145332560,
889
  "step": 440
890
  },
891
  {
892
  "epoch": 0.9154639175257732,
893
- "grad_norm": 0.98046875,
894
  "learning_rate": 7.756011440948996e-07,
895
- "loss": 0.1902,
896
- "num_input_tokens_seen": 146527344,
897
  "step": 444
898
  },
899
  {
900
  "epoch": 0.9237113402061856,
901
- "grad_norm": 1.0,
902
  "learning_rate": 6.324106677155573e-07,
903
- "loss": 0.1793,
904
- "num_input_tokens_seen": 147821568,
905
  "step": 448
906
  },
907
  {
908
  "epoch": 0.931958762886598,
909
  "grad_norm": 0.88671875,
910
  "learning_rate": 5.036103821608485e-07,
911
- "loss": 0.1844,
912
- "num_input_tokens_seen": 149191664,
913
  "step": 452
914
  },
915
  {
916
  "epoch": 0.9402061855670103,
917
- "grad_norm": 0.8515625,
918
  "learning_rate": 3.892959865214363e-07,
919
- "loss": 0.1795,
920
- "num_input_tokens_seen": 150526864,
921
  "step": 456
922
  },
923
  {
924
  "epoch": 0.9484536082474226,
925
- "grad_norm": 0.9453125,
926
  "learning_rate": 2.8955241681795534e-07,
927
- "loss": 0.1859,
928
- "num_input_tokens_seen": 151861952,
929
  "step": 460
930
  },
931
  {
932
  "epoch": 0.9567010309278351,
933
- "grad_norm": 0.71875,
934
  "learning_rate": 2.044537828932458e-07,
935
- "loss": 0.1787,
936
- "num_input_tokens_seen": 153201488,
937
  "step": 464
938
  },
939
  {
940
  "epoch": 0.9649484536082474,
941
- "grad_norm": 0.87109375,
942
  "learning_rate": 1.3406331334845813e-07,
943
- "loss": 0.1884,
944
- "num_input_tokens_seen": 154511184,
945
  "step": 468
946
  },
947
  {
948
  "epoch": 0.9731958762886598,
949
- "grad_norm": 0.796875,
950
  "learning_rate": 7.843330856396103e-08,
951
- "loss": 0.1858,
952
- "num_input_tokens_seen": 155737200,
953
  "step": 472
954
  },
955
  {
956
  "epoch": 0.9814432989690721,
957
- "grad_norm": 0.8671875,
958
  "learning_rate": 3.760510183997701e-08,
959
- "loss": 0.183,
960
- "num_input_tokens_seen": 157084960,
961
  "step": 476
962
  },
963
  {
964
  "epoch": 0.9896907216494846,
965
- "grad_norm": 0.93359375,
966
  "learning_rate": 1.160902868577951e-08,
967
- "loss": 0.1908,
968
- "num_input_tokens_seen": 158349904,
969
  "step": 480
970
  },
971
  {
972
  "epoch": 0.9979381443298969,
973
- "grad_norm": 0.8984375,
974
  "learning_rate": 4.64404280295927e-10,
975
- "loss": 0.1764,
976
- "num_input_tokens_seen": 159669152,
977
  "step": 484
978
  },
979
  {
980
  "epoch": 1.0,
981
- "eval_loss": 0.0952233299612999,
982
- "eval_runtime": 83.4314,
983
- "eval_samples_per_second": 12.453,
984
- "eval_steps_per_second": 0.396,
985
- "num_input_tokens_seen": 160041824,
986
  "step": 485
987
  },
988
  {
989
  "epoch": 1.0,
990
- "num_input_tokens_seen": 160041824,
991
  "step": 485,
992
- "total_flos": 9.01334912177537e+17,
993
- "train_loss": 0.20323730206366666,
994
- "train_runtime": 14397.8817,
995
- "train_samples_per_second": 4.306,
996
  "train_steps_per_second": 0.034,
997
- "train_tokens_per_second": 1386.226
998
  }
999
  ],
1000
  "logging_steps": 4,
1001
  "max_steps": 485,
1002
- "num_input_tokens_seen": 160041824,
1003
  "num_train_epochs": 1,
1004
  "save_steps": 0,
1005
  "stateful_callbacks": {
@@ -1014,7 +1014,7 @@
1014
  "attributes": {}
1015
  }
1016
  },
1017
- "total_flos": 9.01334912177537e+17,
1018
  "train_batch_size": 16,
1019
  "trial_name": null,
1020
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.008247422680412371,
13
+ "grad_norm": 16.75,
14
  "learning_rate": 6.666666666666667e-06,
15
+ "loss": 0.8829,
16
  "num_input_tokens_seen": 1413808,
17
  "step": 4
18
  },
19
  {
20
  "epoch": 0.016494845360824743,
21
+ "grad_norm": 3.90625,
22
  "learning_rate": 1.3333333333333333e-05,
23
+ "loss": 0.4033,
24
  "num_input_tokens_seen": 2866496,
25
  "step": 8
26
  },
27
  {
28
  "epoch": 0.024742268041237112,
29
+ "grad_norm": 5.0,
30
  "learning_rate": 2e-05,
31
+ "loss": 0.2755,
32
  "num_input_tokens_seen": 4305104,
33
  "step": 12
34
  },
35
  {
36
  "epoch": 0.032989690721649485,
37
+ "grad_norm": 2.453125,
38
  "learning_rate": 2.6666666666666667e-05,
39
+ "loss": 0.2531,
40
  "num_input_tokens_seen": 5594128,
41
  "step": 16
42
  },
43
  {
44
  "epoch": 0.041237113402061855,
45
+ "grad_norm": 1.484375,
46
  "learning_rate": 3.3333333333333335e-05,
47
+ "loss": 0.2585,
48
  "num_input_tokens_seen": 6683376,
49
  "step": 20
50
  },
51
  {
52
  "epoch": 0.049484536082474224,
53
+ "grad_norm": 1.328125,
54
  "learning_rate": 4e-05,
55
+ "loss": 0.2308,
56
  "num_input_tokens_seen": 8030336,
57
  "step": 24
58
  },
59
  {
60
  "epoch": 0.0577319587628866,
61
+ "grad_norm": 1.3671875,
62
  "learning_rate": 3.9992569962849926e-05,
63
+ "loss": 0.2221,
64
  "num_input_tokens_seen": 9395728,
65
  "step": 28
66
  },
67
  {
68
  "epoch": 0.06597938144329897,
69
+ "grad_norm": 1.3125,
70
  "learning_rate": 3.99702853719449e-05,
71
+ "loss": 0.2259,
72
  "num_input_tokens_seen": 10689344,
73
  "step": 32
74
  },
75
  {
76
  "epoch": 0.07422680412371134,
77
+ "grad_norm": 1.234375,
78
  "learning_rate": 3.9933162784818745e-05,
79
+ "loss": 0.2201,
80
  "num_input_tokens_seen": 11936704,
81
  "step": 36
82
  },
83
  {
84
  "epoch": 0.08247422680412371,
85
+ "grad_norm": 1.359375,
86
  "learning_rate": 3.988122978369162e-05,
87
+ "loss": 0.2242,
88
  "num_input_tokens_seen": 13217248,
89
  "step": 40
90
  },
91
  {
92
  "epoch": 0.09072164948453608,
93
+ "grad_norm": 1.203125,
94
  "learning_rate": 3.981452495497628e-05,
95
+ "loss": 0.2213,
96
  "num_input_tokens_seen": 14587328,
97
  "step": 44
98
  },
99
  {
100
  "epoch": 0.09896907216494845,
101
+ "grad_norm": 1.0859375,
102
  "learning_rate": 3.973309786060829e-05,
103
+ "loss": 0.1958,
104
  "num_input_tokens_seen": 15976464,
105
  "step": 48
106
  },
107
  {
108
  "epoch": 0.10721649484536082,
109
+ "grad_norm": 1.09375,
110
  "learning_rate": 3.963700900122124e-05,
111
+ "loss": 0.2136,
112
  "num_input_tokens_seen": 17262576,
113
  "step": 52
114
  },
115
  {
116
  "epoch": 0.1154639175257732,
117
+ "grad_norm": 1.03125,
118
  "learning_rate": 3.952632977119465e-05,
119
+ "loss": 0.2059,
120
  "num_input_tokens_seen": 18801264,
121
  "step": 56
122
  },
123
  {
124
  "epoch": 0.12371134020618557,
125
+ "grad_norm": 1.0859375,
126
  "learning_rate": 3.9401142405607594e-05,
127
+ "loss": 0.197,
128
  "num_input_tokens_seen": 20158000,
129
  "step": 60
130
  },
131
  {
132
  "epoch": 0.13195876288659794,
133
+ "grad_norm": 1.2109375,
134
  "learning_rate": 3.9261539919137776e-05,
135
+ "loss": 0.2273,
136
  "num_input_tokens_seen": 21322240,
137
  "step": 64
138
  },
139
  {
140
  "epoch": 0.1402061855670103,
141
+ "grad_norm": 0.98828125,
142
  "learning_rate": 3.9107626036951266e-05,
143
+ "loss": 0.1971,
144
  "num_input_tokens_seen": 22631360,
145
  "step": 68
146
  },
147
  {
148
  "epoch": 0.14845360824742268,
149
+ "grad_norm": 1.125,
150
  "learning_rate": 3.8939515117634326e-05,
151
+ "loss": 0.2194,
152
  "num_input_tokens_seen": 23848496,
153
  "step": 72
154
  },
155
  {
156
  "epoch": 0.15670103092783505,
157
+ "grad_norm": 1.1953125,
158
  "learning_rate": 3.875733206822452e-05,
159
+ "loss": 0.2215,
160
  "num_input_tokens_seen": 25148336,
161
  "step": 76
162
  },
 
164
  "epoch": 0.16494845360824742,
165
  "grad_norm": 1.203125,
166
  "learning_rate": 3.8561212251404406e-05,
167
+ "loss": 0.1989,
168
  "num_input_tokens_seen": 26427264,
169
  "step": 80
170
  },
171
  {
172
  "epoch": 0.1731958762886598,
173
+ "grad_norm": 1.203125,
174
  "learning_rate": 3.835130138492644e-05,
175
+ "loss": 0.2072,
176
  "num_input_tokens_seen": 27833024,
177
  "step": 84
178
  },
179
  {
180
  "epoch": 0.18144329896907216,
181
+ "grad_norm": 1.1171875,
182
  "learning_rate": 3.812775543334425e-05,
183
+ "loss": 0.2013,
184
  "num_input_tokens_seen": 29273008,
185
  "step": 88
186
  },
187
  {
188
  "epoch": 0.18969072164948453,
189
+ "grad_norm": 1.1484375,
190
  "learning_rate": 3.789074049213033e-05,
191
+ "loss": 0.2119,
192
+ "num_input_tokens_seen": 30628416,
193
  "step": 92
194
  },
195
  {
196
  "epoch": 0.1979381443298969,
197
+ "grad_norm": 1.1328125,
198
  "learning_rate": 3.7640432664266514e-05,
199
+ "loss": 0.2213,
200
+ "num_input_tokens_seen": 31861856,
201
  "step": 96
202
  },
203
  {
204
  "epoch": 0.20618556701030927,
205
+ "grad_norm": 1.0859375,
206
  "learning_rate": 3.737701792939881e-05,
207
+ "loss": 0.2102,
208
+ "num_input_tokens_seen": 33121072,
209
  "step": 100
210
  },
211
  {
212
  "epoch": 0.21443298969072164,
213
+ "grad_norm": 1.0859375,
214
  "learning_rate": 3.7100692005653796e-05,
215
+ "loss": 0.2052,
216
+ "num_input_tokens_seen": 34464560,
217
  "step": 104
218
  },
219
  {
220
  "epoch": 0.22268041237113403,
221
+ "grad_norm": 1.09375,
222
  "learning_rate": 3.681166020421938e-05,
223
+ "loss": 0.1942,
224
+ "num_input_tokens_seen": 35918800,
225
  "step": 108
226
  },
227
  {
228
  "epoch": 0.2309278350515464,
229
+ "grad_norm": 1.0546875,
230
  "learning_rate": 3.6510137276797786e-05,
231
+ "loss": 0.1946,
232
+ "num_input_tokens_seen": 37267616,
233
  "step": 112
234
  },
235
  {
236
  "epoch": 0.23917525773195877,
237
+ "grad_norm": 1.078125,
238
  "learning_rate": 3.6196347256044236e-05,
239
+ "loss": 0.2263,
240
+ "num_input_tokens_seen": 38542608,
241
  "step": 116
242
  },
243
  {
244
  "epoch": 0.24742268041237114,
245
+ "grad_norm": 1.015625,
246
  "learning_rate": 3.5870523289109886e-05,
247
+ "loss": 0.2102,
248
+ "num_input_tokens_seen": 39934016,
249
  "step": 120
250
  },
251
  {
252
  "epoch": 0.2556701030927835,
253
+ "grad_norm": 1.0390625,
254
  "learning_rate": 3.553290746441261e-05,
255
+ "loss": 0.2084,
256
+ "num_input_tokens_seen": 41070080,
257
  "step": 124
258
  },
259
  {
260
  "epoch": 0.2639175257731959,
261
  "grad_norm": 1.0390625,
262
  "learning_rate": 3.5183750631764406e-05,
263
+ "loss": 0.1939,
264
+ "num_input_tokens_seen": 42375696,
265
  "step": 128
266
  },
267
  {
268
  "epoch": 0.2721649484536082,
269
+ "grad_norm": 1.0390625,
270
  "learning_rate": 3.4823312215989046e-05,
271
+ "loss": 0.2027,
272
+ "num_input_tokens_seen": 43648368,
273
  "step": 132
274
  },
275
  {
276
  "epoch": 0.2804123711340206,
277
+ "grad_norm": 1.03125,
278
  "learning_rate": 3.445186002416849e-05,
279
+ "loss": 0.2093,
280
+ "num_input_tokens_seen": 44952352,
281
  "step": 136
282
  },
283
  {
284
  "epoch": 0.28865979381443296,
285
+ "grad_norm": 1.1484375,
286
  "learning_rate": 3.4069670046661197e-05,
287
+ "loss": 0.1887,
288
+ "num_input_tokens_seen": 46407584,
289
  "step": 140
290
  },
291
  {
292
  "epoch": 0.29690721649484536,
293
  "grad_norm": 1.1015625,
294
  "learning_rate": 3.3677026252040306e-05,
295
+ "loss": 0.2109,
296
+ "num_input_tokens_seen": 47649744,
297
  "step": 144
298
  },
299
  {
300
  "epoch": 0.30515463917525776,
301
+ "grad_norm": 0.98046875,
302
  "learning_rate": 3.327422037610389e-05,
303
+ "loss": 0.2072,
304
+ "num_input_tokens_seen": 49014464,
305
  "step": 148
306
  },
307
  {
308
  "epoch": 0.3134020618556701,
309
+ "grad_norm": 0.97265625,
310
  "learning_rate": 3.286155170511419e-05,
311
+ "loss": 0.2046,
312
+ "num_input_tokens_seen": 50443616,
313
  "step": 152
314
  },
315
  {
316
  "epoch": 0.3216494845360825,
317
+ "grad_norm": 1.1328125,
318
  "learning_rate": 3.2439326853426824e-05,
319
+ "loss": 0.211,
320
+ "num_input_tokens_seen": 51801328,
321
  "step": 156
322
  },
323
  {
324
  "epoch": 0.32989690721649484,
325
+ "grad_norm": 0.9765625,
326
  "learning_rate": 3.200785953567517e-05,
327
+ "loss": 0.1977,
328
+ "num_input_tokens_seen": 53112944,
329
  "step": 160
330
  },
331
  {
332
  "epoch": 0.33814432989690724,
333
+ "grad_norm": 1.03125,
334
  "learning_rate": 3.156747033367922e-05,
335
+ "loss": 0.2001,
336
+ "num_input_tokens_seen": 54444256,
337
  "step": 164
338
  },
339
  {
340
  "epoch": 0.3463917525773196,
341
+ "grad_norm": 1.046875,
342
  "learning_rate": 3.1118486458252094e-05,
343
+ "loss": 0.2,
344
+ "num_input_tokens_seen": 55882912,
345
  "step": 168
346
  },
347
  {
348
  "epoch": 0.354639175257732,
349
  "grad_norm": 1.0234375,
350
  "learning_rate": 3.0661241506081236e-05,
351
+ "loss": 0.1997,
352
+ "num_input_tokens_seen": 57157872,
353
  "step": 172
354
  },
355
  {
356
  "epoch": 0.3628865979381443,
357
+ "grad_norm": 1.0078125,
358
  "learning_rate": 3.019607521186475e-05,
359
+ "loss": 0.2085,
360
+ "num_input_tokens_seen": 58474160,
361
  "step": 176
362
  },
363
  {
364
  "epoch": 0.3711340206185567,
365
+ "grad_norm": 1.0859375,
366
  "learning_rate": 2.972333319588736e-05,
367
+ "loss": 0.2093,
368
+ "num_input_tokens_seen": 59687904,
369
  "step": 180
370
  },
371
  {
372
  "epoch": 0.37938144329896906,
373
+ "grad_norm": 0.99609375,
374
  "learning_rate": 2.9243366707223165e-05,
375
+ "loss": 0.1963,
376
+ "num_input_tokens_seen": 61006320,
377
  "step": 184
378
  },
379
  {
380
  "epoch": 0.38762886597938145,
381
+ "grad_norm": 0.98828125,
382
  "learning_rate": 2.875653236275632e-05,
383
+ "loss": 0.2001,
384
+ "num_input_tokens_seen": 62265552,
385
  "step": 188
386
  },
387
  {
388
  "epoch": 0.3958762886597938,
389
+ "grad_norm": 0.9609375,
390
  "learning_rate": 2.8263191882213362e-05,
391
+ "loss": 0.1948,
392
+ "num_input_tokens_seen": 63682384,
393
  "step": 192
394
  },
395
  {
396
  "epoch": 0.4041237113402062,
397
+ "grad_norm": 0.953125,
398
  "learning_rate": 2.7763711819404098e-05,
399
+ "loss": 0.2048,
400
+ "num_input_tokens_seen": 64848160,
401
  "step": 196
402
  },
403
  {
404
  "epoch": 0.41237113402061853,
405
+ "grad_norm": 0.98046875,
406
  "learning_rate": 2.7258463289870764e-05,
407
+ "loss": 0.192,
408
+ "num_input_tokens_seen": 66278032,
409
  "step": 200
410
  },
411
  {
412
  "epoch": 0.42061855670103093,
413
+ "grad_norm": 0.80859375,
414
  "learning_rate": 2.6747821695147806e-05,
415
+ "loss": 0.1933,
416
+ "num_input_tokens_seen": 67686560,
417
  "step": 204
418
  },
419
  {
420
  "epoch": 0.4288659793814433,
421
+ "grad_norm": 1.0703125,
422
  "learning_rate": 2.623216644383715e-05,
423
+ "loss": 0.2094,
424
+ "num_input_tokens_seen": 68863776,
425
  "step": 208
426
  },
427
  {
428
  "epoch": 0.43711340206185567,
429
+ "grad_norm": 1.0859375,
430
  "learning_rate": 2.5711880669706172e-05,
431
+ "loss": 0.1964,
432
+ "num_input_tokens_seen": 70186224,
433
  "step": 212
434
  },
435
  {
436
  "epoch": 0.44536082474226807,
437
+ "grad_norm": 0.9765625,
438
  "learning_rate": 2.5187350947017918e-05,
439
+ "loss": 0.2042,
440
+ "num_input_tokens_seen": 71498112,
441
  "step": 216
442
  },
443
  {
444
  "epoch": 0.4536082474226804,
445
+ "grad_norm": 0.89453125,
446
  "learning_rate": 2.4658967003304986e-05,
447
+ "loss": 0.1908,
448
+ "num_input_tokens_seen": 72880736,
449
  "step": 220
450
  },
451
  {
452
  "epoch": 0.4618556701030928,
453
+ "grad_norm": 1.0,
454
  "learning_rate": 2.4127121429800498e-05,
455
+ "loss": 0.187,
456
+ "num_input_tokens_seen": 74122048,
457
  "step": 224
458
  },
459
  {
460
  "epoch": 0.47010309278350515,
461
+ "grad_norm": 0.9296875,
462
  "learning_rate": 2.3592209389741372e-05,
463
+ "loss": 0.1778,
464
+ "num_input_tokens_seen": 75602400,
465
  "step": 228
466
  },
467
  {
468
  "epoch": 0.47835051546391755,
469
+ "grad_norm": 1.0546875,
470
  "learning_rate": 2.30546283247606e-05,
471
  "loss": 0.207,
472
+ "num_input_tokens_seen": 76746240,
473
  "step": 232
474
  },
475
  {
476
  "epoch": 0.4865979381443299,
477
+ "grad_norm": 0.984375,
478
  "learning_rate": 2.251477765958655e-05,
479
+ "loss": 0.1911,
480
+ "num_input_tokens_seen": 78209744,
481
  "step": 236
482
  },
483
  {
484
  "epoch": 0.4948453608247423,
485
+ "grad_norm": 1.125,
486
  "learning_rate": 2.1973058505269007e-05,
487
+ "loss": 0.1935,
488
+ "num_input_tokens_seen": 79494896,
489
  "step": 240
490
  },
491
  {
492
  "epoch": 0.5030927835051546,
493
  "grad_norm": 1.0546875,
494
  "learning_rate": 2.1429873361152124e-05,
495
+ "loss": 0.1977,
496
+ "num_input_tokens_seen": 80721808,
497
  "step": 244
498
  },
499
  {
500
  "epoch": 0.511340206185567,
501
+ "grad_norm": 0.96875,
502
  "learning_rate": 2.088562581581592e-05,
503
+ "loss": 0.1956,
504
+ "num_input_tokens_seen": 81918944,
505
  "step": 248
506
  },
507
  {
508
  "epoch": 0.5195876288659794,
509
+ "grad_norm": 1.125,
510
  "learning_rate": 2.0340720247208447e-05,
511
+ "loss": 0.1912,
512
+ "num_input_tokens_seen": 83184064,
513
  "step": 252
514
  },
515
  {
516
  "epoch": 0.5278350515463918,
517
+ "grad_norm": 1.03125,
518
  "learning_rate": 1.9795561522191523e-05,
519
+ "loss": 0.1843,
520
+ "num_input_tokens_seen": 84574976,
521
  "step": 256
522
  },
523
  {
524
  "epoch": 0.5360824742268041,
525
+ "grad_norm": 1.0078125,
526
  "learning_rate": 1.9250554695723107e-05,
527
+ "loss": 0.1942,
528
+ "num_input_tokens_seen": 85844768,
529
  "step": 260
530
  },
531
  {
532
  "epoch": 0.5443298969072164,
533
+ "grad_norm": 0.9765625,
534
  "learning_rate": 1.8706104709899964e-05,
535
+ "loss": 0.1922,
536
+ "num_input_tokens_seen": 87245056,
537
  "step": 264
538
  },
539
  {
540
  "epoch": 0.5525773195876289,
541
+ "grad_norm": 1.0234375,
542
  "learning_rate": 1.816261609308419e-05,
543
+ "loss": 0.182,
544
+ "num_input_tokens_seen": 88603792,
545
  "step": 268
546
  },
547
  {
548
  "epoch": 0.5608247422680412,
549
+ "grad_norm": 0.90625,
550
  "learning_rate": 1.7620492659337155e-05,
551
+ "loss": 0.1879,
552
+ "num_input_tokens_seen": 90054816,
553
  "step": 272
554
  },
555
  {
556
  "epoch": 0.5690721649484536,
557
+ "grad_norm": 0.9453125,
558
  "learning_rate": 1.7080137208384122e-05,
559
+ "loss": 0.1809,
560
+ "num_input_tokens_seen": 91432912,
561
  "step": 276
562
  },
563
  {
564
  "epoch": 0.5773195876288659,
565
+ "grad_norm": 0.921875,
566
  "learning_rate": 1.6541951226332565e-05,
567
+ "loss": 0.1735,
568
+ "num_input_tokens_seen": 92795296,
569
  "step": 280
570
  },
571
  {
572
  "epoch": 0.5855670103092784,
573
+ "grad_norm": 0.9296875,
574
  "learning_rate": 1.600633458736653e-05,
575
+ "loss": 0.1915,
576
+ "num_input_tokens_seen": 94071744,
577
  "step": 284
578
  },
579
  {
580
  "epoch": 0.5938144329896907,
581
+ "grad_norm": 0.94921875,
582
  "learning_rate": 1.5473685256638572e-05,
583
+ "loss": 0.1895,
584
+ "num_input_tokens_seen": 95342096,
585
  "step": 288
586
  },
587
  {
588
  "epoch": 0.6020618556701031,
589
+ "grad_norm": 0.98828125,
590
  "learning_rate": 1.4944398994580232e-05,
591
+ "loss": 0.1869,
592
+ "num_input_tokens_seen": 96569312,
593
  "step": 292
594
  },
595
  {
596
  "epoch": 0.6103092783505155,
597
+ "grad_norm": 1.03125,
598
  "learning_rate": 1.4418869062850514e-05,
599
+ "loss": 0.2004,
600
+ "num_input_tokens_seen": 97849216,
601
  "step": 296
602
  },
603
  {
604
  "epoch": 0.6185567010309279,
605
+ "grad_norm": 0.93359375,
606
  "learning_rate": 1.3897485932141042e-05,
607
+ "loss": 0.1865,
608
+ "num_input_tokens_seen": 99083488,
609
  "step": 300
610
  },
611
  {
612
  "epoch": 0.6268041237113402,
613
+ "grad_norm": 0.91796875,
614
  "learning_rate": 1.3380636992054878e-05,
615
+ "loss": 0.1769,
616
+ "num_input_tokens_seen": 100566624,
617
  "step": 304
618
  },
619
  {
620
  "epoch": 0.6350515463917525,
621
+ "grad_norm": 0.86328125,
622
  "learning_rate": 1.2868706263274602e-05,
623
+ "loss": 0.1969,
624
+ "num_input_tokens_seen": 101823872,
625
  "step": 308
626
  },
627
  {
628
  "epoch": 0.643298969072165,
629
+ "grad_norm": 0.96484375,
630
  "learning_rate": 1.236207411223353e-05,
631
+ "loss": 0.1767,
632
+ "num_input_tokens_seen": 103284176,
633
  "step": 312
634
  },
635
  {
636
  "epoch": 0.6515463917525773,
637
+ "grad_norm": 0.83984375,
638
  "learning_rate": 1.1861116968502015e-05,
639
+ "loss": 0.1799,
640
+ "num_input_tokens_seen": 104567360,
641
  "step": 316
642
  },
643
  {
644
  "epoch": 0.6597938144329897,
645
+ "grad_norm": 0.92578125,
646
  "learning_rate": 1.136620704509892e-05,
647
+ "loss": 0.1856,
648
+ "num_input_tokens_seen": 105872848,
649
  "step": 320
650
  },
651
  {
652
  "epoch": 0.668041237113402,
653
+ "grad_norm": 0.86328125,
654
  "learning_rate": 1.087771206193593e-05,
655
+ "loss": 0.1791,
656
+ "num_input_tokens_seen": 107217232,
657
  "step": 324
658
  },
659
  {
660
  "epoch": 0.6762886597938145,
661
+ "grad_norm": 0.9609375,
662
  "learning_rate": 1.0395994972600285e-05,
663
+ "loss": 0.1806,
664
+ "num_input_tokens_seen": 108626976,
665
  "step": 328
666
  },
667
  {
668
  "epoch": 0.6845360824742268,
669
+ "grad_norm": 0.9609375,
670
  "learning_rate": 9.921413694678959e-06,
671
+ "loss": 0.2018,
672
+ "num_input_tokens_seen": 109754000,
673
  "step": 332
674
  },
675
  {
676
  "epoch": 0.6927835051546392,
677
+ "grad_norm": 0.94921875,
678
  "learning_rate": 9.454320843824512e-06,
679
+ "loss": 0.1848,
680
+ "num_input_tokens_seen": 111026592,
681
  "step": 336
682
  },
683
  {
684
  "epoch": 0.7010309278350515,
685
+ "grad_norm": 0.96484375,
686
  "learning_rate": 8.995063471760377e-06,
687
+ "loss": 0.1885,
688
+ "num_input_tokens_seen": 112287760,
689
  "step": 340
690
  },
691
  {
692
  "epoch": 0.709278350515464,
693
+ "grad_norm": 0.98828125,
694
  "learning_rate": 8.543982808420156e-06,
695
+ "loss": 0.1838,
696
+ "num_input_tokens_seen": 113634128,
697
  "step": 344
698
  },
699
  {
700
  "epoch": 0.7175257731958763,
701
+ "grad_norm": 0.984375,
702
  "learning_rate": 8.101414008412469e-06,
703
+ "loss": 0.1842,
704
+ "num_input_tokens_seen": 114949760,
705
  "step": 348
706
  },
707
  {
708
  "epoch": 0.7257731958762886,
709
+ "grad_norm": 0.96484375,
710
  "learning_rate": 7.667685901999875e-06,
711
+ "loss": 0.1935,
712
+ "num_input_tokens_seen": 116223648,
713
  "step": 352
714
  },
715
  {
716
  "epoch": 0.734020618556701,
717
+ "grad_norm": 0.86328125,
718
  "learning_rate": 7.24312075077674e-06,
719
+ "loss": 0.1866,
720
+ "num_input_tokens_seen": 117618112,
721
  "step": 356
722
  },
723
  {
724
  "epoch": 0.7422680412371134,
725
+ "grad_norm": 0.875,
726
  "learning_rate": 6.828034008227678e-06,
727
+ "loss": 0.1751,
728
+ "num_input_tokens_seen": 119000256,
729
  "step": 360
730
  },
731
  {
732
  "epoch": 0.7505154639175258,
733
+ "grad_norm": 0.9765625,
734
  "learning_rate": 6.422734085344464e-06,
735
+ "loss": 0.1796,
736
+ "num_input_tokens_seen": 120232672,
737
  "step": 364
738
  },
739
  {
740
  "epoch": 0.7587628865979381,
741
+ "grad_norm": 0.83984375,
742
  "learning_rate": 6.027522121475482e-06,
743
+ "loss": 0.1783,
744
+ "num_input_tokens_seen": 121499376,
745
  "step": 368
746
  },
747
  {
748
  "epoch": 0.7670103092783506,
749
  "grad_norm": 0.95703125,
750
  "learning_rate": 5.642691760578116e-06,
751
+ "loss": 0.1856,
752
+ "num_input_tokens_seen": 122791312,
753
  "step": 372
754
  },
755
  {
756
  "epoch": 0.7752577319587629,
757
+ "grad_norm": 0.85546875,
758
  "learning_rate": 5.268528933040147e-06,
759
+ "loss": 0.1674,
760
+ "num_input_tokens_seen": 124261040,
761
  "step": 376
762
  },
763
  {
764
  "epoch": 0.7835051546391752,
765
+ "grad_norm": 0.82421875,
766
  "learning_rate": 4.905311643232464e-06,
767
+ "loss": 0.1773,
768
+ "num_input_tokens_seen": 125708848,
769
  "step": 380
770
  },
771
  {
772
  "epoch": 0.7917525773195876,
773
+ "grad_norm": 0.9375,
774
  "learning_rate": 4.553309762950739e-06,
775
+ "loss": 0.1905,
776
+ "num_input_tokens_seen": 126865712,
777
  "step": 384
778
  },
779
  {
780
  "epoch": 0.8,
781
+ "grad_norm": 0.99609375,
782
  "learning_rate": 4.212784830899725e-06,
783
+ "loss": 0.1793,
784
+ "num_input_tokens_seen": 128157040,
785
  "step": 388
786
  },
787
  {
788
  "epoch": 0.8082474226804124,
789
+ "grad_norm": 0.84375,
790
  "learning_rate": 3.8839898583689725e-06,
791
+ "loss": 0.1812,
792
+ "num_input_tokens_seen": 129465312,
793
  "step": 392
794
  },
795
  {
796
  "epoch": 0.8164948453608247,
797
+ "grad_norm": 0.890625,
798
  "learning_rate": 3.567169141244562e-06,
799
+ "loss": 0.1813,
800
+ "num_input_tokens_seen": 130665504,
801
  "step": 396
802
  },
803
  {
804
  "epoch": 0.8247422680412371,
805
+ "grad_norm": 0.92578125,
806
  "learning_rate": 3.262558078496301e-06,
807
+ "loss": 0.1727,
808
+ "num_input_tokens_seen": 132002896,
809
  "step": 400
810
  },
811
  {
812
  "epoch": 0.8329896907216495,
813
+ "grad_norm": 0.8828125,
814
  "learning_rate": 2.9703829972754407e-06,
815
+ "loss": 0.1858,
816
+ "num_input_tokens_seen": 133415088,
817
  "step": 404
818
  },
819
  {
820
  "epoch": 0.8412371134020619,
821
+ "grad_norm": 0.90234375,
822
  "learning_rate": 2.69086098475277e-06,
823
+ "loss": 0.1707,
824
+ "num_input_tokens_seen": 134815184,
825
  "step": 408
826
  },
827
  {
828
  "epoch": 0.8494845360824742,
829
+ "grad_norm": 0.97265625,
830
  "learning_rate": 2.4241997268220096e-06,
831
+ "loss": 0.1822,
832
+ "num_input_tokens_seen": 136261472,
833
  "step": 412
834
  },
835
  {
836
  "epoch": 0.8577319587628865,
837
+ "grad_norm": 0.98828125,
838
  "learning_rate": 2.1705973537884615e-06,
839
+ "loss": 0.1809,
840
+ "num_input_tokens_seen": 137429504,
841
  "step": 416
842
  },
843
  {
844
  "epoch": 0.865979381443299,
845
+ "grad_norm": 0.9375,
846
  "learning_rate": 1.9302422931574183e-06,
847
+ "loss": 0.1885,
848
+ "num_input_tokens_seen": 138708544,
849
  "step": 420
850
  },
851
  {
852
  "epoch": 0.8742268041237113,
853
  "grad_norm": 0.83984375,
854
  "learning_rate": 1.7033131296318473e-06,
855
+ "loss": 0.1687,
856
+ "num_input_tokens_seen": 140033024,
857
  "step": 424
858
  },
859
  {
860
  "epoch": 0.8824742268041237,
861
+ "grad_norm": 0.859375,
862
  "learning_rate": 1.4899784724232968e-06,
863
+ "loss": 0.1748,
864
+ "num_input_tokens_seen": 141348192,
865
  "step": 428
866
  },
867
  {
868
  "epoch": 0.8907216494845361,
869
+ "grad_norm": 0.859375,
870
  "learning_rate": 1.2903968299746094e-06,
871
+ "loss": 0.1716,
872
+ "num_input_tokens_seen": 142800048,
873
  "step": 432
874
  },
875
  {
876
  "epoch": 0.8989690721649485,
877
+ "grad_norm": 0.94140625,
878
  "learning_rate": 1.104716492187574e-06,
879
+ "loss": 0.1841,
880
+ "num_input_tokens_seen": 144156592,
881
  "step": 436
882
  },
883
  {
884
  "epoch": 0.9072164948453608,
885
+ "grad_norm": 0.984375,
886
  "learning_rate": 9.330754202429726e-07,
887
+ "loss": 0.1855,
888
+ "num_input_tokens_seen": 145334944,
889
  "step": 440
890
  },
891
  {
892
  "epoch": 0.9154639175257732,
893
+ "grad_norm": 1.0625,
894
  "learning_rate": 7.756011440948996e-07,
895
+ "loss": 0.1895,
896
+ "num_input_tokens_seen": 146529728,
897
  "step": 444
898
  },
899
  {
900
  "epoch": 0.9237113402061856,
901
+ "grad_norm": 1.03125,
902
  "learning_rate": 6.324106677155573e-07,
903
+ "loss": 0.1841,
904
+ "num_input_tokens_seen": 147823952,
905
  "step": 448
906
  },
907
  {
908
  "epoch": 0.931958762886598,
909
  "grad_norm": 0.88671875,
910
  "learning_rate": 5.036103821608485e-07,
911
+ "loss": 0.1838,
912
+ "num_input_tokens_seen": 149194048,
913
  "step": 452
914
  },
915
  {
916
  "epoch": 0.9402061855670103,
917
+ "grad_norm": 0.83984375,
918
  "learning_rate": 3.892959865214363e-07,
919
+ "loss": 0.1773,
920
+ "num_input_tokens_seen": 150529248,
921
  "step": 456
922
  },
923
  {
924
  "epoch": 0.9484536082474226,
925
+ "grad_norm": 1.1640625,
926
  "learning_rate": 2.8955241681795534e-07,
927
+ "loss": 0.1863,
928
+ "num_input_tokens_seen": 151864336,
929
  "step": 460
930
  },
931
  {
932
  "epoch": 0.9567010309278351,
933
+ "grad_norm": 0.765625,
934
  "learning_rate": 2.044537828932458e-07,
935
+ "loss": 0.1803,
936
+ "num_input_tokens_seen": 153203872,
937
  "step": 464
938
  },
939
  {
940
  "epoch": 0.9649484536082474,
941
+ "grad_norm": 0.8828125,
942
  "learning_rate": 1.3406331334845813e-07,
943
+ "loss": 0.1869,
944
+ "num_input_tokens_seen": 154513568,
945
  "step": 468
946
  },
947
  {
948
  "epoch": 0.9731958762886598,
949
+ "grad_norm": 0.8046875,
950
  "learning_rate": 7.843330856396103e-08,
951
+ "loss": 0.1818,
952
+ "num_input_tokens_seen": 155739584,
953
  "step": 472
954
  },
955
  {
956
  "epoch": 0.9814432989690721,
957
+ "grad_norm": 0.89453125,
958
  "learning_rate": 3.760510183997701e-08,
959
+ "loss": 0.1826,
960
+ "num_input_tokens_seen": 157087344,
961
  "step": 476
962
  },
963
  {
964
  "epoch": 0.9896907216494846,
965
+ "grad_norm": 0.8984375,
966
  "learning_rate": 1.160902868577951e-08,
967
+ "loss": 0.1905,
968
+ "num_input_tokens_seen": 158352288,
969
  "step": 480
970
  },
971
  {
972
  "epoch": 0.9979381443298969,
973
+ "grad_norm": 0.86328125,
974
  "learning_rate": 4.64404280295927e-10,
975
+ "loss": 0.1794,
976
+ "num_input_tokens_seen": 159671536,
977
  "step": 484
978
  },
979
  {
980
  "epoch": 1.0,
981
+ "eval_loss": 0.09546061605215073,
982
+ "eval_runtime": 83.6615,
983
+ "eval_samples_per_second": 12.419,
984
+ "eval_steps_per_second": 0.394,
985
+ "num_input_tokens_seen": 160044208,
986
  "step": 485
987
  },
988
  {
989
  "epoch": 1.0,
990
+ "num_input_tokens_seen": 160044208,
991
  "step": 485,
992
+ "total_flos": 9.013493089079132e+17,
993
+ "train_loss": 0.20329311268845784,
994
+ "train_runtime": 14400.7959,
995
+ "train_samples_per_second": 4.305,
996
  "train_steps_per_second": 0.034,
997
+ "train_tokens_per_second": 1385.946
998
  }
999
  ],
1000
  "logging_steps": 4,
1001
  "max_steps": 485,
1002
+ "num_input_tokens_seen": 160044208,
1003
  "num_train_epochs": 1,
1004
  "save_steps": 0,
1005
  "stateful_callbacks": {
 
1014
  "attributes": {}
1015
  }
1016
  },
1017
+ "total_flos": 9.013493089079132e+17,
1018
  "train_batch_size": 16,
1019
  "trial_name": null,
1020
  "trial_params": null