yangwang825 commited on
Commit
ddf2069
·
verified ·
1 Parent(s): 770dac0

End of training

Browse files
Files changed (6) hide show
  1. README.md +18 -4
  2. all_results.json +10 -10
  3. eval_results.json +5 -5
  4. log_history.json +1319 -1137
  5. train_results.json +5 -5
  6. trainer_state.json +1324 -1142
README.md CHANGED
@@ -1,6 +1,7 @@
1
  ---
2
  library_name: transformers
3
  tags:
 
4
  - generated_from_trainer
5
  datasets:
6
  - voxceleb
@@ -8,7 +9,20 @@ metrics:
8
  - accuracy
9
  model-index:
10
  - name: ecapa-tdnn-voxceleb1-c512-aam
11
- results: []
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  ---
13
 
14
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -16,10 +30,10 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # ecapa-tdnn-voxceleb1-c512-aam
18
 
19
- This model is a fine-tuned version of [](https://huggingface.co/) on the voxceleb dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: nan
22
- - Accuracy: 0.0007
23
 
24
  ## Model description
25
 
 
1
  ---
2
  library_name: transformers
3
  tags:
4
+ - audio-classification
5
  - generated_from_trainer
6
  datasets:
7
  - voxceleb
 
9
  - accuracy
10
  model-index:
11
  - name: ecapa-tdnn-voxceleb1-c512-aam
12
+ results:
13
+ - task:
14
+ name: Audio Classification
15
+ type: audio-classification
16
+ dataset:
17
+ name: confit/voxceleb
18
+ type: voxceleb
19
+ config: verification
20
+ split: train
21
+ args: verification
22
+ metrics:
23
+ - name: Accuracy
24
+ type: accuracy
25
+ value: 0.9757901815736382
26
  ---
27
 
28
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
30
 
31
  # ecapa-tdnn-voxceleb1-c512-aam
32
 
33
+ This model is a fine-tuned version of [](https://huggingface.co/) on the confit/voxceleb dataset.
34
  It achieves the following results on the evaluation set:
35
+ - Loss: 0.5840
36
+ - Accuracy: 0.9758
37
 
38
  ## Model description
39
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.8030272452068618,
4
- "eval_loss": 4.700281620025635,
5
- "eval_runtime": 77.9967,
6
- "eval_samples_per_second": 190.585,
7
- "eval_steps_per_second": 190.585,
8
- "total_flos": 2.49073133395968e+18,
9
- "train_loss": 7.888943860726876,
10
- "train_runtime": 28748.4048,
11
- "train_samples_per_second": 46.534,
12
- "train_steps_per_second": 0.182
13
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.9757901815736382,
4
+ "eval_loss": 0.5840117335319519,
5
+ "eval_runtime": 13.1656,
6
+ "eval_samples_per_second": 112.946,
7
+ "eval_steps_per_second": 112.946,
8
+ "total_flos": 2.7398100529152e+18,
9
+ "train_loss": 2.9414075751926587,
10
+ "train_runtime": 59857.6179,
11
+ "train_samples_per_second": 24.584,
12
+ "train_steps_per_second": 0.096
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "eval_accuracy": 0.8030272452068618,
4
- "eval_loss": 4.700281620025635,
5
- "eval_runtime": 77.9967,
6
- "eval_samples_per_second": 190.585,
7
- "eval_steps_per_second": 190.585
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "eval_accuracy": 0.9757901815736382,
4
+ "eval_loss": 0.5840117335319519,
5
+ "eval_runtime": 13.1656,
6
+ "eval_samples_per_second": 112.946,
7
+ "eval_steps_per_second": 112.946
8
  }
log_history.json CHANGED
@@ -1,1928 +1,2110 @@
1
  [
2
  {
3
- "loss": 13.2232,
4
- "grad_norm": 6.157718181610107,
5
- "learning_rate": 3.824091778202677e-06,
6
- "epoch": 0.03824091778202677,
7
  "step": 20
8
  },
9
  {
10
- "loss": 13.2113,
11
- "grad_norm": 6.144223213195801,
12
- "learning_rate": 7.648183556405354e-06,
13
- "epoch": 0.07648183556405354,
14
  "step": 40
15
  },
16
  {
17
- "loss": 13.1625,
18
- "grad_norm": 6.032691955566406,
19
- "learning_rate": 1.147227533460803e-05,
20
- "epoch": 0.1147227533460803,
21
  "step": 60
22
  },
23
  {
24
- "loss": 13.1174,
25
- "grad_norm": 5.916826248168945,
26
- "learning_rate": 1.529636711281071e-05,
27
- "epoch": 0.15296367112810708,
28
  "step": 80
29
  },
30
  {
31
- "loss": 13.0512,
32
- "grad_norm": 5.7198004722595215,
33
- "learning_rate": 1.9120458891013384e-05,
34
- "epoch": 0.19120458891013384,
35
  "step": 100
36
  },
37
  {
38
- "loss": 12.9931,
39
- "grad_norm": 5.554529666900635,
40
- "learning_rate": 2.294455066921606e-05,
41
- "epoch": 0.2294455066921606,
42
  "step": 120
43
  },
44
  {
45
- "loss": 12.9042,
46
- "grad_norm": 5.364482879638672,
47
- "learning_rate": 2.6768642447418742e-05,
48
- "epoch": 0.2676864244741874,
49
  "step": 140
50
  },
51
  {
52
- "loss": 12.8488,
53
- "grad_norm": 5.091818809509277,
54
- "learning_rate": 3.059273422562142e-05,
55
- "epoch": 0.30592734225621415,
56
  "step": 160
57
  },
58
  {
59
- "loss": 12.7715,
60
- "grad_norm": 5.035643577575684,
61
- "learning_rate": 3.441682600382409e-05,
62
- "epoch": 0.3441682600382409,
63
  "step": 180
64
  },
65
  {
66
- "loss": 12.6747,
67
- "grad_norm": 4.819056987762451,
68
- "learning_rate": 3.824091778202677e-05,
69
- "epoch": 0.3824091778202677,
70
  "step": 200
71
  },
72
  {
73
- "loss": 12.6366,
74
- "grad_norm": 4.597919464111328,
75
- "learning_rate": 4.2065009560229444e-05,
76
- "epoch": 0.42065009560229444,
77
  "step": 220
78
  },
79
  {
80
- "loss": 12.5388,
81
- "grad_norm": 4.551054954528809,
82
- "learning_rate": 4.588910133843212e-05,
83
- "epoch": 0.4588910133843212,
84
  "step": 240
85
  },
86
  {
87
- "loss": 12.4527,
88
- "grad_norm": 4.289029598236084,
89
- "learning_rate": 4.97131931166348e-05,
90
- "epoch": 0.497131931166348,
91
  "step": 260
92
  },
93
  {
94
- "loss": 12.3809,
95
- "grad_norm": 4.291126728057861,
96
- "learning_rate": 5.3537284894837484e-05,
97
- "epoch": 0.5353728489483748,
98
  "step": 280
99
  },
100
  {
101
- "loss": 12.3185,
102
- "grad_norm": 4.090356826782227,
103
- "learning_rate": 5.736137667304016e-05,
104
- "epoch": 0.5736137667304015,
105
  "step": 300
106
  },
107
  {
108
- "loss": 12.2101,
109
- "grad_norm": 3.9066805839538574,
110
- "learning_rate": 6.118546845124283e-05,
111
- "epoch": 0.6118546845124283,
112
  "step": 320
113
  },
114
  {
115
- "loss": 12.1255,
116
- "grad_norm": 3.937908887863159,
117
- "learning_rate": 6.50095602294455e-05,
118
- "epoch": 0.6500956022944551,
119
  "step": 340
120
  },
121
  {
122
- "loss": 12.0543,
123
- "grad_norm": 3.919820547103882,
124
- "learning_rate": 6.883365200764819e-05,
125
- "epoch": 0.6883365200764818,
126
  "step": 360
127
  },
128
  {
129
- "loss": 11.9417,
130
- "grad_norm": 3.8298187255859375,
131
- "learning_rate": 7.265774378585087e-05,
132
- "epoch": 0.7265774378585086,
133
  "step": 380
134
  },
135
  {
136
- "loss": 11.8644,
137
- "grad_norm": 3.7290520668029785,
138
- "learning_rate": 7.648183556405354e-05,
139
- "epoch": 0.7648183556405354,
140
  "step": 400
141
  },
142
  {
143
- "loss": 11.8122,
144
- "grad_norm": 3.76938533782959,
145
- "learning_rate": 8.030592734225622e-05,
146
- "epoch": 0.8030592734225621,
147
  "step": 420
148
  },
149
  {
150
- "loss": 11.7117,
151
- "grad_norm": 3.8729827404022217,
152
- "learning_rate": 8.413001912045889e-05,
153
- "epoch": 0.8413001912045889,
154
  "step": 440
155
  },
156
  {
157
- "loss": 11.6245,
158
- "grad_norm": 3.7178924083709717,
159
- "learning_rate": 8.795411089866157e-05,
160
- "epoch": 0.8795411089866156,
161
  "step": 460
162
  },
163
  {
164
- "loss": 11.547,
165
- "grad_norm": 3.7744827270507812,
166
- "learning_rate": 9.177820267686424e-05,
167
- "epoch": 0.9177820267686424,
168
  "step": 480
169
  },
170
  {
171
- "loss": 11.4699,
172
- "grad_norm": 3.6705052852630615,
173
- "learning_rate": 9.560229445506692e-05,
174
- "epoch": 0.9560229445506692,
175
  "step": 500
176
  },
177
  {
178
- "loss": 11.3851,
179
- "grad_norm": 3.6992719173431396,
180
- "learning_rate": 9.94263862332696e-05,
181
- "epoch": 0.994263862332696,
182
  "step": 520
183
  },
184
  {
185
- "eval_loss": 11.029301643371582,
186
- "eval_accuracy": 0.18062563067608475,
187
- "eval_runtime": 592.6353,
188
- "eval_samples_per_second": 25.083,
189
- "eval_steps_per_second": 25.083,
190
- "epoch": 1.0,
191
- "step": 523
192
- },
193
- {
194
- "loss": 11.2589,
195
- "grad_norm": 3.6838159561157227,
196
- "learning_rate": 9.963883577650308e-05,
197
- "epoch": 1.0325047801147227,
198
  "step": 540
199
  },
200
  {
201
- "loss": 11.1668,
202
- "grad_norm": 3.7846293449401855,
203
- "learning_rate": 9.921393669003612e-05,
204
- "epoch": 1.0707456978967496,
205
  "step": 560
206
  },
207
  {
208
- "loss": 11.1053,
209
- "grad_norm": 3.688416004180908,
210
- "learning_rate": 9.878903760356916e-05,
211
- "epoch": 1.1089866156787762,
 
 
 
 
 
 
 
 
 
212
  "step": 580
213
  },
214
  {
215
- "loss": 11.019,
216
- "grad_norm": 3.724273204803467,
217
- "learning_rate": 9.836413851710219e-05,
218
- "epoch": 1.147227533460803,
219
  "step": 600
220
  },
221
  {
222
- "loss": 10.9731,
223
- "grad_norm": 3.840388536453247,
224
- "learning_rate": 9.793923943063523e-05,
225
- "epoch": 1.1854684512428297,
226
  "step": 620
227
  },
228
  {
229
- "loss": 10.875,
230
- "grad_norm": 3.828228235244751,
231
- "learning_rate": 9.751434034416827e-05,
232
- "epoch": 1.2237093690248566,
233
  "step": 640
234
  },
235
  {
236
- "loss": 10.8111,
237
- "grad_norm": 3.891911745071411,
238
- "learning_rate": 9.70894412577013e-05,
239
- "epoch": 1.2619502868068833,
240
  "step": 660
241
  },
242
  {
243
- "loss": 10.7717,
244
- "grad_norm": 3.8076562881469727,
245
- "learning_rate": 9.666454217123433e-05,
246
- "epoch": 1.3001912045889101,
247
  "step": 680
248
  },
249
  {
250
- "loss": 10.6723,
251
- "grad_norm": 3.8521881103515625,
252
- "learning_rate": 9.623964308476737e-05,
253
- "epoch": 1.338432122370937,
254
  "step": 700
255
  },
256
  {
257
- "loss": 10.5961,
258
- "grad_norm": 3.8576488494873047,
259
- "learning_rate": 9.58147439983004e-05,
260
- "epoch": 1.3766730401529637,
261
  "step": 720
262
  },
263
  {
264
- "loss": 10.5392,
265
- "grad_norm": 4.002715587615967,
266
- "learning_rate": 9.538984491183345e-05,
267
- "epoch": 1.4149139579349903,
268
  "step": 740
269
  },
270
  {
271
- "loss": 10.5018,
272
- "grad_norm": 3.8657026290893555,
273
- "learning_rate": 9.496494582536648e-05,
274
- "epoch": 1.4531548757170172,
275
  "step": 760
276
  },
277
  {
278
- "loss": 10.4325,
279
- "grad_norm": 3.9424169063568115,
280
- "learning_rate": 9.454004673889951e-05,
281
- "epoch": 1.491395793499044,
282
  "step": 780
283
  },
284
  {
285
- "loss": 10.3722,
286
- "grad_norm": 3.9783968925476074,
287
- "learning_rate": 9.411514765243256e-05,
288
- "epoch": 1.5296367112810707,
289
  "step": 800
290
  },
291
  {
292
- "loss": 10.3069,
293
- "grad_norm": 4.081951141357422,
294
- "learning_rate": 9.369024856596559e-05,
295
- "epoch": 1.5678776290630974,
296
  "step": 820
297
  },
298
  {
299
- "loss": 10.2527,
300
- "grad_norm": 4.141290187835693,
301
- "learning_rate": 9.326534947949863e-05,
302
- "epoch": 1.6061185468451242,
303
  "step": 840
304
  },
305
  {
306
- "loss": 10.2271,
307
- "grad_norm": 4.294083595275879,
308
- "learning_rate": 9.284045039303167e-05,
309
- "epoch": 1.644359464627151,
310
  "step": 860
311
  },
312
  {
313
- "loss": 10.1756,
314
- "grad_norm": 4.727543354034424,
315
- "learning_rate": 9.241555130656469e-05,
316
- "epoch": 1.682600382409178,
317
  "step": 880
318
  },
319
  {
320
- "loss": 10.0936,
321
- "grad_norm": 4.068965911865234,
322
- "learning_rate": 9.199065222009773e-05,
323
- "epoch": 1.7208413001912046,
324
  "step": 900
325
  },
326
  {
327
- "loss": 10.0937,
328
- "grad_norm": 4.025643825531006,
329
- "learning_rate": 9.156575313363077e-05,
330
- "epoch": 1.7590822179732313,
331
  "step": 920
332
  },
333
  {
334
- "loss": 10.0217,
335
- "grad_norm": 4.317354679107666,
336
- "learning_rate": 9.11408540471638e-05,
337
- "epoch": 1.7973231357552581,
338
  "step": 940
339
  },
340
  {
341
- "loss": 9.9743,
342
- "grad_norm": 4.101060390472412,
343
- "learning_rate": 9.071595496069684e-05,
344
- "epoch": 1.835564053537285,
345
  "step": 960
346
  },
347
  {
348
- "loss": 9.9879,
349
- "grad_norm": 4.225609302520752,
350
- "learning_rate": 9.029105587422988e-05,
351
- "epoch": 1.8738049713193117,
352
  "step": 980
353
  },
354
  {
355
- "loss": 9.8273,
356
- "grad_norm": 4.3140668869018555,
357
- "learning_rate": 8.986615678776292e-05,
358
- "epoch": 1.9120458891013383,
359
  "step": 1000
360
  },
361
  {
362
- "loss": 9.8136,
363
- "grad_norm": 4.199500560760498,
364
- "learning_rate": 8.944125770129594e-05,
365
- "epoch": 1.9502868068833652,
366
  "step": 1020
367
  },
368
  {
369
- "loss": 9.7596,
370
- "grad_norm": 4.457912445068359,
371
- "learning_rate": 8.901635861482898e-05,
372
- "epoch": 1.988527724665392,
373
  "step": 1040
374
  },
375
  {
376
- "eval_loss": 9.140138626098633,
377
- "eval_accuracy": 0.3849983181971073,
378
- "eval_runtime": 461.2724,
379
- "eval_samples_per_second": 32.226,
380
- "eval_steps_per_second": 32.226,
381
- "epoch": 2.0,
382
- "step": 1046
383
- },
384
- {
385
- "loss": 9.714,
386
- "grad_norm": 4.428006172180176,
387
- "learning_rate": 8.859145952836202e-05,
388
- "epoch": 2.026768642447419,
389
  "step": 1060
390
  },
391
  {
392
- "loss": 9.5508,
393
- "grad_norm": 4.372852325439453,
394
- "learning_rate": 8.816656044189505e-05,
395
- "epoch": 2.0650095602294454,
396
  "step": 1080
397
  },
398
  {
399
- "loss": 9.6096,
400
- "grad_norm": 4.381687641143799,
401
- "learning_rate": 8.774166135542809e-05,
402
- "epoch": 2.1032504780114722,
403
  "step": 1100
404
  },
405
  {
406
- "loss": 9.5077,
407
- "grad_norm": 4.5865631103515625,
408
- "learning_rate": 8.731676226896113e-05,
409
- "epoch": 2.141491395793499,
410
  "step": 1120
411
  },
412
  {
413
- "loss": 9.5044,
414
- "grad_norm": 4.363910675048828,
415
- "learning_rate": 8.689186318249416e-05,
416
- "epoch": 2.179732313575526,
417
  "step": 1140
418
  },
419
  {
420
- "loss": 9.4205,
421
- "grad_norm": 4.577084541320801,
422
- "learning_rate": 8.646696409602721e-05,
423
- "epoch": 2.2179732313575524,
 
 
 
 
 
 
 
 
 
424
  "step": 1160
425
  },
426
  {
427
- "loss": 9.4317,
428
- "grad_norm": 4.576254367828369,
429
- "learning_rate": 8.604206500956024e-05,
430
- "epoch": 2.2562141491395793,
431
  "step": 1180
432
  },
433
  {
434
- "loss": 9.3607,
435
- "grad_norm": 4.4399847984313965,
436
- "learning_rate": 8.561716592309326e-05,
437
- "epoch": 2.294455066921606,
438
  "step": 1200
439
  },
440
  {
441
- "loss": 9.2533,
442
- "grad_norm": 4.595015525817871,
443
- "learning_rate": 8.51922668366263e-05,
444
- "epoch": 2.332695984703633,
445
  "step": 1220
446
  },
447
  {
448
- "loss": 9.3384,
449
- "grad_norm": 4.900874614715576,
450
- "learning_rate": 8.476736775015934e-05,
451
- "epoch": 2.3709369024856595,
452
  "step": 1240
453
  },
454
  {
455
- "loss": 9.293,
456
- "grad_norm": 4.594742774963379,
457
- "learning_rate": 8.434246866369238e-05,
458
- "epoch": 2.4091778202676863,
459
  "step": 1260
460
  },
461
  {
462
- "loss": 9.1986,
463
- "grad_norm": 4.587216377258301,
464
- "learning_rate": 8.391756957722541e-05,
465
- "epoch": 2.447418738049713,
466
  "step": 1280
467
  },
468
  {
469
- "loss": 9.1358,
470
- "grad_norm": 4.735275745391846,
471
- "learning_rate": 8.349267049075845e-05,
472
- "epoch": 2.48565965583174,
473
  "step": 1300
474
  },
475
  {
476
- "loss": 9.1284,
477
- "grad_norm": 4.627840995788574,
478
- "learning_rate": 8.306777140429149e-05,
479
- "epoch": 2.5239005736137665,
480
  "step": 1320
481
  },
482
  {
483
- "loss": 9.0949,
484
- "grad_norm": 4.658718585968018,
485
- "learning_rate": 8.264287231782451e-05,
486
- "epoch": 2.5621414913957934,
487
  "step": 1340
488
  },
489
  {
490
- "loss": 9.0312,
491
- "grad_norm": 4.875549793243408,
492
- "learning_rate": 8.221797323135755e-05,
493
- "epoch": 2.6003824091778203,
494
  "step": 1360
495
  },
496
  {
497
- "loss": 8.9949,
498
- "grad_norm": 4.683437347412109,
499
- "learning_rate": 8.179307414489059e-05,
500
- "epoch": 2.638623326959847,
501
  "step": 1380
502
  },
503
  {
504
- "loss": 8.9705,
505
- "grad_norm": 4.861114025115967,
506
- "learning_rate": 8.136817505842362e-05,
507
- "epoch": 2.676864244741874,
508
  "step": 1400
509
  },
510
  {
511
- "loss": 8.9483,
512
- "grad_norm": 4.727562427520752,
513
- "learning_rate": 8.094327597195667e-05,
514
- "epoch": 2.7151051625239004,
515
  "step": 1420
516
  },
517
  {
518
- "loss": 8.9254,
519
- "grad_norm": 4.8202948570251465,
520
- "learning_rate": 8.05183768854897e-05,
521
- "epoch": 2.7533460803059273,
522
  "step": 1440
523
  },
524
  {
525
- "loss": 8.8768,
526
- "grad_norm": 4.926464557647705,
527
- "learning_rate": 8.009347779902273e-05,
528
- "epoch": 2.791586998087954,
529
  "step": 1460
530
  },
531
  {
532
- "loss": 8.8044,
533
- "grad_norm": 4.7756028175354,
534
- "learning_rate": 7.966857871255578e-05,
535
- "epoch": 2.8298279158699806,
536
  "step": 1480
537
  },
538
  {
539
- "loss": 8.7788,
540
- "grad_norm": 4.888403415679932,
541
- "learning_rate": 7.92436796260888e-05,
542
- "epoch": 2.8680688336520075,
543
  "step": 1500
544
  },
545
  {
546
- "loss": 8.8032,
547
- "grad_norm": 4.943230152130127,
548
- "learning_rate": 7.881878053962184e-05,
549
- "epoch": 2.9063097514340344,
550
  "step": 1520
551
  },
552
  {
553
- "loss": 8.7507,
554
- "grad_norm": 5.011119842529297,
555
- "learning_rate": 7.839388145315488e-05,
556
- "epoch": 2.9445506692160612,
557
  "step": 1540
558
  },
559
  {
560
- "loss": 8.7136,
561
- "grad_norm": 5.068637847900391,
562
- "learning_rate": 7.796898236668791e-05,
563
- "epoch": 2.982791586998088,
564
  "step": 1560
565
  },
566
  {
567
- "eval_loss": 7.882061958312988,
568
- "eval_accuracy": 0.52418432559704,
569
- "eval_runtime": 418.5795,
570
- "eval_samples_per_second": 35.513,
571
- "eval_steps_per_second": 35.513,
572
- "epoch": 3.0,
573
- "step": 1569
574
- },
575
- {
576
- "loss": 8.6104,
577
- "grad_norm": 4.895749092102051,
578
- "learning_rate": 7.754408328022095e-05,
579
- "epoch": 3.0210325047801145,
580
  "step": 1580
581
  },
582
  {
583
- "loss": 8.6136,
584
- "grad_norm": 5.138400077819824,
585
- "learning_rate": 7.711918419375399e-05,
586
- "epoch": 3.0592734225621414,
587
  "step": 1600
588
  },
589
  {
590
- "loss": 8.5866,
591
- "grad_norm": 5.270049571990967,
592
- "learning_rate": 7.669428510728702e-05,
593
- "epoch": 3.0975143403441683,
594
  "step": 1620
595
  },
596
  {
597
- "loss": 8.492,
598
- "grad_norm": 5.178355693817139,
599
- "learning_rate": 7.626938602082006e-05,
600
- "epoch": 3.135755258126195,
601
  "step": 1640
602
  },
603
  {
604
- "loss": 8.4897,
605
- "grad_norm": 5.312692165374756,
606
- "learning_rate": 7.58444869343531e-05,
607
- "epoch": 3.173996175908222,
608
  "step": 1660
609
  },
610
  {
611
- "loss": 8.4441,
612
- "grad_norm": 5.227985382080078,
613
- "learning_rate": 7.541958784788614e-05,
614
- "epoch": 3.2122370936902485,
615
  "step": 1680
616
  },
617
  {
618
- "loss": 8.4722,
619
- "grad_norm": 5.042078495025635,
620
- "learning_rate": 7.499468876141916e-05,
621
- "epoch": 3.2504780114722753,
622
  "step": 1700
623
  },
624
  {
625
- "loss": 8.3105,
626
- "grad_norm": 5.250526428222656,
627
- "learning_rate": 7.45697896749522e-05,
628
- "epoch": 3.288718929254302,
629
  "step": 1720
630
  },
631
  {
632
- "loss": 8.3308,
633
- "grad_norm": 5.22187614440918,
634
- "learning_rate": 7.414489058848524e-05,
635
- "epoch": 3.3269598470363286,
 
 
 
 
 
 
 
 
 
636
  "step": 1740
637
  },
638
  {
639
- "loss": 8.2969,
640
- "grad_norm": 5.491254806518555,
641
- "learning_rate": 7.371999150201827e-05,
642
- "epoch": 3.3652007648183555,
643
  "step": 1760
644
  },
645
  {
646
- "loss": 8.2593,
647
- "grad_norm": 5.482990741729736,
648
- "learning_rate": 7.329509241555131e-05,
649
- "epoch": 3.4034416826003824,
650
  "step": 1780
651
  },
652
  {
653
- "loss": 8.3087,
654
- "grad_norm": 5.359766960144043,
655
- "learning_rate": 7.287019332908435e-05,
656
- "epoch": 3.4416826003824093,
657
  "step": 1800
658
  },
659
  {
660
- "loss": 8.2664,
661
- "grad_norm": 5.788363456726074,
662
- "learning_rate": 7.244529424261737e-05,
663
- "epoch": 3.479923518164436,
664
  "step": 1820
665
  },
666
  {
667
- "loss": 8.2543,
668
- "grad_norm": 5.335551738739014,
669
- "learning_rate": 7.202039515615043e-05,
670
- "epoch": 3.5181644359464626,
671
  "step": 1840
672
  },
673
  {
674
- "loss": 8.2604,
675
- "grad_norm": 5.465627193450928,
676
- "learning_rate": 7.159549606968345e-05,
677
- "epoch": 3.5564053537284894,
678
  "step": 1860
679
  },
680
  {
681
- "loss": 8.1616,
682
- "grad_norm": 5.594823837280273,
683
- "learning_rate": 7.117059698321648e-05,
684
- "epoch": 3.5946462715105163,
685
  "step": 1880
686
  },
687
  {
688
- "loss": 8.1582,
689
- "grad_norm": 5.58858060836792,
690
- "learning_rate": 7.074569789674953e-05,
691
- "epoch": 3.632887189292543,
692
  "step": 1900
693
  },
694
  {
695
- "loss": 8.1061,
696
- "grad_norm": 5.514508247375488,
697
- "learning_rate": 7.032079881028256e-05,
698
- "epoch": 3.67112810707457,
699
  "step": 1920
700
  },
701
  {
702
- "loss": 8.0912,
703
- "grad_norm": 5.644900321960449,
704
- "learning_rate": 6.98958997238156e-05,
705
- "epoch": 3.7093690248565965,
706
  "step": 1940
707
  },
708
  {
709
- "loss": 7.9596,
710
- "grad_norm": 5.701168060302734,
711
- "learning_rate": 6.947100063734864e-05,
712
- "epoch": 3.7476099426386233,
713
  "step": 1960
714
  },
715
  {
716
- "loss": 8.0403,
717
- "grad_norm": 5.880733013153076,
718
- "learning_rate": 6.904610155088167e-05,
719
- "epoch": 3.78585086042065,
720
  "step": 1980
721
  },
722
  {
723
- "loss": 7.9666,
724
- "grad_norm": 5.638689994812012,
725
- "learning_rate": 6.86212024644147e-05,
726
- "epoch": 3.8240917782026767,
727
  "step": 2000
728
  },
729
  {
730
- "loss": 7.9633,
731
- "grad_norm": 6.002101421356201,
732
- "learning_rate": 6.819630337794775e-05,
733
- "epoch": 3.8623326959847035,
734
  "step": 2020
735
  },
736
  {
737
- "loss": 7.8817,
738
- "grad_norm": 5.628067493438721,
739
- "learning_rate": 6.777140429148077e-05,
740
- "epoch": 3.9005736137667304,
741
  "step": 2040
742
  },
743
  {
744
- "loss": 7.9118,
745
- "grad_norm": 6.128510475158691,
746
- "learning_rate": 6.734650520501381e-05,
747
- "epoch": 3.9388145315487573,
748
  "step": 2060
749
  },
750
  {
751
- "loss": 7.848,
752
- "grad_norm": 5.620929718017578,
753
- "learning_rate": 6.692160611854685e-05,
754
- "epoch": 3.977055449330784,
755
  "step": 2080
756
  },
757
  {
758
- "eval_loss": 6.945113658905029,
759
- "eval_accuracy": 0.6143962327615203,
760
- "eval_runtime": 367.1966,
761
- "eval_samples_per_second": 40.482,
762
- "eval_steps_per_second": 40.482,
763
- "epoch": 4.0,
764
- "step": 2092
765
- },
766
- {
767
- "loss": 7.8607,
768
- "grad_norm": 5.820804595947266,
769
- "learning_rate": 6.649670703207989e-05,
770
- "epoch": 4.015296367112811,
771
  "step": 2100
772
  },
773
  {
774
- "loss": 7.7072,
775
- "grad_norm": 5.6448493003845215,
776
- "learning_rate": 6.607180794561292e-05,
777
- "epoch": 4.053537284894838,
778
  "step": 2120
779
  },
780
  {
781
- "loss": 7.772,
782
- "grad_norm": 6.283373832702637,
783
- "learning_rate": 6.564690885914596e-05,
784
- "epoch": 4.091778202676864,
785
  "step": 2140
786
  },
787
  {
788
- "loss": 7.7211,
789
- "grad_norm": 6.125846862792969,
790
- "learning_rate": 6.5222009772679e-05,
791
- "epoch": 4.130019120458891,
792
  "step": 2160
793
  },
794
  {
795
- "loss": 7.6563,
796
- "grad_norm": 5.701002597808838,
797
- "learning_rate": 6.479711068621202e-05,
798
- "epoch": 4.168260038240918,
799
  "step": 2180
800
  },
801
  {
802
- "loss": 7.711,
803
- "grad_norm": 5.910340785980225,
804
- "learning_rate": 6.437221159974506e-05,
805
- "epoch": 4.2065009560229445,
806
  "step": 2200
807
  },
808
  {
809
- "loss": 7.7582,
810
- "grad_norm": 5.8003082275390625,
811
- "learning_rate": 6.39473125132781e-05,
812
- "epoch": 4.244741873804971,
813
  "step": 2220
814
  },
815
  {
816
- "loss": 7.6215,
817
- "grad_norm": 5.95621395111084,
818
- "learning_rate": 6.352241342681113e-05,
819
- "epoch": 4.282982791586998,
820
  "step": 2240
821
  },
822
  {
823
- "loss": 7.5932,
824
- "grad_norm": 5.836912155151367,
825
- "learning_rate": 6.309751434034417e-05,
826
- "epoch": 4.321223709369025,
827
  "step": 2260
828
  },
829
  {
830
- "loss": 7.5122,
831
- "grad_norm": 6.156320095062256,
832
- "learning_rate": 6.267261525387721e-05,
833
- "epoch": 4.359464627151052,
834
  "step": 2280
835
  },
836
  {
837
- "loss": 7.5488,
838
- "grad_norm": 5.937085151672363,
839
- "learning_rate": 6.224771616741024e-05,
840
- "epoch": 4.397705544933078,
841
  "step": 2300
842
  },
843
  {
844
- "loss": 7.5972,
845
- "grad_norm": 5.949016571044922,
846
- "learning_rate": 6.182281708094328e-05,
847
- "epoch": 4.435946462715105,
 
 
 
 
 
 
 
 
 
848
  "step": 2320
849
  },
850
  {
851
- "loss": 7.4327,
852
- "grad_norm": 6.26347541809082,
853
- "learning_rate": 6.139791799447631e-05,
854
- "epoch": 4.474187380497132,
855
  "step": 2340
856
  },
857
  {
858
- "loss": 7.555,
859
- "grad_norm": 6.376476287841797,
860
- "learning_rate": 6.097301890800935e-05,
861
- "epoch": 4.512428298279159,
862
  "step": 2360
863
  },
864
  {
865
- "loss": 7.5463,
866
- "grad_norm": 6.2988200187683105,
867
- "learning_rate": 6.054811982154238e-05,
868
- "epoch": 4.550669216061186,
869
  "step": 2380
870
  },
871
  {
872
- "loss": 7.4637,
873
- "grad_norm": 5.916903972625732,
874
- "learning_rate": 6.012322073507543e-05,
875
- "epoch": 4.588910133843212,
876
  "step": 2400
877
  },
878
  {
879
- "loss": 7.3857,
880
- "grad_norm": 5.896063327789307,
881
- "learning_rate": 5.969832164860846e-05,
882
- "epoch": 4.627151051625239,
883
  "step": 2420
884
  },
885
  {
886
- "loss": 7.4363,
887
- "grad_norm": 6.14431619644165,
888
- "learning_rate": 5.927342256214149e-05,
889
- "epoch": 4.665391969407266,
890
  "step": 2440
891
  },
892
  {
893
- "loss": 7.406,
894
- "grad_norm": 6.2994256019592285,
895
- "learning_rate": 5.8848523475674533e-05,
896
- "epoch": 4.7036328871892925,
897
  "step": 2460
898
  },
899
  {
900
- "loss": 7.338,
901
- "grad_norm": 6.134793758392334,
902
- "learning_rate": 5.8423624389207567e-05,
903
- "epoch": 4.741873804971319,
904
  "step": 2480
905
  },
906
  {
907
- "loss": 7.3912,
908
- "grad_norm": 6.245213031768799,
909
- "learning_rate": 5.79987253027406e-05,
910
- "epoch": 4.780114722753346,
911
  "step": 2500
912
  },
913
  {
914
- "loss": 7.3548,
915
- "grad_norm": 6.118636131286621,
916
- "learning_rate": 5.757382621627364e-05,
917
- "epoch": 4.818355640535373,
918
  "step": 2520
919
  },
920
  {
921
- "loss": 7.3119,
922
- "grad_norm": 6.391002178192139,
923
- "learning_rate": 5.714892712980667e-05,
924
- "epoch": 4.8565965583174,
925
  "step": 2540
926
  },
927
  {
928
- "loss": 7.2119,
929
- "grad_norm": 6.539446830749512,
930
- "learning_rate": 5.6724028043339705e-05,
931
- "epoch": 4.894837476099426,
932
  "step": 2560
933
  },
934
  {
935
- "loss": 7.2505,
936
- "grad_norm": 6.162653923034668,
937
- "learning_rate": 5.6299128956872745e-05,
938
- "epoch": 4.933078393881453,
939
  "step": 2580
940
  },
941
  {
942
- "loss": 7.1912,
943
- "grad_norm": 6.580591678619385,
944
- "learning_rate": 5.587422987040578e-05,
945
- "epoch": 4.97131931166348,
946
  "step": 2600
947
  },
948
  {
949
- "eval_loss": 6.262951850891113,
950
- "eval_accuracy": 0.6821392532795156,
951
- "eval_runtime": 76.4531,
952
- "eval_samples_per_second": 194.433,
953
- "eval_steps_per_second": 194.433,
954
- "epoch": 5.0,
955
- "step": 2615
956
- },
957
- {
958
- "loss": 7.1863,
959
- "grad_norm": 6.838705062866211,
960
- "learning_rate": 5.544933078393881e-05,
961
- "epoch": 5.009560229445507,
962
  "step": 2620
963
  },
964
  {
965
- "loss": 7.1259,
966
- "grad_norm": 6.260281562805176,
967
- "learning_rate": 5.502443169747186e-05,
968
- "epoch": 5.047801147227533,
969
  "step": 2640
970
  },
971
  {
972
- "loss": 7.1559,
973
- "grad_norm": 6.463006496429443,
974
- "learning_rate": 5.459953261100489e-05,
975
- "epoch": 5.08604206500956,
976
  "step": 2660
977
  },
978
  {
979
- "loss": 7.1318,
980
- "grad_norm": 6.499185562133789,
981
- "learning_rate": 5.4174633524537924e-05,
982
- "epoch": 5.124282982791587,
983
  "step": 2680
984
  },
985
  {
986
- "loss": 7.0993,
987
- "grad_norm": 6.508650302886963,
988
- "learning_rate": 5.3749734438070964e-05,
989
- "epoch": 5.162523900573614,
990
  "step": 2700
991
  },
992
  {
993
- "loss": 7.0823,
994
- "grad_norm": 6.573218822479248,
995
- "learning_rate": 5.3324835351604e-05,
996
- "epoch": 5.2007648183556405,
997
  "step": 2720
998
  },
999
  {
1000
- "loss": 7.0839,
1001
- "grad_norm": 6.863697052001953,
1002
- "learning_rate": 5.289993626513703e-05,
1003
- "epoch": 5.239005736137667,
1004
  "step": 2740
1005
  },
1006
  {
1007
- "loss": 7.0723,
1008
- "grad_norm": 6.305070877075195,
1009
- "learning_rate": 5.247503717867007e-05,
1010
- "epoch": 5.277246653919694,
1011
  "step": 2760
1012
  },
1013
  {
1014
- "loss": 6.9592,
1015
- "grad_norm": 6.715279579162598,
1016
- "learning_rate": 5.20501380922031e-05,
1017
- "epoch": 5.315487571701721,
1018
  "step": 2780
1019
  },
1020
  {
1021
- "loss": 7.0275,
1022
- "grad_norm": 6.625701904296875,
1023
- "learning_rate": 5.1625239005736136e-05,
1024
- "epoch": 5.353728489483748,
1025
  "step": 2800
1026
  },
1027
  {
1028
- "loss": 6.9146,
1029
- "grad_norm": 6.717496871948242,
1030
- "learning_rate": 5.120033991926918e-05,
1031
- "epoch": 5.3919694072657744,
1032
  "step": 2820
1033
  },
1034
  {
1035
- "loss": 6.9984,
1036
- "grad_norm": 6.500243186950684,
1037
- "learning_rate": 5.0775440832802216e-05,
1038
- "epoch": 5.430210325047801,
1039
  "step": 2840
1040
  },
1041
  {
1042
- "loss": 6.9367,
1043
- "grad_norm": 6.41347074508667,
1044
- "learning_rate": 5.035054174633524e-05,
1045
- "epoch": 5.468451242829828,
1046
  "step": 2860
1047
  },
1048
  {
1049
- "loss": 6.9997,
1050
- "grad_norm": 6.83429479598999,
1051
- "learning_rate": 4.992564265986828e-05,
1052
- "epoch": 5.506692160611855,
 
 
 
 
 
 
 
 
 
1053
  "step": 2880
1054
  },
1055
  {
1056
- "loss": 6.9204,
1057
- "grad_norm": 6.565597057342529,
1058
- "learning_rate": 4.950074357340132e-05,
1059
- "epoch": 5.544933078393882,
1060
  "step": 2900
1061
  },
1062
  {
1063
- "loss": 6.8926,
1064
- "grad_norm": 6.9456095695495605,
1065
- "learning_rate": 4.907584448693436e-05,
1066
- "epoch": 5.583173996175908,
1067
  "step": 2920
1068
  },
1069
  {
1070
- "loss": 6.8993,
1071
- "grad_norm": 7.052099704742432,
1072
- "learning_rate": 4.865094540046739e-05,
1073
- "epoch": 5.621414913957935,
1074
  "step": 2940
1075
  },
1076
  {
1077
- "loss": 6.8474,
1078
- "grad_norm": 7.128490924835205,
1079
- "learning_rate": 4.822604631400043e-05,
1080
- "epoch": 5.659655831739962,
1081
  "step": 2960
1082
  },
1083
  {
1084
- "loss": 6.8509,
1085
- "grad_norm": 6.792144298553467,
1086
- "learning_rate": 4.780114722753346e-05,
1087
- "epoch": 5.6978967495219885,
1088
  "step": 2980
1089
  },
1090
  {
1091
- "loss": 6.9141,
1092
- "grad_norm": 6.853285312652588,
1093
- "learning_rate": 4.73762481410665e-05,
1094
- "epoch": 5.736137667304015,
1095
  "step": 3000
1096
  },
1097
  {
1098
- "loss": 6.7391,
1099
- "grad_norm": 7.153258800506592,
1100
- "learning_rate": 4.695134905459953e-05,
1101
- "epoch": 5.774378585086042,
1102
  "step": 3020
1103
  },
1104
  {
1105
- "loss": 6.7554,
1106
- "grad_norm": 6.9271321296691895,
1107
- "learning_rate": 4.6526449968132566e-05,
1108
- "epoch": 5.812619502868069,
1109
  "step": 3040
1110
  },
1111
  {
1112
- "loss": 6.8172,
1113
- "grad_norm": 7.218133926391602,
1114
- "learning_rate": 4.6101550881665606e-05,
1115
- "epoch": 5.850860420650095,
1116
  "step": 3060
1117
  },
1118
  {
1119
- "loss": 6.8442,
1120
- "grad_norm": 7.0558695793151855,
1121
- "learning_rate": 4.5676651795198646e-05,
1122
- "epoch": 5.8891013384321225,
1123
  "step": 3080
1124
  },
1125
  {
1126
- "loss": 6.696,
1127
- "grad_norm": 6.762065887451172,
1128
- "learning_rate": 4.525175270873168e-05,
1129
- "epoch": 5.927342256214149,
1130
  "step": 3100
1131
  },
1132
  {
1133
- "loss": 6.6763,
1134
- "grad_norm": 6.8173604011535645,
1135
- "learning_rate": 4.482685362226471e-05,
1136
- "epoch": 5.965583173996176,
1137
  "step": 3120
1138
  },
1139
  {
1140
- "eval_loss": 5.7182440757751465,
1141
- "eval_accuracy": 0.7291624621594349,
1142
- "eval_runtime": 444.003,
1143
- "eval_samples_per_second": 33.48,
1144
- "eval_steps_per_second": 33.48,
1145
- "epoch": 6.0,
1146
- "step": 3138
1147
- },
1148
- {
1149
- "loss": 6.6927,
1150
- "grad_norm": 7.1014723777771,
1151
- "learning_rate": 4.440195453579775e-05,
1152
- "epoch": 6.003824091778203,
1153
  "step": 3140
1154
  },
1155
  {
1156
- "loss": 6.6538,
1157
- "grad_norm": 6.958450794219971,
1158
- "learning_rate": 4.3977055449330785e-05,
1159
- "epoch": 6.042065009560229,
1160
  "step": 3160
1161
  },
1162
  {
1163
- "loss": 6.5479,
1164
- "grad_norm": 6.920003890991211,
1165
- "learning_rate": 4.3552156362863825e-05,
1166
- "epoch": 6.080305927342256,
1167
  "step": 3180
1168
  },
1169
  {
1170
- "loss": 6.5668,
1171
- "grad_norm": 7.053244113922119,
1172
- "learning_rate": 4.312725727639686e-05,
1173
- "epoch": 6.118546845124283,
1174
  "step": 3200
1175
  },
1176
  {
1177
- "loss": 6.6722,
1178
- "grad_norm": 6.9157185554504395,
1179
- "learning_rate": 4.270235818992989e-05,
1180
- "epoch": 6.15678776290631,
1181
  "step": 3220
1182
  },
1183
  {
1184
- "loss": 6.6397,
1185
- "grad_norm": 7.149935722351074,
1186
- "learning_rate": 4.227745910346293e-05,
1187
- "epoch": 6.195028680688337,
1188
  "step": 3240
1189
  },
1190
  {
1191
- "loss": 6.6041,
1192
- "grad_norm": 7.318164825439453,
1193
- "learning_rate": 4.185256001699597e-05,
1194
- "epoch": 6.233269598470363,
1195
  "step": 3260
1196
  },
1197
  {
1198
- "loss": 6.5492,
1199
- "grad_norm": 7.044018268585205,
1200
- "learning_rate": 4.1427660930529e-05,
1201
- "epoch": 6.27151051625239,
1202
  "step": 3280
1203
  },
1204
  {
1205
- "loss": 6.5679,
1206
- "grad_norm": 7.045164585113525,
1207
- "learning_rate": 4.1002761844062037e-05,
1208
- "epoch": 6.309751434034417,
1209
  "step": 3300
1210
  },
1211
  {
1212
- "loss": 6.5695,
1213
- "grad_norm": 7.092489242553711,
1214
- "learning_rate": 4.0577862757595076e-05,
1215
- "epoch": 6.347992351816444,
1216
  "step": 3320
1217
  },
1218
  {
1219
- "loss": 6.4842,
1220
- "grad_norm": 6.940147399902344,
1221
- "learning_rate": 4.015296367112811e-05,
1222
- "epoch": 6.3862332695984705,
1223
  "step": 3340
1224
  },
1225
  {
1226
- "loss": 6.5317,
1227
- "grad_norm": 7.10172176361084,
1228
- "learning_rate": 3.972806458466114e-05,
1229
- "epoch": 6.424474187380497,
1230
  "step": 3360
1231
  },
1232
  {
1233
- "loss": 6.4702,
1234
- "grad_norm": 7.129051208496094,
1235
- "learning_rate": 3.930316549819418e-05,
1236
- "epoch": 6.462715105162524,
1237
  "step": 3380
1238
  },
1239
  {
1240
- "loss": 6.3999,
1241
- "grad_norm": 7.501070499420166,
1242
- "learning_rate": 3.8878266411727215e-05,
1243
- "epoch": 6.500956022944551,
1244
  "step": 3400
1245
  },
1246
  {
1247
- "loss": 6.4932,
1248
- "grad_norm": 7.325244426727295,
1249
- "learning_rate": 3.8453367325260255e-05,
1250
- "epoch": 6.539196940726577,
1251
  "step": 3420
1252
  },
1253
  {
1254
- "loss": 6.3927,
1255
- "grad_norm": 7.361093521118164,
1256
- "learning_rate": 3.802846823879329e-05,
1257
- "epoch": 6.577437858508604,
1258
  "step": 3440
1259
  },
1260
  {
1261
- "loss": 6.4861,
1262
- "grad_norm": 7.228673458099365,
1263
- "learning_rate": 3.760356915232632e-05,
1264
- "epoch": 6.615678776290631,
 
 
 
 
 
 
 
 
 
1265
  "step": 3460
1266
  },
1267
  {
1268
- "loss": 6.4623,
1269
- "grad_norm": 7.602611064910889,
1270
- "learning_rate": 3.717867006585936e-05,
1271
- "epoch": 6.653919694072657,
1272
  "step": 3480
1273
  },
1274
  {
1275
- "loss": 6.4282,
1276
- "grad_norm": 7.901960372924805,
1277
- "learning_rate": 3.6753770979392394e-05,
1278
- "epoch": 6.692160611854685,
1279
  "step": 3500
1280
  },
1281
  {
1282
- "loss": 6.3799,
1283
- "grad_norm": 7.1125383377075195,
1284
- "learning_rate": 3.6328871892925434e-05,
1285
- "epoch": 6.730401529636711,
1286
  "step": 3520
1287
  },
1288
  {
1289
- "loss": 6.3707,
1290
- "grad_norm": 7.1385884284973145,
1291
- "learning_rate": 3.590397280645847e-05,
1292
- "epoch": 6.768642447418738,
1293
  "step": 3540
1294
  },
1295
  {
1296
- "loss": 6.4388,
1297
- "grad_norm": 7.548192977905273,
1298
- "learning_rate": 3.54790737199915e-05,
1299
- "epoch": 6.806883365200765,
1300
  "step": 3560
1301
  },
1302
  {
1303
- "loss": 6.4223,
1304
- "grad_norm": 7.492359161376953,
1305
- "learning_rate": 3.505417463352454e-05,
1306
- "epoch": 6.845124282982791,
1307
  "step": 3580
1308
  },
1309
  {
1310
- "loss": 6.3552,
1311
- "grad_norm": 7.575985431671143,
1312
- "learning_rate": 3.462927554705758e-05,
1313
- "epoch": 6.8833652007648185,
1314
  "step": 3600
1315
  },
1316
  {
1317
- "loss": 6.3379,
1318
- "grad_norm": 7.351112365722656,
1319
- "learning_rate": 3.4204376460590606e-05,
1320
- "epoch": 6.921606118546845,
1321
  "step": 3620
1322
  },
1323
  {
1324
- "loss": 6.3429,
1325
- "grad_norm": 7.33430290222168,
1326
- "learning_rate": 3.3779477374123646e-05,
1327
- "epoch": 6.959847036328872,
1328
  "step": 3640
1329
  },
1330
  {
1331
- "loss": 6.3112,
1332
- "grad_norm": 7.511825084686279,
1333
- "learning_rate": 3.3354578287656686e-05,
1334
- "epoch": 6.998087954110899,
1335
  "step": 3660
1336
  },
1337
  {
1338
- "eval_loss": 5.265278339385986,
1339
- "eval_accuracy": 0.7632021527077026,
1340
- "eval_runtime": 484.395,
1341
- "eval_samples_per_second": 30.688,
1342
- "eval_steps_per_second": 30.688,
1343
- "epoch": 7.0,
1344
- "step": 3661
1345
- },
1346
- {
1347
- "loss": 6.1764,
1348
- "grad_norm": 7.424711227416992,
1349
- "learning_rate": 3.292967920118972e-05,
1350
- "epoch": 7.036328871892925,
1351
  "step": 3680
1352
  },
1353
  {
1354
- "loss": 6.2389,
1355
- "grad_norm": 7.648799896240234,
1356
- "learning_rate": 3.250478011472275e-05,
1357
- "epoch": 7.074569789674952,
1358
  "step": 3700
1359
  },
1360
  {
1361
- "loss": 6.2506,
1362
- "grad_norm": 7.4450483322143555,
1363
- "learning_rate": 3.207988102825579e-05,
1364
- "epoch": 7.112810707456979,
1365
  "step": 3720
1366
  },
1367
  {
1368
- "loss": 6.2049,
1369
- "grad_norm": 7.422061443328857,
1370
- "learning_rate": 3.1654981941788825e-05,
1371
- "epoch": 7.151051625239006,
1372
  "step": 3740
1373
  },
1374
  {
1375
- "loss": 6.2906,
1376
- "grad_norm": 7.345204830169678,
1377
- "learning_rate": 3.1230082855321864e-05,
1378
- "epoch": 7.189292543021033,
1379
  "step": 3760
1380
  },
1381
  {
1382
- "loss": 6.2644,
1383
- "grad_norm": 7.486473083496094,
1384
- "learning_rate": 3.08051837688549e-05,
1385
- "epoch": 7.227533460803059,
1386
  "step": 3780
1387
  },
1388
  {
1389
- "loss": 6.2421,
1390
- "grad_norm": 7.317290782928467,
1391
- "learning_rate": 3.0380284682387934e-05,
1392
- "epoch": 7.265774378585086,
1393
  "step": 3800
1394
  },
1395
  {
1396
- "loss": 6.1406,
1397
- "grad_norm": 7.4384002685546875,
1398
- "learning_rate": 2.995538559592097e-05,
1399
- "epoch": 7.304015296367113,
1400
  "step": 3820
1401
  },
1402
  {
1403
- "loss": 6.2031,
1404
- "grad_norm": 7.7606000900268555,
1405
- "learning_rate": 2.9530486509454007e-05,
1406
- "epoch": 7.342256214149139,
1407
  "step": 3840
1408
  },
1409
  {
1410
- "loss": 6.127,
1411
- "grad_norm": 7.305050373077393,
1412
- "learning_rate": 2.910558742298704e-05,
1413
- "epoch": 7.3804971319311665,
1414
  "step": 3860
1415
  },
1416
  {
1417
- "loss": 6.1474,
1418
- "grad_norm": 7.713500022888184,
1419
- "learning_rate": 2.868068833652008e-05,
1420
- "epoch": 7.418738049713193,
1421
  "step": 3880
1422
  },
1423
  {
1424
- "loss": 6.1542,
1425
- "grad_norm": 8.028603553771973,
1426
- "learning_rate": 2.8255789250053116e-05,
1427
- "epoch": 7.45697896749522,
1428
  "step": 3900
1429
  },
1430
  {
1431
- "loss": 6.225,
1432
- "grad_norm": 7.4730329513549805,
1433
- "learning_rate": 2.783089016358615e-05,
1434
- "epoch": 7.495219885277247,
1435
  "step": 3920
1436
  },
1437
  {
1438
- "loss": 6.1674,
1439
- "grad_norm": 7.52304220199585,
1440
- "learning_rate": 2.7405991077119186e-05,
1441
- "epoch": 7.533460803059273,
1442
  "step": 3940
1443
  },
1444
  {
1445
- "loss": 6.1169,
1446
- "grad_norm": 7.616427898406982,
1447
- "learning_rate": 2.6981091990652225e-05,
1448
- "epoch": 7.5717017208413,
1449
  "step": 3960
1450
  },
1451
  {
1452
- "loss": 6.1041,
1453
- "grad_norm": 7.784472465515137,
1454
- "learning_rate": 2.6556192904185255e-05,
1455
- "epoch": 7.609942638623327,
1456
  "step": 3980
1457
  },
1458
  {
1459
- "loss": 6.1069,
1460
- "grad_norm": 7.819777011871338,
1461
- "learning_rate": 2.6131293817718295e-05,
1462
- "epoch": 7.648183556405353,
1463
  "step": 4000
1464
  },
1465
  {
1466
- "loss": 5.9985,
1467
- "grad_norm": 7.889120101928711,
1468
- "learning_rate": 2.5706394731251328e-05,
1469
- "epoch": 7.686424474187381,
1470
  "step": 4020
1471
  },
1472
  {
1473
- "loss": 6.0437,
1474
- "grad_norm": 7.858097076416016,
1475
- "learning_rate": 2.5281495644784364e-05,
1476
- "epoch": 7.724665391969407,
 
 
 
 
 
 
 
 
 
1477
  "step": 4040
1478
  },
1479
  {
1480
- "loss": 6.1376,
1481
- "grad_norm": 7.739562511444092,
1482
- "learning_rate": 2.48565965583174e-05,
1483
- "epoch": 7.762906309751434,
1484
  "step": 4060
1485
  },
1486
  {
1487
- "loss": 6.2084,
1488
- "grad_norm": 7.778552532196045,
1489
- "learning_rate": 2.4431697471850437e-05,
1490
- "epoch": 7.801147227533461,
1491
  "step": 4080
1492
  },
1493
  {
1494
- "loss": 6.0325,
1495
- "grad_norm": 7.536991596221924,
1496
- "learning_rate": 2.4006798385383474e-05,
1497
- "epoch": 7.839388145315487,
1498
  "step": 4100
1499
  },
1500
  {
1501
- "loss": 6.098,
1502
- "grad_norm": 7.846856594085693,
1503
- "learning_rate": 2.3581899298916507e-05,
1504
- "epoch": 7.8776290630975145,
1505
  "step": 4120
1506
  },
1507
  {
1508
- "loss": 5.9765,
1509
- "grad_norm": 7.760807991027832,
1510
- "learning_rate": 2.3157000212449547e-05,
1511
- "epoch": 7.915869980879541,
1512
  "step": 4140
1513
  },
1514
  {
1515
- "loss": 5.9915,
1516
- "grad_norm": 7.827345371246338,
1517
- "learning_rate": 2.273210112598258e-05,
1518
- "epoch": 7.954110898661568,
1519
  "step": 4160
1520
  },
1521
  {
1522
- "loss": 6.0255,
1523
- "grad_norm": 8.129748344421387,
1524
- "learning_rate": 2.2307202039515616e-05,
1525
- "epoch": 7.992351816443595,
1526
  "step": 4180
1527
  },
1528
  {
1529
- "eval_loss": 4.966301918029785,
1530
- "eval_accuracy": 0.782643794147326,
1531
- "eval_runtime": 260.149,
1532
- "eval_samples_per_second": 57.14,
1533
- "eval_steps_per_second": 57.14,
1534
- "epoch": 8.0,
1535
- "step": 4184
1536
- },
1537
- {
1538
- "loss": 6.0763,
1539
- "grad_norm": 7.686340808868408,
1540
- "learning_rate": 2.1882302953048652e-05,
1541
- "epoch": 8.030592734225621,
1542
  "step": 4200
1543
  },
1544
  {
1545
- "loss": 5.868,
1546
- "grad_norm": 7.666318893432617,
1547
- "learning_rate": 2.145740386658169e-05,
1548
- "epoch": 8.068833652007648,
1549
  "step": 4220
1550
  },
1551
  {
1552
- "loss": 5.8964,
1553
- "grad_norm": 7.686400890350342,
1554
- "learning_rate": 2.1032504780114722e-05,
1555
- "epoch": 8.107074569789676,
1556
  "step": 4240
1557
  },
1558
  {
1559
- "loss": 5.8408,
1560
- "grad_norm": 7.418490886688232,
1561
- "learning_rate": 2.0607605693647762e-05,
1562
- "epoch": 8.145315487571702,
1563
  "step": 4260
1564
  },
1565
  {
1566
- "loss": 5.9742,
1567
- "grad_norm": 7.769067287445068,
1568
- "learning_rate": 2.0182706607180795e-05,
1569
- "epoch": 8.183556405353729,
1570
  "step": 4280
1571
  },
1572
  {
1573
- "loss": 5.913,
1574
- "grad_norm": 7.915468215942383,
1575
- "learning_rate": 1.975780752071383e-05,
1576
- "epoch": 8.221797323135755,
1577
  "step": 4300
1578
  },
1579
  {
1580
- "loss": 5.8613,
1581
- "grad_norm": 7.884761810302734,
1582
- "learning_rate": 1.9332908434246868e-05,
1583
- "epoch": 8.260038240917781,
1584
  "step": 4320
1585
  },
1586
  {
1587
- "loss": 5.9791,
1588
- "grad_norm": 7.765011787414551,
1589
- "learning_rate": 1.8908009347779904e-05,
1590
- "epoch": 8.29827915869981,
1591
  "step": 4340
1592
  },
1593
  {
1594
- "loss": 5.9675,
1595
- "grad_norm": 8.110984802246094,
1596
- "learning_rate": 1.8483110261312937e-05,
1597
- "epoch": 8.336520076481836,
1598
  "step": 4360
1599
  },
1600
  {
1601
- "loss": 5.9804,
1602
- "grad_norm": 8.114306449890137,
1603
- "learning_rate": 1.8058211174845974e-05,
1604
- "epoch": 8.374760994263863,
1605
  "step": 4380
1606
  },
1607
  {
1608
- "loss": 5.8832,
1609
- "grad_norm": 7.981202125549316,
1610
- "learning_rate": 1.763331208837901e-05,
1611
- "epoch": 8.413001912045889,
1612
  "step": 4400
1613
  },
1614
  {
1615
- "loss": 5.9301,
1616
- "grad_norm": 7.628136157989502,
1617
- "learning_rate": 1.7208413001912046e-05,
1618
- "epoch": 8.451242829827915,
1619
  "step": 4420
1620
  },
1621
  {
1622
- "loss": 5.8983,
1623
- "grad_norm": 7.863382816314697,
1624
- "learning_rate": 1.6783513915445083e-05,
1625
- "epoch": 8.489483747609942,
1626
  "step": 4440
1627
  },
1628
  {
1629
- "loss": 5.8938,
1630
- "grad_norm": 7.82211971282959,
1631
- "learning_rate": 1.635861482897812e-05,
1632
- "epoch": 8.52772466539197,
1633
  "step": 4460
1634
  },
1635
  {
1636
- "loss": 5.8945,
1637
- "grad_norm": 8.038976669311523,
1638
- "learning_rate": 1.5933715742511156e-05,
1639
- "epoch": 8.565965583173996,
1640
  "step": 4480
1641
  },
1642
  {
1643
- "loss": 5.8895,
1644
- "grad_norm": 7.884932518005371,
1645
- "learning_rate": 1.550881665604419e-05,
1646
- "epoch": 8.604206500956023,
1647
  "step": 4500
1648
  },
1649
  {
1650
- "loss": 5.9617,
1651
- "grad_norm": 7.975419521331787,
1652
- "learning_rate": 1.5083917569577227e-05,
1653
- "epoch": 8.64244741873805,
1654
  "step": 4520
1655
  },
1656
  {
1657
- "loss": 5.8659,
1658
- "grad_norm": 7.786068916320801,
1659
- "learning_rate": 1.4659018483110262e-05,
1660
- "epoch": 8.680688336520076,
1661
  "step": 4540
1662
  },
1663
  {
1664
- "loss": 5.9116,
1665
- "grad_norm": 8.130301475524902,
1666
- "learning_rate": 1.4234119396643298e-05,
1667
- "epoch": 8.718929254302104,
1668
  "step": 4560
1669
  },
1670
  {
1671
- "loss": 5.8536,
1672
- "grad_norm": 8.042682647705078,
1673
- "learning_rate": 1.3809220310176335e-05,
1674
- "epoch": 8.75717017208413,
1675
  "step": 4580
1676
  },
1677
  {
1678
- "loss": 5.9241,
1679
- "grad_norm": 8.327803611755371,
1680
- "learning_rate": 1.3384321223709371e-05,
1681
- "epoch": 8.795411089866157,
1682
  "step": 4600
1683
  },
1684
  {
1685
- "loss": 5.864,
1686
- "grad_norm": 7.880401134490967,
1687
- "learning_rate": 1.2959422137242406e-05,
1688
- "epoch": 8.833652007648183,
 
 
 
 
 
 
 
 
 
1689
  "step": 4620
1690
  },
1691
  {
1692
- "loss": 5.9457,
1693
- "grad_norm": 7.6825127601623535,
1694
- "learning_rate": 1.253452305077544e-05,
1695
- "epoch": 8.87189292543021,
1696
  "step": 4640
1697
  },
1698
  {
1699
- "loss": 5.8329,
1700
- "grad_norm": 7.971193313598633,
1701
- "learning_rate": 1.2109623964308479e-05,
1702
- "epoch": 8.910133843212238,
1703
  "step": 4660
1704
  },
1705
  {
1706
- "loss": 5.8671,
1707
- "grad_norm": 8.04354476928711,
1708
- "learning_rate": 1.1684724877841513e-05,
1709
- "epoch": 8.948374760994264,
1710
  "step": 4680
1711
  },
1712
  {
1713
- "loss": 5.8091,
1714
- "grad_norm": 7.942180633544922,
1715
- "learning_rate": 1.125982579137455e-05,
1716
- "epoch": 8.98661567877629,
1717
  "step": 4700
1718
  },
1719
  {
1720
- "eval_loss": 4.778744220733643,
1721
- "eval_accuracy": 0.7956945845946855,
1722
- "eval_runtime": 531.1827,
1723
- "eval_samples_per_second": 27.985,
1724
- "eval_steps_per_second": 27.985,
1725
- "epoch": 9.0,
1726
- "step": 4707
1727
- },
1728
- {
1729
- "loss": 5.7978,
1730
- "grad_norm": 7.77038049697876,
1731
- "learning_rate": 1.0834926704907584e-05,
1732
- "epoch": 9.024856596558317,
1733
  "step": 4720
1734
  },
1735
  {
1736
- "loss": 5.7849,
1737
- "grad_norm": 7.850288391113281,
1738
- "learning_rate": 1.0410027618440621e-05,
1739
- "epoch": 9.063097514340344,
1740
  "step": 4740
1741
  },
1742
  {
1743
- "loss": 5.7891,
1744
- "grad_norm": 8.032878875732422,
1745
- "learning_rate": 9.985128531973657e-06,
1746
- "epoch": 9.101338432122372,
1747
  "step": 4760
1748
  },
1749
  {
1750
- "loss": 5.781,
1751
- "grad_norm": 7.886658668518066,
1752
- "learning_rate": 9.560229445506692e-06,
1753
- "epoch": 9.139579349904398,
1754
  "step": 4780
1755
  },
1756
  {
1757
- "loss": 5.8584,
1758
- "grad_norm": 7.953343868255615,
1759
- "learning_rate": 9.135330359039729e-06,
1760
- "epoch": 9.177820267686425,
1761
  "step": 4800
1762
  },
1763
  {
1764
- "loss": 5.8192,
1765
- "grad_norm": 7.899537563323975,
1766
- "learning_rate": 8.710431272572763e-06,
1767
- "epoch": 9.216061185468451,
1768
  "step": 4820
1769
  },
1770
  {
1771
- "loss": 5.7122,
1772
- "grad_norm": 8.269824028015137,
1773
- "learning_rate": 8.2855321861058e-06,
1774
- "epoch": 9.254302103250478,
1775
  "step": 4840
1776
  },
1777
  {
1778
- "loss": 5.7634,
1779
- "grad_norm": 7.824770450592041,
1780
- "learning_rate": 7.860633099638836e-06,
1781
- "epoch": 9.292543021032504,
1782
  "step": 4860
1783
  },
1784
  {
1785
- "loss": 5.8083,
1786
- "grad_norm": 7.953860759735107,
1787
- "learning_rate": 7.435734013171872e-06,
1788
- "epoch": 9.330783938814532,
1789
  "step": 4880
1790
  },
1791
  {
1792
- "loss": 5.8012,
1793
- "grad_norm": 8.25514030456543,
1794
- "learning_rate": 7.010834926704908e-06,
1795
- "epoch": 9.369024856596559,
1796
  "step": 4900
1797
  },
1798
  {
1799
- "loss": 5.7938,
1800
- "grad_norm": 8.2761869430542,
1801
- "learning_rate": 6.585935840237943e-06,
1802
- "epoch": 9.407265774378585,
1803
  "step": 4920
1804
  },
1805
  {
1806
- "loss": 5.6735,
1807
- "grad_norm": 7.865163803100586,
1808
- "learning_rate": 6.161036753770979e-06,
1809
- "epoch": 9.445506692160611,
1810
  "step": 4940
1811
  },
1812
  {
1813
- "loss": 5.7914,
1814
- "grad_norm": 8.172937393188477,
1815
- "learning_rate": 5.736137667304015e-06,
1816
- "epoch": 9.483747609942638,
1817
  "step": 4960
1818
  },
1819
  {
1820
- "loss": 5.7702,
1821
- "grad_norm": 8.558911323547363,
1822
- "learning_rate": 5.311238580837051e-06,
1823
- "epoch": 9.521988527724666,
1824
  "step": 4980
1825
  },
1826
  {
1827
- "loss": 5.7283,
1828
- "grad_norm": 8.265515327453613,
1829
- "learning_rate": 4.886339494370088e-06,
1830
- "epoch": 9.560229445506693,
1831
  "step": 5000
1832
  },
1833
  {
1834
- "loss": 5.8007,
1835
- "grad_norm": 8.17795467376709,
1836
- "learning_rate": 4.461440407903123e-06,
1837
- "epoch": 9.598470363288719,
1838
  "step": 5020
1839
  },
1840
  {
1841
- "loss": 5.8121,
1842
- "grad_norm": 8.109586715698242,
1843
- "learning_rate": 4.036541321436159e-06,
1844
- "epoch": 9.636711281070745,
1845
  "step": 5040
1846
  },
1847
  {
1848
- "loss": 5.789,
1849
- "grad_norm": 7.911646842956543,
1850
- "learning_rate": 3.6116422349691954e-06,
1851
- "epoch": 9.674952198852772,
1852
  "step": 5060
1853
  },
1854
  {
1855
- "loss": 5.7266,
1856
- "grad_norm": 8.030941009521484,
1857
- "learning_rate": 3.186743148502231e-06,
1858
- "epoch": 9.7131931166348,
1859
  "step": 5080
1860
  },
1861
  {
1862
- "loss": 5.761,
1863
- "grad_norm": 8.059958457946777,
1864
- "learning_rate": 2.7618440620352666e-06,
1865
- "epoch": 9.751434034416826,
1866
  "step": 5100
1867
  },
1868
  {
1869
- "loss": 5.7338,
1870
- "grad_norm": 8.002403259277344,
1871
- "learning_rate": 2.3369449755683026e-06,
1872
- "epoch": 9.789674952198853,
1873
  "step": 5120
1874
  },
1875
  {
1876
- "loss": 5.7088,
1877
- "grad_norm": 8.306962966918945,
1878
- "learning_rate": 1.9120458891013386e-06,
1879
- "epoch": 9.82791586998088,
1880
  "step": 5140
1881
  },
1882
  {
1883
- "loss": 5.6973,
1884
- "grad_norm": 8.018095970153809,
1885
- "learning_rate": 1.4871468026343744e-06,
1886
- "epoch": 9.866156787762906,
1887
  "step": 5160
1888
  },
1889
  {
1890
- "loss": 5.6422,
1891
- "grad_norm": 8.168917655944824,
1892
- "learning_rate": 1.0622477161674104e-06,
1893
- "epoch": 9.904397705544934,
 
 
 
 
 
 
 
 
 
1894
  "step": 5180
1895
  },
1896
  {
1897
- "loss": 5.7399,
1898
- "grad_norm": 7.939206123352051,
1899
- "learning_rate": 6.373486297004462e-07,
1900
- "epoch": 9.94263862332696,
1901
  "step": 5200
1902
  },
1903
  {
1904
- "loss": 5.7269,
1905
- "grad_norm": 7.970940589904785,
1906
- "learning_rate": 2.1244954323348205e-07,
1907
- "epoch": 9.980879541108987,
1908
  "step": 5220
1909
  },
1910
  {
1911
- "eval_loss": 4.700281620025635,
1912
- "eval_accuracy": 0.8030272452068618,
1913
- "eval_runtime": 552.5641,
1914
- "eval_samples_per_second": 26.902,
1915
- "eval_steps_per_second": 26.902,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1916
  "epoch": 10.0,
1917
- "step": 5230
1918
  },
1919
  {
1920
- "train_runtime": 28748.4048,
1921
- "train_samples_per_second": 46.534,
1922
- "train_steps_per_second": 0.182,
1923
- "total_flos": 2.49073133395968e+18,
1924
- "train_loss": 7.888943860726876,
1925
  "epoch": 10.0,
1926
- "step": 5230
1927
  }
1928
  ]
 
1
  [
2
  {
3
+ "loss": 13.2026,
4
+ "grad_norm": 6.155358791351318,
5
+ "learning_rate": 1.739130434782609e-05,
6
+ "epoch": 0.034782608695652174,
7
  "step": 20
8
  },
9
  {
10
+ "loss": 13.1252,
11
+ "grad_norm": 5.816741943359375,
12
+ "learning_rate": 3.478260869565218e-05,
13
+ "epoch": 0.06956521739130435,
14
  "step": 40
15
  },
16
  {
17
+ "loss": 13.0001,
18
+ "grad_norm": 5.273156642913818,
19
+ "learning_rate": 5.2173913043478256e-05,
20
+ "epoch": 0.10434782608695652,
21
  "step": 60
22
  },
23
  {
24
+ "loss": 12.8639,
25
+ "grad_norm": 4.86655330657959,
26
+ "learning_rate": 6.956521739130436e-05,
27
+ "epoch": 0.1391304347826087,
28
  "step": 80
29
  },
30
  {
31
+ "loss": 12.7376,
32
+ "grad_norm": 4.438321113586426,
33
+ "learning_rate": 8.695652173913044e-05,
34
+ "epoch": 0.17391304347826086,
35
  "step": 100
36
  },
37
  {
38
+ "loss": 12.5722,
39
+ "grad_norm": 4.164404392242432,
40
+ "learning_rate": 0.00010434782608695651,
41
+ "epoch": 0.20869565217391303,
42
  "step": 120
43
  },
44
  {
45
+ "loss": 12.4229,
46
+ "grad_norm": 3.858990430831909,
47
+ "learning_rate": 0.00012173913043478261,
48
+ "epoch": 0.24347826086956523,
49
  "step": 140
50
  },
51
  {
52
+ "loss": 12.2581,
53
+ "grad_norm": 3.6574394702911377,
54
+ "learning_rate": 0.0001391304347826087,
55
+ "epoch": 0.2782608695652174,
56
  "step": 160
57
  },
58
  {
59
+ "loss": 12.0753,
60
+ "grad_norm": 3.3787951469421387,
61
+ "learning_rate": 0.0001565217391304348,
62
+ "epoch": 0.3130434782608696,
63
  "step": 180
64
  },
65
  {
66
+ "loss": 11.9261,
67
+ "grad_norm": 3.323820114135742,
68
+ "learning_rate": 0.00017391304347826088,
69
+ "epoch": 0.34782608695652173,
70
  "step": 200
71
  },
72
  {
73
+ "loss": 11.7417,
74
+ "grad_norm": 3.247619152069092,
75
+ "learning_rate": 0.00019130434782608697,
76
+ "epoch": 0.3826086956521739,
77
  "step": 220
78
  },
79
  {
80
+ "loss": 11.5771,
81
+ "grad_norm": 3.2254152297973633,
82
+ "learning_rate": 0.00020869565217391303,
83
+ "epoch": 0.41739130434782606,
84
  "step": 240
85
  },
86
  {
87
+ "loss": 11.3969,
88
+ "grad_norm": 3.1803464889526367,
89
+ "learning_rate": 0.00022608695652173914,
90
+ "epoch": 0.45217391304347826,
91
  "step": 260
92
  },
93
  {
94
+ "loss": 11.2684,
95
+ "grad_norm": 3.41034197807312,
96
+ "learning_rate": 0.00024347826086956522,
97
+ "epoch": 0.48695652173913045,
98
  "step": 280
99
  },
100
  {
101
+ "loss": 11.0744,
102
+ "grad_norm": 3.246403217315674,
103
+ "learning_rate": 0.0002608695652173913,
104
+ "epoch": 0.5217391304347826,
105
  "step": 300
106
  },
107
  {
108
+ "loss": 10.8929,
109
+ "grad_norm": 3.202021360397339,
110
+ "learning_rate": 0.0002782608695652174,
111
+ "epoch": 0.5565217391304348,
112
  "step": 320
113
  },
114
  {
115
+ "loss": 10.7468,
116
+ "grad_norm": 3.1231367588043213,
117
+ "learning_rate": 0.0002956521739130435,
118
+ "epoch": 0.591304347826087,
119
  "step": 340
120
  },
121
  {
122
+ "loss": 10.606,
123
+ "grad_norm": 3.1820390224456787,
124
+ "learning_rate": 0.0003130434782608696,
125
+ "epoch": 0.6260869565217392,
126
  "step": 360
127
  },
128
  {
129
+ "loss": 10.4871,
130
+ "grad_norm": 3.2470555305480957,
131
+ "learning_rate": 0.0003304347826086956,
132
+ "epoch": 0.6608695652173913,
133
  "step": 380
134
  },
135
  {
136
+ "loss": 10.2836,
137
+ "grad_norm": 3.2452709674835205,
138
+ "learning_rate": 0.00034782608695652176,
139
+ "epoch": 0.6956521739130435,
140
  "step": 400
141
  },
142
  {
143
+ "loss": 10.1154,
144
+ "grad_norm": 3.203894853591919,
145
+ "learning_rate": 0.00036521739130434785,
146
+ "epoch": 0.7304347826086957,
147
  "step": 420
148
  },
149
  {
150
+ "loss": 9.9283,
151
+ "grad_norm": 3.269970178604126,
152
+ "learning_rate": 0.00038260869565217393,
153
+ "epoch": 0.7652173913043478,
154
  "step": 440
155
  },
156
  {
157
+ "loss": 9.8674,
158
+ "grad_norm": 3.261357545852661,
159
+ "learning_rate": 0.0004,
160
+ "epoch": 0.8,
161
  "step": 460
162
  },
163
  {
164
+ "loss": 9.6224,
165
+ "grad_norm": 3.393953323364258,
166
+ "learning_rate": 0.00041739130434782605,
167
+ "epoch": 0.8347826086956521,
168
  "step": 480
169
  },
170
  {
171
+ "loss": 9.524,
172
+ "grad_norm": 3.321411609649658,
173
+ "learning_rate": 0.0004347826086956522,
174
+ "epoch": 0.8695652173913043,
175
  "step": 500
176
  },
177
  {
178
+ "loss": 9.384,
179
+ "grad_norm": 3.3886823654174805,
180
+ "learning_rate": 0.0004521739130434783,
181
+ "epoch": 0.9043478260869565,
182
  "step": 520
183
  },
184
  {
185
+ "loss": 9.1767,
186
+ "grad_norm": 3.4735491275787354,
187
+ "learning_rate": 0.00046956521739130436,
188
+ "epoch": 0.9391304347826087,
 
 
 
 
 
 
 
 
 
189
  "step": 540
190
  },
191
  {
192
+ "loss": 9.047,
193
+ "grad_norm": 3.416966676712036,
194
+ "learning_rate": 0.00048695652173913045,
195
+ "epoch": 0.9739130434782609,
196
  "step": 560
197
  },
198
  {
199
+ "eval_loss": 8.366157531738281,
200
+ "eval_accuracy": 0.43039677202420984,
201
+ "eval_runtime": 42.3364,
202
+ "eval_samples_per_second": 35.123,
203
+ "eval_steps_per_second": 35.123,
204
+ "epoch": 1.0,
205
+ "step": 575
206
+ },
207
+ {
208
+ "loss": 8.8835,
209
+ "grad_norm": 3.446899890899658,
210
+ "learning_rate": 0.0004995169082125604,
211
+ "epoch": 1.008695652173913,
212
  "step": 580
213
  },
214
  {
215
+ "loss": 8.6436,
216
+ "grad_norm": 3.5842247009277344,
217
+ "learning_rate": 0.0004975845410628019,
218
+ "epoch": 1.0434782608695652,
219
  "step": 600
220
  },
221
  {
222
+ "loss": 8.4775,
223
+ "grad_norm": 3.5029306411743164,
224
+ "learning_rate": 0.0004956521739130435,
225
+ "epoch": 1.0782608695652174,
226
  "step": 620
227
  },
228
  {
229
+ "loss": 8.322,
230
+ "grad_norm": 3.5451033115386963,
231
+ "learning_rate": 0.0004937198067632851,
232
+ "epoch": 1.1130434782608696,
233
  "step": 640
234
  },
235
  {
236
+ "loss": 8.1264,
237
+ "grad_norm": 3.5502634048461914,
238
+ "learning_rate": 0.0004917874396135266,
239
+ "epoch": 1.1478260869565218,
240
  "step": 660
241
  },
242
  {
243
+ "loss": 7.9905,
244
+ "grad_norm": 3.607395648956299,
245
+ "learning_rate": 0.0004898550724637681,
246
+ "epoch": 1.182608695652174,
247
  "step": 680
248
  },
249
  {
250
+ "loss": 7.8252,
251
+ "grad_norm": 3.6438565254211426,
252
+ "learning_rate": 0.0004879227053140097,
253
+ "epoch": 1.2173913043478262,
254
  "step": 700
255
  },
256
  {
257
+ "loss": 7.7737,
258
+ "grad_norm": 3.656705141067505,
259
+ "learning_rate": 0.0004859903381642512,
260
+ "epoch": 1.2521739130434781,
261
  "step": 720
262
  },
263
  {
264
+ "loss": 7.5822,
265
+ "grad_norm": 3.7424328327178955,
266
+ "learning_rate": 0.0004840579710144928,
267
+ "epoch": 1.2869565217391306,
268
  "step": 740
269
  },
270
  {
271
+ "loss": 7.4563,
272
+ "grad_norm": 3.673156261444092,
273
+ "learning_rate": 0.0004821256038647343,
274
+ "epoch": 1.3217391304347825,
275
  "step": 760
276
  },
277
  {
278
+ "loss": 7.3379,
279
+ "grad_norm": 3.6774067878723145,
280
+ "learning_rate": 0.0004801932367149758,
281
+ "epoch": 1.3565217391304347,
282
  "step": 780
283
  },
284
  {
285
+ "loss": 7.1559,
286
+ "grad_norm": 3.811283826828003,
287
+ "learning_rate": 0.0004782608695652174,
288
+ "epoch": 1.391304347826087,
289
  "step": 800
290
  },
291
  {
292
+ "loss": 7.0834,
293
+ "grad_norm": 3.7899839878082275,
294
+ "learning_rate": 0.00047632850241545894,
295
+ "epoch": 1.4260869565217391,
296
  "step": 820
297
  },
298
  {
299
+ "loss": 6.9172,
300
+ "grad_norm": 3.583247423171997,
301
+ "learning_rate": 0.00047439613526570047,
302
+ "epoch": 1.4608695652173913,
303
  "step": 840
304
  },
305
  {
306
+ "loss": 6.7251,
307
+ "grad_norm": 3.8192331790924072,
308
+ "learning_rate": 0.00047246376811594206,
309
+ "epoch": 1.4956521739130435,
310
  "step": 860
311
  },
312
  {
313
+ "loss": 6.7871,
314
+ "grad_norm": 3.8098299503326416,
315
+ "learning_rate": 0.0004705314009661836,
316
+ "epoch": 1.5304347826086957,
317
  "step": 880
318
  },
319
  {
320
+ "loss": 6.6103,
321
+ "grad_norm": 3.7341325283050537,
322
+ "learning_rate": 0.0004685990338164252,
323
+ "epoch": 1.5652173913043477,
324
  "step": 900
325
  },
326
  {
327
+ "loss": 6.4507,
328
+ "grad_norm": 3.9190495014190674,
329
+ "learning_rate": 0.00046666666666666666,
330
+ "epoch": 1.6,
331
  "step": 920
332
  },
333
  {
334
+ "loss": 6.3619,
335
+ "grad_norm": 3.9456422328948975,
336
+ "learning_rate": 0.0004647342995169082,
337
+ "epoch": 1.634782608695652,
338
  "step": 940
339
  },
340
  {
341
+ "loss": 6.2957,
342
+ "grad_norm": 3.899134874343872,
343
+ "learning_rate": 0.0004628019323671498,
344
+ "epoch": 1.6695652173913045,
345
  "step": 960
346
  },
347
  {
348
+ "loss": 6.1362,
349
+ "grad_norm": 3.878810167312622,
350
+ "learning_rate": 0.0004608695652173913,
351
+ "epoch": 1.7043478260869565,
352
  "step": 980
353
  },
354
  {
355
+ "loss": 5.9814,
356
+ "grad_norm": 3.9270784854888916,
357
+ "learning_rate": 0.00045893719806763285,
358
+ "epoch": 1.7391304347826086,
359
  "step": 1000
360
  },
361
  {
362
+ "loss": 5.9095,
363
+ "grad_norm": 3.8247644901275635,
364
+ "learning_rate": 0.00045700483091787444,
365
+ "epoch": 1.7739130434782608,
366
  "step": 1020
367
  },
368
  {
369
+ "loss": 5.7793,
370
+ "grad_norm": 3.8870134353637695,
371
+ "learning_rate": 0.000455072463768116,
372
+ "epoch": 1.808695652173913,
373
  "step": 1040
374
  },
375
  {
376
+ "loss": 5.7754,
377
+ "grad_norm": 3.9533441066741943,
378
+ "learning_rate": 0.00045314009661835745,
379
+ "epoch": 1.8434782608695652,
 
 
 
 
 
 
 
 
 
380
  "step": 1060
381
  },
382
  {
383
+ "loss": 5.5886,
384
+ "grad_norm": 3.9928998947143555,
385
+ "learning_rate": 0.00045120772946859904,
386
+ "epoch": 1.8782608695652174,
387
  "step": 1080
388
  },
389
  {
390
+ "loss": 5.5482,
391
+ "grad_norm": 4.030064582824707,
392
+ "learning_rate": 0.0004492753623188406,
393
+ "epoch": 1.9130434782608696,
394
  "step": 1100
395
  },
396
  {
397
+ "loss": 5.4807,
398
+ "grad_norm": 3.961806297302246,
399
+ "learning_rate": 0.0004473429951690821,
400
+ "epoch": 1.9478260869565216,
401
  "step": 1120
402
  },
403
  {
404
+ "loss": 5.3508,
405
+ "grad_norm": 4.003119945526123,
406
+ "learning_rate": 0.0004454106280193237,
407
+ "epoch": 1.982608695652174,
408
  "step": 1140
409
  },
410
  {
411
+ "eval_loss": 4.025164604187012,
412
+ "eval_accuracy": 0.8190988567585743,
413
+ "eval_runtime": 42.7144,
414
+ "eval_samples_per_second": 34.813,
415
+ "eval_steps_per_second": 34.813,
416
+ "epoch": 2.0,
417
+ "step": 1150
418
+ },
419
+ {
420
+ "loss": 5.1229,
421
+ "grad_norm": 3.958116292953491,
422
+ "learning_rate": 0.00044347826086956523,
423
+ "epoch": 2.017391304347826,
424
  "step": 1160
425
  },
426
  {
427
+ "loss": 4.8146,
428
+ "grad_norm": 3.864279270172119,
429
+ "learning_rate": 0.00044154589371980677,
430
+ "epoch": 2.0521739130434784,
431
  "step": 1180
432
  },
433
  {
434
+ "loss": 4.8843,
435
+ "grad_norm": 4.045077323913574,
436
+ "learning_rate": 0.0004396135265700483,
437
+ "epoch": 2.0869565217391304,
438
  "step": 1200
439
  },
440
  {
441
+ "loss": 4.8078,
442
+ "grad_norm": 4.061978816986084,
443
+ "learning_rate": 0.00043768115942028983,
444
+ "epoch": 2.121739130434783,
445
  "step": 1220
446
  },
447
  {
448
+ "loss": 4.6812,
449
+ "grad_norm": 4.040159225463867,
450
+ "learning_rate": 0.0004357487922705314,
451
+ "epoch": 2.1565217391304348,
452
  "step": 1240
453
  },
454
  {
455
+ "loss": 4.6701,
456
+ "grad_norm": 4.234623908996582,
457
+ "learning_rate": 0.00043381642512077296,
458
+ "epoch": 2.1913043478260867,
459
  "step": 1260
460
  },
461
  {
462
+ "loss": 4.6221,
463
+ "grad_norm": 4.030038356781006,
464
+ "learning_rate": 0.0004318840579710145,
465
+ "epoch": 2.226086956521739,
466
  "step": 1280
467
  },
468
  {
469
+ "loss": 4.5647,
470
+ "grad_norm": 3.9954497814178467,
471
+ "learning_rate": 0.0004299516908212561,
472
+ "epoch": 2.260869565217391,
473
  "step": 1300
474
  },
475
  {
476
+ "loss": 4.4502,
477
+ "grad_norm": 4.188636779785156,
478
+ "learning_rate": 0.0004280193236714976,
479
+ "epoch": 2.2956521739130435,
480
  "step": 1320
481
  },
482
  {
483
+ "loss": 4.359,
484
+ "grad_norm": 4.185456275939941,
485
+ "learning_rate": 0.00042608695652173915,
486
+ "epoch": 2.3304347826086955,
487
  "step": 1340
488
  },
489
  {
490
+ "loss": 4.2863,
491
+ "grad_norm": 4.123263359069824,
492
+ "learning_rate": 0.0004241545893719807,
493
+ "epoch": 2.365217391304348,
494
  "step": 1360
495
  },
496
  {
497
+ "loss": 4.3354,
498
+ "grad_norm": 4.194387435913086,
499
+ "learning_rate": 0.0004222222222222222,
500
+ "epoch": 2.4,
501
  "step": 1380
502
  },
503
  {
504
+ "loss": 4.2176,
505
+ "grad_norm": 4.065763473510742,
506
+ "learning_rate": 0.00042028985507246375,
507
+ "epoch": 2.4347826086956523,
508
  "step": 1400
509
  },
510
  {
511
+ "loss": 4.0597,
512
+ "grad_norm": 4.120363712310791,
513
+ "learning_rate": 0.00041835748792270534,
514
+ "epoch": 2.4695652173913043,
515
  "step": 1420
516
  },
517
  {
518
+ "loss": 4.028,
519
+ "grad_norm": 4.3197174072265625,
520
+ "learning_rate": 0.00041642512077294687,
521
+ "epoch": 2.5043478260869563,
522
  "step": 1440
523
  },
524
  {
525
+ "loss": 3.9833,
526
+ "grad_norm": 4.2683610916137695,
527
+ "learning_rate": 0.0004144927536231884,
528
+ "epoch": 2.5391304347826087,
529
  "step": 1460
530
  },
531
  {
532
+ "loss": 4.0065,
533
+ "grad_norm": 4.15448522567749,
534
+ "learning_rate": 0.00041256038647343,
535
+ "epoch": 2.573913043478261,
536
  "step": 1480
537
  },
538
  {
539
+ "loss": 3.8134,
540
+ "grad_norm": 4.348177433013916,
541
+ "learning_rate": 0.0004106280193236715,
542
+ "epoch": 2.608695652173913,
543
  "step": 1500
544
  },
545
  {
546
+ "loss": 3.8548,
547
+ "grad_norm": 4.100021839141846,
548
+ "learning_rate": 0.00040869565217391306,
549
+ "epoch": 2.643478260869565,
550
  "step": 1520
551
  },
552
  {
553
+ "loss": 3.7814,
554
+ "grad_norm": 4.344174385070801,
555
+ "learning_rate": 0.0004067632850241546,
556
+ "epoch": 2.6782608695652175,
557
  "step": 1540
558
  },
559
  {
560
+ "loss": 3.7578,
561
+ "grad_norm": 4.240079402923584,
562
+ "learning_rate": 0.00040483091787439613,
563
+ "epoch": 2.7130434782608694,
564
  "step": 1560
565
  },
566
  {
567
+ "loss": 3.7331,
568
+ "grad_norm": 4.468689918518066,
569
+ "learning_rate": 0.0004028985507246377,
570
+ "epoch": 2.747826086956522,
 
 
 
 
 
 
 
 
 
571
  "step": 1580
572
  },
573
  {
574
+ "loss": 3.6396,
575
+ "grad_norm": 4.28464937210083,
576
+ "learning_rate": 0.00040096618357487925,
577
+ "epoch": 2.782608695652174,
578
  "step": 1600
579
  },
580
  {
581
+ "loss": 3.5799,
582
+ "grad_norm": 4.166805744171143,
583
+ "learning_rate": 0.0003990338164251208,
584
+ "epoch": 2.8173913043478263,
585
  "step": 1620
586
  },
587
  {
588
+ "loss": 3.4734,
589
+ "grad_norm": 4.237683296203613,
590
+ "learning_rate": 0.0003971014492753624,
591
+ "epoch": 2.8521739130434782,
592
  "step": 1640
593
  },
594
  {
595
+ "loss": 3.5183,
596
+ "grad_norm": 4.153097152709961,
597
+ "learning_rate": 0.00039516908212560385,
598
+ "epoch": 2.8869565217391306,
599
  "step": 1660
600
  },
601
  {
602
+ "loss": 3.3963,
603
+ "grad_norm": 4.2313947677612305,
604
+ "learning_rate": 0.0003932367149758454,
605
+ "epoch": 2.9217391304347826,
606
  "step": 1680
607
  },
608
  {
609
+ "loss": 3.3081,
610
+ "grad_norm": 3.992475748062134,
611
+ "learning_rate": 0.000391304347826087,
612
+ "epoch": 2.9565217391304346,
613
  "step": 1700
614
  },
615
  {
616
+ "loss": 3.3124,
617
+ "grad_norm": 4.4731059074401855,
618
+ "learning_rate": 0.0003893719806763285,
619
+ "epoch": 2.991304347826087,
620
  "step": 1720
621
  },
622
  {
623
+ "eval_loss": 2.1082653999328613,
624
+ "eval_accuracy": 0.9260255548083389,
625
+ "eval_runtime": 22.1676,
626
+ "eval_samples_per_second": 67.08,
627
+ "eval_steps_per_second": 67.08,
628
+ "epoch": 3.0,
629
+ "step": 1725
630
+ },
631
+ {
632
+ "loss": 3.1247,
633
+ "grad_norm": 4.272000312805176,
634
+ "learning_rate": 0.00038743961352657004,
635
+ "epoch": 3.026086956521739,
636
  "step": 1740
637
  },
638
  {
639
+ "loss": 3.1064,
640
+ "grad_norm": 4.102330207824707,
641
+ "learning_rate": 0.00038550724637681163,
642
+ "epoch": 3.0608695652173914,
643
  "step": 1760
644
  },
645
  {
646
+ "loss": 2.9371,
647
+ "grad_norm": 4.381846904754639,
648
+ "learning_rate": 0.00038357487922705317,
649
+ "epoch": 3.0956521739130434,
650
  "step": 1780
651
  },
652
  {
653
+ "loss": 2.9355,
654
+ "grad_norm": 4.1588921546936035,
655
+ "learning_rate": 0.00038164251207729465,
656
+ "epoch": 3.130434782608696,
657
  "step": 1800
658
  },
659
  {
660
+ "loss": 2.8545,
661
+ "grad_norm": 4.279609203338623,
662
+ "learning_rate": 0.00037971014492753623,
663
+ "epoch": 3.1652173913043478,
664
  "step": 1820
665
  },
666
  {
667
+ "loss": 2.8096,
668
+ "grad_norm": 4.240756988525391,
669
+ "learning_rate": 0.00037777777777777777,
670
+ "epoch": 3.2,
671
  "step": 1840
672
  },
673
  {
674
+ "loss": 2.8138,
675
+ "grad_norm": 4.11091947555542,
676
+ "learning_rate": 0.00037584541062801936,
677
+ "epoch": 3.234782608695652,
678
  "step": 1860
679
  },
680
  {
681
+ "loss": 2.7417,
682
+ "grad_norm": 4.078794479370117,
683
+ "learning_rate": 0.0003739130434782609,
684
+ "epoch": 3.269565217391304,
685
  "step": 1880
686
  },
687
  {
688
+ "loss": 2.7937,
689
+ "grad_norm": 4.368116855621338,
690
+ "learning_rate": 0.0003719806763285024,
691
+ "epoch": 3.3043478260869565,
692
  "step": 1900
693
  },
694
  {
695
+ "loss": 2.7361,
696
+ "grad_norm": 4.044319152832031,
697
+ "learning_rate": 0.000370048309178744,
698
+ "epoch": 3.3391304347826085,
699
  "step": 1920
700
  },
701
  {
702
+ "loss": 2.7054,
703
+ "grad_norm": 4.314040184020996,
704
+ "learning_rate": 0.0003681159420289855,
705
+ "epoch": 3.373913043478261,
706
  "step": 1940
707
  },
708
  {
709
+ "loss": 2.6682,
710
+ "grad_norm": 4.185855388641357,
711
+ "learning_rate": 0.000366183574879227,
712
+ "epoch": 3.408695652173913,
713
  "step": 1960
714
  },
715
  {
716
+ "loss": 2.6644,
717
+ "grad_norm": 4.433622360229492,
718
+ "learning_rate": 0.0003642512077294686,
719
+ "epoch": 3.4434782608695653,
720
  "step": 1980
721
  },
722
  {
723
+ "loss": 2.618,
724
+ "grad_norm": 4.048947811126709,
725
+ "learning_rate": 0.00036231884057971015,
726
+ "epoch": 3.4782608695652173,
727
  "step": 2000
728
  },
729
  {
730
+ "loss": 2.5982,
731
+ "grad_norm": 4.145406246185303,
732
+ "learning_rate": 0.0003603864734299517,
733
+ "epoch": 3.5130434782608697,
734
  "step": 2020
735
  },
736
  {
737
+ "loss": 2.6138,
738
+ "grad_norm": 4.2812910079956055,
739
+ "learning_rate": 0.00035845410628019327,
740
+ "epoch": 3.5478260869565217,
741
  "step": 2040
742
  },
743
  {
744
+ "loss": 2.5039,
745
+ "grad_norm": 4.400162220001221,
746
+ "learning_rate": 0.0003565217391304348,
747
+ "epoch": 3.5826086956521737,
748
  "step": 2060
749
  },
750
  {
751
+ "loss": 2.5249,
752
+ "grad_norm": 4.217800617218018,
753
+ "learning_rate": 0.0003545893719806763,
754
+ "epoch": 3.617391304347826,
755
  "step": 2080
756
  },
757
  {
758
+ "loss": 2.4547,
759
+ "grad_norm": 4.076215744018555,
760
+ "learning_rate": 0.0003526570048309179,
761
+ "epoch": 3.6521739130434785,
 
 
 
 
 
 
 
 
 
762
  "step": 2100
763
  },
764
  {
765
+ "loss": 2.4315,
766
+ "grad_norm": 4.139514446258545,
767
+ "learning_rate": 0.0003507246376811594,
768
+ "epoch": 3.6869565217391305,
769
  "step": 2120
770
  },
771
  {
772
+ "loss": 2.3836,
773
+ "grad_norm": 4.118022918701172,
774
+ "learning_rate": 0.00034879227053140094,
775
+ "epoch": 3.7217391304347824,
776
  "step": 2140
777
  },
778
  {
779
+ "loss": 2.3284,
780
+ "grad_norm": 4.137601852416992,
781
+ "learning_rate": 0.00034685990338164253,
782
+ "epoch": 3.756521739130435,
783
  "step": 2160
784
  },
785
  {
786
+ "loss": 2.3095,
787
+ "grad_norm": 4.023979663848877,
788
+ "learning_rate": 0.00034492753623188406,
789
+ "epoch": 3.791304347826087,
790
  "step": 2180
791
  },
792
  {
793
+ "loss": 2.305,
794
+ "grad_norm": 4.042725086212158,
795
+ "learning_rate": 0.00034299516908212565,
796
+ "epoch": 3.8260869565217392,
797
  "step": 2200
798
  },
799
  {
800
+ "loss": 2.3237,
801
+ "grad_norm": 4.265875339508057,
802
+ "learning_rate": 0.0003410628019323672,
803
+ "epoch": 3.860869565217391,
804
  "step": 2220
805
  },
806
  {
807
+ "loss": 2.335,
808
+ "grad_norm": 4.205041408538818,
809
+ "learning_rate": 0.00033913043478260867,
810
+ "epoch": 3.8956521739130436,
811
  "step": 2240
812
  },
813
  {
814
+ "loss": 2.2341,
815
+ "grad_norm": 4.1344709396362305,
816
+ "learning_rate": 0.00033719806763285025,
817
+ "epoch": 3.9304347826086956,
818
  "step": 2260
819
  },
820
  {
821
+ "loss": 2.251,
822
+ "grad_norm": 4.247790813446045,
823
+ "learning_rate": 0.0003352657004830918,
824
+ "epoch": 3.965217391304348,
825
  "step": 2280
826
  },
827
  {
828
+ "loss": 2.3212,
829
+ "grad_norm": 4.859626770019531,
830
+ "learning_rate": 0.0003333333333333333,
831
+ "epoch": 4.0,
832
  "step": 2300
833
  },
834
  {
835
+ "eval_loss": 1.2223739624023438,
836
+ "eval_accuracy": 0.9435104236718225,
837
+ "eval_runtime": 14.8513,
838
+ "eval_samples_per_second": 100.126,
839
+ "eval_steps_per_second": 100.126,
840
+ "epoch": 4.0,
841
+ "step": 2300
842
+ },
843
+ {
844
+ "loss": 1.9133,
845
+ "grad_norm": 4.098020553588867,
846
+ "learning_rate": 0.0003314009661835749,
847
+ "epoch": 4.034782608695652,
848
  "step": 2320
849
  },
850
  {
851
+ "loss": 1.9814,
852
+ "grad_norm": 4.198029041290283,
853
+ "learning_rate": 0.00032946859903381644,
854
+ "epoch": 4.069565217391304,
855
  "step": 2340
856
  },
857
  {
858
+ "loss": 1.9505,
859
+ "grad_norm": 3.960844039916992,
860
+ "learning_rate": 0.000327536231884058,
861
+ "epoch": 4.104347826086957,
862
  "step": 2360
863
  },
864
  {
865
+ "loss": 1.8815,
866
+ "grad_norm": 4.0190300941467285,
867
+ "learning_rate": 0.0003256038647342995,
868
+ "epoch": 4.139130434782609,
869
  "step": 2380
870
  },
871
  {
872
+ "loss": 1.8365,
873
+ "grad_norm": 4.040708541870117,
874
+ "learning_rate": 0.00032367149758454105,
875
+ "epoch": 4.173913043478261,
876
  "step": 2400
877
  },
878
  {
879
+ "loss": 1.84,
880
+ "grad_norm": 4.077364444732666,
881
+ "learning_rate": 0.0003217391304347826,
882
+ "epoch": 4.208695652173913,
883
  "step": 2420
884
  },
885
  {
886
+ "loss": 1.8864,
887
+ "grad_norm": 4.267309188842773,
888
+ "learning_rate": 0.0003199033816425121,
889
+ "epoch": 4.243478260869566,
890
  "step": 2440
891
  },
892
  {
893
+ "loss": 1.9015,
894
+ "grad_norm": 3.978663921356201,
895
+ "learning_rate": 0.00031797101449275363,
896
+ "epoch": 4.278260869565218,
897
  "step": 2460
898
  },
899
  {
900
+ "loss": 1.8388,
901
+ "grad_norm": 4.089256763458252,
902
+ "learning_rate": 0.0003160386473429952,
903
+ "epoch": 4.3130434782608695,
904
  "step": 2480
905
  },
906
  {
907
+ "loss": 1.7845,
908
+ "grad_norm": 3.9317057132720947,
909
+ "learning_rate": 0.0003141062801932367,
910
+ "epoch": 4.3478260869565215,
911
  "step": 2500
912
  },
913
  {
914
+ "loss": 1.7725,
915
+ "grad_norm": 3.9738080501556396,
916
+ "learning_rate": 0.00031217391304347823,
917
+ "epoch": 4.3826086956521735,
918
  "step": 2520
919
  },
920
  {
921
+ "loss": 1.852,
922
+ "grad_norm": 4.232215881347656,
923
+ "learning_rate": 0.0003102415458937198,
924
+ "epoch": 4.417391304347826,
925
  "step": 2540
926
  },
927
  {
928
+ "loss": 1.8234,
929
+ "grad_norm": 4.050131797790527,
930
+ "learning_rate": 0.00030830917874396136,
931
+ "epoch": 4.452173913043478,
932
  "step": 2560
933
  },
934
  {
935
+ "loss": 1.8148,
936
+ "grad_norm": 4.217935085296631,
937
+ "learning_rate": 0.0003063768115942029,
938
+ "epoch": 4.48695652173913,
939
  "step": 2580
940
  },
941
  {
942
+ "loss": 1.7134,
943
+ "grad_norm": 3.9807074069976807,
944
+ "learning_rate": 0.0003044444444444445,
945
+ "epoch": 4.521739130434782,
946
  "step": 2600
947
  },
948
  {
949
+ "loss": 1.6752,
950
+ "grad_norm": 4.05940580368042,
951
+ "learning_rate": 0.000302512077294686,
952
+ "epoch": 4.556521739130435,
 
 
 
 
 
 
 
 
 
953
  "step": 2620
954
  },
955
  {
956
+ "loss": 1.8413,
957
+ "grad_norm": 4.454566955566406,
958
+ "learning_rate": 0.00030057971014492755,
959
+ "epoch": 4.591304347826087,
960
  "step": 2640
961
  },
962
  {
963
+ "loss": 1.7948,
964
+ "grad_norm": 4.144088268280029,
965
+ "learning_rate": 0.0002986473429951691,
966
+ "epoch": 4.626086956521739,
967
  "step": 2660
968
  },
969
  {
970
+ "loss": 1.7468,
971
+ "grad_norm": 3.940176010131836,
972
+ "learning_rate": 0.0002967149758454106,
973
+ "epoch": 4.660869565217391,
974
  "step": 2680
975
  },
976
  {
977
+ "loss": 1.709,
978
+ "grad_norm": 4.198675632476807,
979
+ "learning_rate": 0.0002948792270531401,
980
+ "epoch": 4.695652173913043,
981
  "step": 2700
982
  },
983
  {
984
+ "loss": 1.6506,
985
+ "grad_norm": 3.976001501083374,
986
+ "learning_rate": 0.00029294685990338167,
987
+ "epoch": 4.730434782608696,
988
  "step": 2720
989
  },
990
  {
991
+ "loss": 1.7042,
992
+ "grad_norm": 4.033059120178223,
993
+ "learning_rate": 0.0002910144927536232,
994
+ "epoch": 4.765217391304348,
995
  "step": 2740
996
  },
997
  {
998
+ "loss": 1.6795,
999
+ "grad_norm": 4.062041759490967,
1000
+ "learning_rate": 0.0002890821256038648,
1001
+ "epoch": 4.8,
1002
  "step": 2760
1003
  },
1004
  {
1005
+ "loss": 1.7029,
1006
+ "grad_norm": 3.988589286804199,
1007
+ "learning_rate": 0.00028714975845410627,
1008
+ "epoch": 4.834782608695652,
1009
  "step": 2780
1010
  },
1011
  {
1012
+ "loss": 1.6641,
1013
+ "grad_norm": 4.16325044631958,
1014
+ "learning_rate": 0.0002852173913043478,
1015
+ "epoch": 4.869565217391305,
1016
  "step": 2800
1017
  },
1018
  {
1019
+ "loss": 1.6953,
1020
+ "grad_norm": 4.323537349700928,
1021
+ "learning_rate": 0.0002832850241545894,
1022
+ "epoch": 4.904347826086957,
1023
  "step": 2820
1024
  },
1025
  {
1026
+ "loss": 1.5863,
1027
+ "grad_norm": 3.8293144702911377,
1028
+ "learning_rate": 0.0002813526570048309,
1029
+ "epoch": 4.939130434782609,
1030
  "step": 2840
1031
  },
1032
  {
1033
+ "loss": 1.6276,
1034
+ "grad_norm": 3.8955535888671875,
1035
+ "learning_rate": 0.00027942028985507246,
1036
+ "epoch": 4.973913043478261,
1037
  "step": 2860
1038
  },
1039
  {
1040
+ "eval_loss": 0.8229038715362549,
1041
+ "eval_accuracy": 0.9677202420981843,
1042
+ "eval_runtime": 88.6744,
1043
+ "eval_samples_per_second": 16.769,
1044
+ "eval_steps_per_second": 16.769,
1045
+ "epoch": 5.0,
1046
+ "step": 2875
1047
+ },
1048
+ {
1049
+ "loss": 1.5701,
1050
+ "grad_norm": 3.8480091094970703,
1051
+ "learning_rate": 0.00027748792270531405,
1052
+ "epoch": 5.008695652173913,
1053
  "step": 2880
1054
  },
1055
  {
1056
+ "loss": 1.3786,
1057
+ "grad_norm": 3.679872512817383,
1058
+ "learning_rate": 0.0002755555555555556,
1059
+ "epoch": 5.043478260869565,
1060
  "step": 2900
1061
  },
1062
  {
1063
+ "loss": 1.3563,
1064
+ "grad_norm": 4.13381290435791,
1065
+ "learning_rate": 0.00027362318840579706,
1066
+ "epoch": 5.078260869565217,
1067
  "step": 2920
1068
  },
1069
  {
1070
+ "loss": 1.3588,
1071
+ "grad_norm": 3.7467329502105713,
1072
+ "learning_rate": 0.00027169082125603865,
1073
+ "epoch": 5.113043478260869,
1074
  "step": 2940
1075
  },
1076
  {
1077
+ "loss": 1.3782,
1078
+ "grad_norm": 3.5837419033050537,
1079
+ "learning_rate": 0.0002698550724637681,
1080
+ "epoch": 5.147826086956521,
1081
  "step": 2960
1082
  },
1083
  {
1084
+ "loss": 1.3969,
1085
+ "grad_norm": 4.077097415924072,
1086
+ "learning_rate": 0.00026792270531400964,
1087
+ "epoch": 5.182608695652174,
1088
  "step": 2980
1089
  },
1090
  {
1091
+ "loss": 1.3346,
1092
+ "grad_norm": 3.5995211601257324,
1093
+ "learning_rate": 0.00026599033816425123,
1094
+ "epoch": 5.217391304347826,
1095
  "step": 3000
1096
  },
1097
  {
1098
+ "loss": 1.3772,
1099
+ "grad_norm": 3.714010000228882,
1100
+ "learning_rate": 0.00026405797101449277,
1101
+ "epoch": 5.252173913043478,
1102
  "step": 3020
1103
  },
1104
  {
1105
+ "loss": 1.3452,
1106
+ "grad_norm": 3.807094097137451,
1107
+ "learning_rate": 0.00026231884057971016,
1108
+ "epoch": 5.28695652173913,
1109
  "step": 3040
1110
  },
1111
  {
1112
+ "loss": 1.3161,
1113
+ "grad_norm": 4.012477397918701,
1114
+ "learning_rate": 0.0002603864734299517,
1115
+ "epoch": 5.321739130434783,
1116
  "step": 3060
1117
  },
1118
  {
1119
+ "loss": 1.3146,
1120
+ "grad_norm": 3.850520372390747,
1121
+ "learning_rate": 0.0002584541062801932,
1122
+ "epoch": 5.356521739130435,
1123
  "step": 3080
1124
  },
1125
  {
1126
+ "loss": 1.3057,
1127
+ "grad_norm": NaN,
1128
+ "learning_rate": 0.00025661835748792274,
1129
+ "epoch": 5.391304347826087,
1130
  "step": 3100
1131
  },
1132
  {
1133
+ "loss": 1.2619,
1134
+ "grad_norm": 3.697744607925415,
1135
+ "learning_rate": 0.0002546859903381643,
1136
+ "epoch": 5.426086956521739,
1137
  "step": 3120
1138
  },
1139
  {
1140
+ "loss": 1.3436,
1141
+ "grad_norm": 4.125018119812012,
1142
+ "learning_rate": 0.00025275362318840576,
1143
+ "epoch": 5.460869565217392,
 
 
 
 
 
 
 
 
 
1144
  "step": 3140
1145
  },
1146
  {
1147
+ "loss": 1.3289,
1148
+ "grad_norm": 4.1491899490356445,
1149
+ "learning_rate": 0.00025082125603864735,
1150
+ "epoch": 5.495652173913044,
1151
  "step": 3160
1152
  },
1153
  {
1154
+ "loss": 1.218,
1155
+ "grad_norm": 3.9294846057891846,
1156
+ "learning_rate": 0.0002488888888888889,
1157
+ "epoch": 5.530434782608696,
1158
  "step": 3180
1159
  },
1160
  {
1161
+ "loss": 1.3219,
1162
+ "grad_norm": 3.9030706882476807,
1163
+ "learning_rate": 0.00024695652173913047,
1164
+ "epoch": 5.565217391304348,
1165
  "step": 3200
1166
  },
1167
  {
1168
+ "loss": 1.2694,
1169
+ "grad_norm": 4.124849319458008,
1170
+ "learning_rate": 0.000245024154589372,
1171
+ "epoch": 5.6,
1172
  "step": 3220
1173
  },
1174
  {
1175
+ "loss": 1.2379,
1176
+ "grad_norm": 4.1668500900268555,
1177
+ "learning_rate": 0.0002432850241545894,
1178
+ "epoch": 5.6347826086956525,
1179
  "step": 3240
1180
  },
1181
  {
1182
+ "loss": 1.2892,
1183
+ "grad_norm": 4.098198890686035,
1184
+ "learning_rate": 0.00024135265700483093,
1185
+ "epoch": 5.6695652173913045,
1186
  "step": 3260
1187
  },
1188
  {
1189
+ "loss": 1.2742,
1190
+ "grad_norm": 3.690241813659668,
1191
+ "learning_rate": 0.00023942028985507246,
1192
+ "epoch": 5.7043478260869565,
1193
  "step": 3280
1194
  },
1195
  {
1196
+ "loss": 1.1755,
1197
+ "grad_norm": 3.978963613510132,
1198
+ "learning_rate": 0.00023748792270531402,
1199
+ "epoch": 5.739130434782608,
1200
  "step": 3300
1201
  },
1202
  {
1203
+ "loss": 1.2256,
1204
+ "grad_norm": 3.7397215366363525,
1205
+ "learning_rate": 0.00023574879227053139,
1206
+ "epoch": 5.773913043478261,
1207
  "step": 3320
1208
  },
1209
  {
1210
+ "loss": 1.238,
1211
+ "grad_norm": 3.9201064109802246,
1212
+ "learning_rate": 0.00023391304347826088,
1213
+ "epoch": 5.808695652173913,
1214
  "step": 3340
1215
  },
1216
  {
1217
+ "loss": 1.1706,
1218
+ "grad_norm": 3.725389242172241,
1219
+ "learning_rate": 0.0002319806763285024,
1220
+ "epoch": 5.843478260869565,
1221
  "step": 3360
1222
  },
1223
  {
1224
+ "loss": 1.1644,
1225
+ "grad_norm": 3.5844123363494873,
1226
+ "learning_rate": 0.00023004830917874397,
1227
+ "epoch": 5.878260869565217,
1228
  "step": 3380
1229
  },
1230
  {
1231
+ "loss": 1.2256,
1232
+ "grad_norm": 3.79936146736145,
1233
+ "learning_rate": 0.00022821256038647343,
1234
+ "epoch": 5.913043478260869,
1235
  "step": 3400
1236
  },
1237
  {
1238
+ "loss": 1.2488,
1239
+ "grad_norm": 3.5947725772857666,
1240
+ "learning_rate": 0.00022628019323671497,
1241
+ "epoch": 5.947826086956522,
1242
  "step": 3420
1243
  },
1244
  {
1245
+ "loss": 1.1418,
1246
+ "grad_norm": NaN,
1247
+ "learning_rate": 0.00022444444444444446,
1248
+ "epoch": 5.982608695652174,
1249
  "step": 3440
1250
  },
1251
  {
1252
+ "eval_loss": 0.5840117335319519,
1253
+ "eval_accuracy": 0.9757901815736382,
1254
+ "eval_runtime": 97.2696,
1255
+ "eval_samples_per_second": 15.287,
1256
+ "eval_steps_per_second": 15.287,
1257
+ "epoch": 6.0,
1258
+ "step": 3450
1259
+ },
1260
+ {
1261
+ "loss": 1.1254,
1262
+ "grad_norm": 3.5959298610687256,
1263
+ "learning_rate": 0.00022260869565217392,
1264
+ "epoch": 6.017391304347826,
1265
  "step": 3460
1266
  },
1267
  {
1268
+ "loss": 1.0343,
1269
+ "grad_norm": 3.9623775482177734,
1270
+ "learning_rate": 0.00022067632850241545,
1271
+ "epoch": 6.052173913043478,
1272
  "step": 3480
1273
  },
1274
  {
1275
+ "loss": 1.0348,
1276
+ "grad_norm": 3.735102415084839,
1277
+ "learning_rate": 0.00021874396135265702,
1278
+ "epoch": 6.086956521739131,
1279
  "step": 3500
1280
  },
1281
  {
1282
+ "loss": 0.9796,
1283
+ "grad_norm": 3.4255013465881348,
1284
+ "learning_rate": 0.00021681159420289855,
1285
+ "epoch": 6.121739130434783,
1286
  "step": 3520
1287
  },
1288
  {
1289
+ "loss": 0.9865,
1290
+ "grad_norm": 3.981841564178467,
1291
+ "learning_rate": 0.00021497584541062804,
1292
+ "epoch": 6.156521739130435,
1293
  "step": 3540
1294
  },
1295
  {
1296
+ "loss": 1.0054,
1297
+ "grad_norm": 3.9057116508483887,
1298
+ "learning_rate": 0.00021314009661835748,
1299
+ "epoch": 6.191304347826087,
1300
  "step": 3560
1301
  },
1302
  {
1303
+ "loss": 1.0012,
1304
+ "grad_norm": 3.626560688018799,
1305
+ "learning_rate": 0.00021120772946859904,
1306
+ "epoch": 6.226086956521739,
1307
  "step": 3580
1308
  },
1309
  {
1310
+ "loss": 1.0129,
1311
+ "grad_norm": 3.687683582305908,
1312
+ "learning_rate": 0.0002093719806763285,
1313
+ "epoch": 6.260869565217392,
1314
  "step": 3600
1315
  },
1316
  {
1317
+ "loss": 0.9333,
1318
+ "grad_norm": 3.8632826805114746,
1319
+ "learning_rate": 0.00020763285024154592,
1320
+ "epoch": 6.2956521739130435,
1321
  "step": 3620
1322
  },
1323
  {
1324
+ "loss": 1.0259,
1325
+ "grad_norm": 4.089422702789307,
1326
+ "learning_rate": 0.0002058937198067633,
1327
+ "epoch": 6.3304347826086955,
1328
  "step": 3640
1329
  },
1330
  {
1331
+ "loss": 1.0184,
1332
+ "grad_norm": 4.261268615722656,
1333
+ "learning_rate": 0.00020415458937198067,
1334
+ "epoch": 6.3652173913043475,
1335
  "step": 3660
1336
  },
1337
  {
1338
+ "loss": 1.0293,
1339
+ "grad_norm": 2.3901586532592773,
1340
+ "learning_rate": 0.0002026086956521739,
1341
+ "epoch": 6.4,
 
 
 
 
 
 
 
 
 
1342
  "step": 3680
1343
  },
1344
  {
1345
+ "loss": 1.0026,
1346
+ "grad_norm": 2.233633518218994,
1347
+ "learning_rate": 0.00020067632850241546,
1348
+ "epoch": 6.434782608695652,
1349
  "step": 3700
1350
  },
1351
  {
1352
+ "loss": 1.0426,
1353
+ "grad_norm": 2.049773693084717,
1354
+ "learning_rate": 0.00019893719806763285,
1355
+ "epoch": 6.469565217391304,
1356
  "step": 3720
1357
  },
1358
  {
1359
+ "loss": 1.0324,
1360
+ "grad_norm": 2.21939754486084,
1361
+ "learning_rate": 0.0001970048309178744,
1362
+ "epoch": 6.504347826086956,
1363
  "step": 3740
1364
  },
1365
  {
1366
+ "loss": 1.0666,
1367
+ "grad_norm": 2.2138895988464355,
1368
+ "learning_rate": 0.00019516908212560387,
1369
+ "epoch": 6.539130434782608,
1370
  "step": 3760
1371
  },
1372
  {
1373
+ "loss": 1.0724,
1374
+ "grad_norm": 1.9186855554580688,
1375
+ "learning_rate": 0.0001932367149758454,
1376
+ "epoch": 6.573913043478261,
1377
  "step": 3780
1378
  },
1379
  {
1380
+ "loss": 1.0867,
1381
+ "grad_norm": 1.302451729774475,
1382
+ "learning_rate": 0.00019159420289855073,
1383
+ "epoch": 6.608695652173913,
1384
  "step": 3800
1385
  },
1386
  {
1387
+ "loss": 1.0659,
1388
+ "grad_norm": 1.1770459413528442,
1389
+ "learning_rate": 0.00018975845410628022,
1390
+ "epoch": 6.643478260869565,
1391
  "step": 3820
1392
  },
1393
  {
1394
+ "loss": 1.0494,
1395
+ "grad_norm": 0.2651650309562683,
1396
+ "learning_rate": 0.0001881159420289855,
1397
+ "epoch": 6.678260869565217,
1398
  "step": 3840
1399
  },
1400
  {
1401
+ "loss": 1.0464,
1402
+ "grad_norm": 0.0,
1403
+ "learning_rate": 0.0001867632850241546,
1404
+ "epoch": 6.71304347826087,
1405
  "step": 3860
1406
  },
1407
  {
1408
+ "loss": 1.0457,
1409
+ "grad_norm": 0.0,
1410
+ "learning_rate": 0.000185024154589372,
1411
+ "epoch": 6.747826086956522,
1412
  "step": 3880
1413
  },
1414
  {
1415
+ "loss": 0.9815,
1416
+ "grad_norm": 0.0,
1417
+ "learning_rate": 0.00018328502415458937,
1418
+ "epoch": 6.782608695652174,
1419
  "step": 3900
1420
  },
1421
  {
1422
+ "loss": 1.0094,
1423
+ "grad_norm": 0.0,
1424
+ "learning_rate": 0.0001816425120772947,
1425
+ "epoch": 6.817391304347826,
1426
  "step": 3920
1427
  },
1428
  {
1429
+ "loss": 1.0023,
1430
+ "grad_norm": NaN,
1431
+ "learning_rate": 0.00018028985507246377,
1432
+ "epoch": 6.852173913043478,
1433
  "step": 3940
1434
  },
1435
  {
1436
+ "loss": 1.0278,
1437
+ "grad_norm": 0.0,
1438
+ "learning_rate": 0.00017893719806763288,
1439
+ "epoch": 6.886956521739131,
1440
  "step": 3960
1441
  },
1442
  {
1443
+ "loss": 1.0123,
1444
+ "grad_norm": 0.0,
1445
+ "learning_rate": 0.0001771014492753623,
1446
+ "epoch": 6.921739130434783,
1447
  "step": 3980
1448
  },
1449
  {
1450
+ "loss": 1.0774,
1451
+ "grad_norm": 0.0,
1452
+ "learning_rate": 0.00017565217391304346,
1453
+ "epoch": 6.956521739130435,
1454
  "step": 4000
1455
  },
1456
  {
1457
+ "loss": 1.0484,
1458
+ "grad_norm": 0.0,
1459
+ "learning_rate": 0.00017391304347826088,
1460
+ "epoch": 6.9913043478260875,
1461
  "step": 4020
1462
  },
1463
  {
1464
+ "eval_loss": 0.5780686736106873,
1465
+ "eval_accuracy": 0.9737726967047747,
1466
+ "eval_runtime": 118.8154,
1467
+ "eval_samples_per_second": 12.515,
1468
+ "eval_steps_per_second": 12.515,
1469
+ "epoch": 7.0,
1470
+ "step": 4025
1471
+ },
1472
+ {
1473
+ "loss": 0.9799,
1474
+ "grad_norm": 0.0,
1475
+ "learning_rate": 0.0001723671497584541,
1476
+ "epoch": 7.026086956521739,
1477
  "step": 4040
1478
  },
1479
  {
1480
+ "loss": 0.9588,
1481
+ "grad_norm": 0.0,
1482
+ "learning_rate": 0.00017091787439613525,
1483
+ "epoch": 7.060869565217391,
1484
  "step": 4060
1485
  },
1486
  {
1487
+ "loss": 0.9421,
1488
+ "grad_norm": NaN,
1489
+ "learning_rate": 0.00016966183574879226,
1490
+ "epoch": 7.095652173913043,
1491
  "step": 4080
1492
  },
1493
  {
1494
+ "loss": 0.9551,
1495
+ "grad_norm": 0.0,
1496
+ "learning_rate": 0.00016782608695652175,
1497
+ "epoch": 7.130434782608695,
1498
  "step": 4100
1499
  },
1500
  {
1501
+ "loss": 0.9622,
1502
+ "grad_norm": 0.0,
1503
+ "learning_rate": 0.00016618357487922704,
1504
+ "epoch": 7.165217391304348,
1505
  "step": 4120
1506
  },
1507
  {
1508
+ "loss": 0.9712,
1509
+ "grad_norm": 0.0,
1510
+ "learning_rate": 0.00016444444444444446,
1511
+ "epoch": 7.2,
1512
  "step": 4140
1513
  },
1514
  {
1515
+ "loss": 0.9834,
1516
+ "grad_norm": 0.0,
1517
+ "learning_rate": 0.00016299516908212561,
1518
+ "epoch": 7.234782608695652,
1519
  "step": 4160
1520
  },
1521
  {
1522
+ "loss": 0.9968,
1523
+ "grad_norm": NaN,
1524
+ "learning_rate": 0.00016135265700483093,
1525
+ "epoch": 7.269565217391304,
1526
  "step": 4180
1527
  },
1528
  {
1529
+ "loss": 0.956,
1530
+ "grad_norm": 0.0,
1531
+ "learning_rate": 0.00015961352657004833,
1532
+ "epoch": 7.304347826086957,
 
 
 
 
 
 
 
 
 
1533
  "step": 4200
1534
  },
1535
  {
1536
+ "loss": 0.8981,
1537
+ "grad_norm": 0.0,
1538
+ "learning_rate": 0.00015806763285024155,
1539
+ "epoch": 7.339130434782609,
1540
  "step": 4220
1541
  },
1542
  {
1543
+ "loss": 0.9515,
1544
+ "grad_norm": 0.0,
1545
+ "learning_rate": 0.00015642512077294684,
1546
+ "epoch": 7.373913043478261,
1547
  "step": 4240
1548
  },
1549
  {
1550
+ "loss": 0.9535,
1551
+ "grad_norm": 0.0,
1552
+ "learning_rate": 0.0001548792270531401,
1553
+ "epoch": 7.408695652173913,
1554
  "step": 4260
1555
  },
1556
  {
1557
+ "loss": 0.9646,
1558
+ "grad_norm": NaN,
1559
+ "learning_rate": 0.00015333333333333334,
1560
+ "epoch": 7.443478260869565,
1561
  "step": 4280
1562
  },
1563
  {
1564
+ "loss": 0.9821,
1565
+ "grad_norm": 0.0,
1566
+ "learning_rate": 0.00015140096618357487,
1567
+ "epoch": 7.478260869565218,
1568
  "step": 4300
1569
  },
1570
  {
1571
+ "loss": 0.9259,
1572
+ "grad_norm": 0.0,
1573
+ "learning_rate": 0.00015014492753623188,
1574
+ "epoch": 7.51304347826087,
1575
  "step": 4320
1576
  },
1577
  {
1578
+ "loss": 0.9494,
1579
+ "grad_norm": 0.0,
1580
+ "learning_rate": 0.00014869565217391303,
1581
+ "epoch": 7.547826086956522,
1582
  "step": 4340
1583
  },
1584
  {
1585
+ "loss": 0.9305,
1586
+ "grad_norm": 0.0,
1587
+ "learning_rate": 0.00014714975845410628,
1588
+ "epoch": 7.582608695652174,
1589
  "step": 4360
1590
  },
1591
  {
1592
+ "loss": 0.8889,
1593
+ "grad_norm": 0.0,
1594
+ "learning_rate": 0.0001455072463768116,
1595
+ "epoch": 7.6173913043478265,
1596
  "step": 4380
1597
  },
1598
  {
1599
+ "loss": 0.9524,
1600
+ "grad_norm": 0.0,
1601
+ "learning_rate": 0.00014396135265700482,
1602
+ "epoch": 7.6521739130434785,
1603
  "step": 4400
1604
  },
1605
  {
1606
+ "loss": 0.9065,
1607
+ "grad_norm": 0.0,
1608
+ "learning_rate": 0.00014231884057971014,
1609
+ "epoch": 7.6869565217391305,
1610
  "step": 4420
1611
  },
1612
  {
1613
+ "loss": 0.9153,
1614
+ "grad_norm": 0.0,
1615
+ "learning_rate": 0.00014048309178743963,
1616
+ "epoch": 7.721739130434782,
1617
  "step": 4440
1618
  },
1619
  {
1620
+ "loss": 0.6675,
1621
+ "grad_norm": NaN,
1622
+ "learning_rate": 0.0001403864734299517,
1623
+ "epoch": 7.756521739130434,
1624
  "step": 4460
1625
  },
1626
  {
1627
+ "loss": 0.0,
1628
+ "grad_norm": NaN,
1629
+ "learning_rate": 0.0001403864734299517,
1630
+ "epoch": 7.791304347826087,
1631
  "step": 4480
1632
  },
1633
  {
1634
+ "loss": 0.0,
1635
+ "grad_norm": NaN,
1636
+ "learning_rate": 0.0001403864734299517,
1637
+ "epoch": 7.826086956521739,
1638
  "step": 4500
1639
  },
1640
  {
1641
+ "loss": 0.0,
1642
+ "grad_norm": NaN,
1643
+ "learning_rate": 0.0001403864734299517,
1644
+ "epoch": 7.860869565217391,
1645
  "step": 4520
1646
  },
1647
  {
1648
+ "loss": 0.0,
1649
+ "grad_norm": NaN,
1650
+ "learning_rate": 0.0001403864734299517,
1651
+ "epoch": 7.895652173913043,
1652
  "step": 4540
1653
  },
1654
  {
1655
+ "loss": 0.0,
1656
+ "grad_norm": NaN,
1657
+ "learning_rate": 0.0001403864734299517,
1658
+ "epoch": 7.930434782608696,
1659
  "step": 4560
1660
  },
1661
  {
1662
+ "loss": 0.0,
1663
+ "grad_norm": NaN,
1664
+ "learning_rate": 0.0001403864734299517,
1665
+ "epoch": 7.965217391304348,
1666
  "step": 4580
1667
  },
1668
  {
1669
+ "loss": 0.0,
1670
+ "grad_norm": NaN,
1671
+ "learning_rate": 0.0001403864734299517,
1672
+ "epoch": 8.0,
1673
  "step": 4600
1674
  },
1675
  {
1676
+ "eval_loss": NaN,
1677
+ "eval_accuracy": 0.0006724949562878278,
1678
+ "eval_runtime": 129.6238,
1679
+ "eval_samples_per_second": 11.472,
1680
+ "eval_steps_per_second": 11.472,
1681
+ "epoch": 8.0,
1682
+ "step": 4600
1683
+ },
1684
+ {
1685
+ "loss": 0.0,
1686
+ "grad_norm": NaN,
1687
+ "learning_rate": 0.0001403864734299517,
1688
+ "epoch": 8.034782608695652,
1689
  "step": 4620
1690
  },
1691
  {
1692
+ "loss": 0.0,
1693
+ "grad_norm": NaN,
1694
+ "learning_rate": 0.0001403864734299517,
1695
+ "epoch": 8.069565217391304,
1696
  "step": 4640
1697
  },
1698
  {
1699
+ "loss": 0.0,
1700
+ "grad_norm": NaN,
1701
+ "learning_rate": 0.0001403864734299517,
1702
+ "epoch": 8.104347826086956,
1703
  "step": 4660
1704
  },
1705
  {
1706
+ "loss": 0.0,
1707
+ "grad_norm": NaN,
1708
+ "learning_rate": 0.0001403864734299517,
1709
+ "epoch": 8.139130434782608,
1710
  "step": 4680
1711
  },
1712
  {
1713
+ "loss": 0.0,
1714
+ "grad_norm": NaN,
1715
+ "learning_rate": 0.0001403864734299517,
1716
+ "epoch": 8.173913043478262,
1717
  "step": 4700
1718
  },
1719
  {
1720
+ "loss": 0.0,
1721
+ "grad_norm": NaN,
1722
+ "learning_rate": 0.0001403864734299517,
1723
+ "epoch": 8.208695652173914,
 
 
 
 
 
 
 
 
 
1724
  "step": 4720
1725
  },
1726
  {
1727
+ "loss": 0.0,
1728
+ "grad_norm": NaN,
1729
+ "learning_rate": 0.0001403864734299517,
1730
+ "epoch": 8.243478260869566,
1731
  "step": 4740
1732
  },
1733
  {
1734
+ "loss": 0.0,
1735
+ "grad_norm": NaN,
1736
+ "learning_rate": 0.0001403864734299517,
1737
+ "epoch": 8.278260869565218,
1738
  "step": 4760
1739
  },
1740
  {
1741
+ "loss": 0.0,
1742
+ "grad_norm": NaN,
1743
+ "learning_rate": 0.0001403864734299517,
1744
+ "epoch": 8.31304347826087,
1745
  "step": 4780
1746
  },
1747
  {
1748
+ "loss": 0.0,
1749
+ "grad_norm": NaN,
1750
+ "learning_rate": 0.0001403864734299517,
1751
+ "epoch": 8.347826086956522,
1752
  "step": 4800
1753
  },
1754
  {
1755
+ "loss": 0.0,
1756
+ "grad_norm": NaN,
1757
+ "learning_rate": 0.0001403864734299517,
1758
+ "epoch": 8.382608695652173,
1759
  "step": 4820
1760
  },
1761
  {
1762
+ "loss": 0.0,
1763
+ "grad_norm": NaN,
1764
+ "learning_rate": 0.0001403864734299517,
1765
+ "epoch": 8.417391304347825,
1766
  "step": 4840
1767
  },
1768
  {
1769
+ "loss": 0.0,
1770
+ "grad_norm": NaN,
1771
+ "learning_rate": 0.0001403864734299517,
1772
+ "epoch": 8.452173913043477,
1773
  "step": 4860
1774
  },
1775
  {
1776
+ "loss": 0.0,
1777
+ "grad_norm": NaN,
1778
+ "learning_rate": 0.0001403864734299517,
1779
+ "epoch": 8.486956521739131,
1780
  "step": 4880
1781
  },
1782
  {
1783
+ "loss": 0.0,
1784
+ "grad_norm": NaN,
1785
+ "learning_rate": 0.0001403864734299517,
1786
+ "epoch": 8.521739130434783,
1787
  "step": 4900
1788
  },
1789
  {
1790
+ "loss": 0.0,
1791
+ "grad_norm": NaN,
1792
+ "learning_rate": 0.0001403864734299517,
1793
+ "epoch": 8.556521739130435,
1794
  "step": 4920
1795
  },
1796
  {
1797
+ "loss": 0.0,
1798
+ "grad_norm": NaN,
1799
+ "learning_rate": 0.0001403864734299517,
1800
+ "epoch": 8.591304347826087,
1801
  "step": 4940
1802
  },
1803
  {
1804
+ "loss": 0.0,
1805
+ "grad_norm": NaN,
1806
+ "learning_rate": 0.0001403864734299517,
1807
+ "epoch": 8.626086956521739,
1808
  "step": 4960
1809
  },
1810
  {
1811
+ "loss": 0.0,
1812
+ "grad_norm": NaN,
1813
+ "learning_rate": 0.0001403864734299517,
1814
+ "epoch": 8.660869565217391,
1815
  "step": 4980
1816
  },
1817
  {
1818
+ "loss": 0.0,
1819
+ "grad_norm": NaN,
1820
+ "learning_rate": 0.0001403864734299517,
1821
+ "epoch": 8.695652173913043,
1822
  "step": 5000
1823
  },
1824
  {
1825
+ "loss": 0.0,
1826
+ "grad_norm": NaN,
1827
+ "learning_rate": 0.0001403864734299517,
1828
+ "epoch": 8.730434782608695,
1829
  "step": 5020
1830
  },
1831
  {
1832
+ "loss": 0.0,
1833
+ "grad_norm": NaN,
1834
+ "learning_rate": 0.0001403864734299517,
1835
+ "epoch": 8.765217391304347,
1836
  "step": 5040
1837
  },
1838
  {
1839
+ "loss": 0.0,
1840
+ "grad_norm": NaN,
1841
+ "learning_rate": 0.0001403864734299517,
1842
+ "epoch": 8.8,
1843
  "step": 5060
1844
  },
1845
  {
1846
+ "loss": 0.0,
1847
+ "grad_norm": NaN,
1848
+ "learning_rate": 0.0001403864734299517,
1849
+ "epoch": 8.834782608695653,
1850
  "step": 5080
1851
  },
1852
  {
1853
+ "loss": 0.0,
1854
+ "grad_norm": NaN,
1855
+ "learning_rate": 0.0001403864734299517,
1856
+ "epoch": 8.869565217391305,
1857
  "step": 5100
1858
  },
1859
  {
1860
+ "loss": 0.0,
1861
+ "grad_norm": NaN,
1862
+ "learning_rate": 0.0001403864734299517,
1863
+ "epoch": 8.904347826086957,
1864
  "step": 5120
1865
  },
1866
  {
1867
+ "loss": 0.0,
1868
+ "grad_norm": NaN,
1869
+ "learning_rate": 0.0001403864734299517,
1870
+ "epoch": 8.939130434782609,
1871
  "step": 5140
1872
  },
1873
  {
1874
+ "loss": 0.0,
1875
+ "grad_norm": NaN,
1876
+ "learning_rate": 0.0001403864734299517,
1877
+ "epoch": 8.97391304347826,
1878
  "step": 5160
1879
  },
1880
  {
1881
+ "eval_loss": NaN,
1882
+ "eval_accuracy": 0.0006724949562878278,
1883
+ "eval_runtime": 117.1288,
1884
+ "eval_samples_per_second": 12.695,
1885
+ "eval_steps_per_second": 12.695,
1886
+ "epoch": 9.0,
1887
+ "step": 5175
1888
+ },
1889
+ {
1890
+ "loss": 0.0,
1891
+ "grad_norm": NaN,
1892
+ "learning_rate": 0.0001403864734299517,
1893
+ "epoch": 9.008695652173913,
1894
  "step": 5180
1895
  },
1896
  {
1897
+ "loss": 0.0,
1898
+ "grad_norm": NaN,
1899
+ "learning_rate": 0.0001403864734299517,
1900
+ "epoch": 9.043478260869565,
1901
  "step": 5200
1902
  },
1903
  {
1904
+ "loss": 0.0,
1905
+ "grad_norm": NaN,
1906
+ "learning_rate": 0.0001403864734299517,
1907
+ "epoch": 9.078260869565218,
1908
  "step": 5220
1909
  },
1910
  {
1911
+ "loss": 0.0,
1912
+ "grad_norm": NaN,
1913
+ "learning_rate": 0.0001403864734299517,
1914
+ "epoch": 9.11304347826087,
1915
+ "step": 5240
1916
+ },
1917
+ {
1918
+ "loss": 0.0,
1919
+ "grad_norm": NaN,
1920
+ "learning_rate": 0.0001403864734299517,
1921
+ "epoch": 9.147826086956522,
1922
+ "step": 5260
1923
+ },
1924
+ {
1925
+ "loss": 0.0,
1926
+ "grad_norm": NaN,
1927
+ "learning_rate": 0.0001403864734299517,
1928
+ "epoch": 9.182608695652174,
1929
+ "step": 5280
1930
+ },
1931
+ {
1932
+ "loss": 0.0,
1933
+ "grad_norm": NaN,
1934
+ "learning_rate": 0.0001403864734299517,
1935
+ "epoch": 9.217391304347826,
1936
+ "step": 5300
1937
+ },
1938
+ {
1939
+ "loss": 0.0,
1940
+ "grad_norm": NaN,
1941
+ "learning_rate": 0.0001403864734299517,
1942
+ "epoch": 9.252173913043478,
1943
+ "step": 5320
1944
+ },
1945
+ {
1946
+ "loss": 0.0,
1947
+ "grad_norm": NaN,
1948
+ "learning_rate": 0.0001403864734299517,
1949
+ "epoch": 9.28695652173913,
1950
+ "step": 5340
1951
+ },
1952
+ {
1953
+ "loss": 0.0,
1954
+ "grad_norm": NaN,
1955
+ "learning_rate": 0.0001403864734299517,
1956
+ "epoch": 9.321739130434782,
1957
+ "step": 5360
1958
+ },
1959
+ {
1960
+ "loss": 0.0,
1961
+ "grad_norm": NaN,
1962
+ "learning_rate": 0.0001403864734299517,
1963
+ "epoch": 9.356521739130434,
1964
+ "step": 5380
1965
+ },
1966
+ {
1967
+ "loss": 0.0,
1968
+ "grad_norm": NaN,
1969
+ "learning_rate": 0.0001403864734299517,
1970
+ "epoch": 9.391304347826088,
1971
+ "step": 5400
1972
+ },
1973
+ {
1974
+ "loss": 0.0,
1975
+ "grad_norm": NaN,
1976
+ "learning_rate": 0.0001403864734299517,
1977
+ "epoch": 9.42608695652174,
1978
+ "step": 5420
1979
+ },
1980
+ {
1981
+ "loss": 0.0,
1982
+ "grad_norm": NaN,
1983
+ "learning_rate": 0.0001403864734299517,
1984
+ "epoch": 9.460869565217392,
1985
+ "step": 5440
1986
+ },
1987
+ {
1988
+ "loss": 0.0,
1989
+ "grad_norm": NaN,
1990
+ "learning_rate": 0.0001403864734299517,
1991
+ "epoch": 9.495652173913044,
1992
+ "step": 5460
1993
+ },
1994
+ {
1995
+ "loss": 0.0,
1996
+ "grad_norm": NaN,
1997
+ "learning_rate": 0.0001403864734299517,
1998
+ "epoch": 9.530434782608696,
1999
+ "step": 5480
2000
+ },
2001
+ {
2002
+ "loss": 0.0,
2003
+ "grad_norm": NaN,
2004
+ "learning_rate": 0.0001403864734299517,
2005
+ "epoch": 9.565217391304348,
2006
+ "step": 5500
2007
+ },
2008
+ {
2009
+ "loss": 0.0,
2010
+ "grad_norm": NaN,
2011
+ "learning_rate": 0.0001403864734299517,
2012
+ "epoch": 9.6,
2013
+ "step": 5520
2014
+ },
2015
+ {
2016
+ "loss": 0.0,
2017
+ "grad_norm": NaN,
2018
+ "learning_rate": 0.0001403864734299517,
2019
+ "epoch": 9.634782608695652,
2020
+ "step": 5540
2021
+ },
2022
+ {
2023
+ "loss": 0.0,
2024
+ "grad_norm": NaN,
2025
+ "learning_rate": 0.0001403864734299517,
2026
+ "epoch": 9.669565217391304,
2027
+ "step": 5560
2028
+ },
2029
+ {
2030
+ "loss": 0.0,
2031
+ "grad_norm": NaN,
2032
+ "learning_rate": 0.0001403864734299517,
2033
+ "epoch": 9.704347826086957,
2034
+ "step": 5580
2035
+ },
2036
+ {
2037
+ "loss": 0.0,
2038
+ "grad_norm": NaN,
2039
+ "learning_rate": 0.0001403864734299517,
2040
+ "epoch": 9.73913043478261,
2041
+ "step": 5600
2042
+ },
2043
+ {
2044
+ "loss": 0.0,
2045
+ "grad_norm": NaN,
2046
+ "learning_rate": 0.0001403864734299517,
2047
+ "epoch": 9.773913043478261,
2048
+ "step": 5620
2049
+ },
2050
+ {
2051
+ "loss": 0.0,
2052
+ "grad_norm": NaN,
2053
+ "learning_rate": 0.0001403864734299517,
2054
+ "epoch": 9.808695652173913,
2055
+ "step": 5640
2056
+ },
2057
+ {
2058
+ "loss": 0.0,
2059
+ "grad_norm": NaN,
2060
+ "learning_rate": 0.0001403864734299517,
2061
+ "epoch": 9.843478260869565,
2062
+ "step": 5660
2063
+ },
2064
+ {
2065
+ "loss": 0.0,
2066
+ "grad_norm": NaN,
2067
+ "learning_rate": 0.0001403864734299517,
2068
+ "epoch": 9.878260869565217,
2069
+ "step": 5680
2070
+ },
2071
+ {
2072
+ "loss": 0.0,
2073
+ "grad_norm": NaN,
2074
+ "learning_rate": 0.0001403864734299517,
2075
+ "epoch": 9.91304347826087,
2076
+ "step": 5700
2077
+ },
2078
+ {
2079
+ "loss": 0.0,
2080
+ "grad_norm": NaN,
2081
+ "learning_rate": 0.0001403864734299517,
2082
+ "epoch": 9.947826086956521,
2083
+ "step": 5720
2084
+ },
2085
+ {
2086
+ "loss": 0.0,
2087
+ "grad_norm": NaN,
2088
+ "learning_rate": 0.0001403864734299517,
2089
+ "epoch": 9.982608695652173,
2090
+ "step": 5740
2091
+ },
2092
+ {
2093
+ "eval_loss": NaN,
2094
+ "eval_accuracy": 0.0006724949562878278,
2095
+ "eval_runtime": 103.3199,
2096
+ "eval_samples_per_second": 14.392,
2097
+ "eval_steps_per_second": 14.392,
2098
  "epoch": 10.0,
2099
+ "step": 5750
2100
  },
2101
  {
2102
+ "train_runtime": 59857.6179,
2103
+ "train_samples_per_second": 24.584,
2104
+ "train_steps_per_second": 0.096,
2105
+ "total_flos": 2.7398100529152e+18,
2106
+ "train_loss": 2.9414075751926587,
2107
  "epoch": 10.0,
2108
+ "step": 5750
2109
  }
2110
  ]
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 10.0,
3
- "total_flos": 2.49073133395968e+18,
4
- "train_loss": 7.888943860726876,
5
- "train_runtime": 28748.4048,
6
- "train_samples_per_second": 46.534,
7
- "train_steps_per_second": 0.182
8
  }
 
1
  {
2
  "epoch": 10.0,
3
+ "total_flos": 2.7398100529152e+18,
4
+ "train_loss": 2.9414075751926587,
5
+ "train_runtime": 59857.6179,
6
+ "train_samples_per_second": 24.584,
7
+ "train_steps_per_second": 0.096
8
  }
trainer_state.json CHANGED
@@ -1,1942 +1,2124 @@
1
  {
2
- "best_metric": 0.8030272452068618,
3
- "best_model_checkpoint": "/mnt/data4_HDD_14TB/yang/voxceleb-checkpoints/ecapa-tdnn/voxceleb1/pretrain/c512-aam-len3-bs256-lr1e-4/checkpoint-5230",
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
- "global_step": 5230,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.03824091778202677,
13
- "grad_norm": 6.157718181610107,
14
- "learning_rate": 3.824091778202677e-06,
15
- "loss": 13.2232,
16
  "step": 20
17
  },
18
  {
19
- "epoch": 0.07648183556405354,
20
- "grad_norm": 6.144223213195801,
21
- "learning_rate": 7.648183556405354e-06,
22
- "loss": 13.2113,
23
  "step": 40
24
  },
25
  {
26
- "epoch": 0.1147227533460803,
27
- "grad_norm": 6.032691955566406,
28
- "learning_rate": 1.147227533460803e-05,
29
- "loss": 13.1625,
30
  "step": 60
31
  },
32
  {
33
- "epoch": 0.15296367112810708,
34
- "grad_norm": 5.916826248168945,
35
- "learning_rate": 1.529636711281071e-05,
36
- "loss": 13.1174,
37
  "step": 80
38
  },
39
  {
40
- "epoch": 0.19120458891013384,
41
- "grad_norm": 5.7198004722595215,
42
- "learning_rate": 1.9120458891013384e-05,
43
- "loss": 13.0512,
44
  "step": 100
45
  },
46
  {
47
- "epoch": 0.2294455066921606,
48
- "grad_norm": 5.554529666900635,
49
- "learning_rate": 2.294455066921606e-05,
50
- "loss": 12.9931,
51
  "step": 120
52
  },
53
  {
54
- "epoch": 0.2676864244741874,
55
- "grad_norm": 5.364482879638672,
56
- "learning_rate": 2.6768642447418742e-05,
57
- "loss": 12.9042,
58
  "step": 140
59
  },
60
  {
61
- "epoch": 0.30592734225621415,
62
- "grad_norm": 5.091818809509277,
63
- "learning_rate": 3.059273422562142e-05,
64
- "loss": 12.8488,
65
  "step": 160
66
  },
67
  {
68
- "epoch": 0.3441682600382409,
69
- "grad_norm": 5.035643577575684,
70
- "learning_rate": 3.441682600382409e-05,
71
- "loss": 12.7715,
72
  "step": 180
73
  },
74
  {
75
- "epoch": 0.3824091778202677,
76
- "grad_norm": 4.819056987762451,
77
- "learning_rate": 3.824091778202677e-05,
78
- "loss": 12.6747,
79
  "step": 200
80
  },
81
  {
82
- "epoch": 0.42065009560229444,
83
- "grad_norm": 4.597919464111328,
84
- "learning_rate": 4.2065009560229444e-05,
85
- "loss": 12.6366,
86
  "step": 220
87
  },
88
  {
89
- "epoch": 0.4588910133843212,
90
- "grad_norm": 4.551054954528809,
91
- "learning_rate": 4.588910133843212e-05,
92
- "loss": 12.5388,
93
  "step": 240
94
  },
95
  {
96
- "epoch": 0.497131931166348,
97
- "grad_norm": 4.289029598236084,
98
- "learning_rate": 4.97131931166348e-05,
99
- "loss": 12.4527,
100
  "step": 260
101
  },
102
  {
103
- "epoch": 0.5353728489483748,
104
- "grad_norm": 4.291126728057861,
105
- "learning_rate": 5.3537284894837484e-05,
106
- "loss": 12.3809,
107
  "step": 280
108
  },
109
  {
110
- "epoch": 0.5736137667304015,
111
- "grad_norm": 4.090356826782227,
112
- "learning_rate": 5.736137667304016e-05,
113
- "loss": 12.3185,
114
  "step": 300
115
  },
116
  {
117
- "epoch": 0.6118546845124283,
118
- "grad_norm": 3.9066805839538574,
119
- "learning_rate": 6.118546845124283e-05,
120
- "loss": 12.2101,
121
  "step": 320
122
  },
123
  {
124
- "epoch": 0.6500956022944551,
125
- "grad_norm": 3.937908887863159,
126
- "learning_rate": 6.50095602294455e-05,
127
- "loss": 12.1255,
128
  "step": 340
129
  },
130
  {
131
- "epoch": 0.6883365200764818,
132
- "grad_norm": 3.919820547103882,
133
- "learning_rate": 6.883365200764819e-05,
134
- "loss": 12.0543,
135
  "step": 360
136
  },
137
  {
138
- "epoch": 0.7265774378585086,
139
- "grad_norm": 3.8298187255859375,
140
- "learning_rate": 7.265774378585087e-05,
141
- "loss": 11.9417,
142
  "step": 380
143
  },
144
  {
145
- "epoch": 0.7648183556405354,
146
- "grad_norm": 3.7290520668029785,
147
- "learning_rate": 7.648183556405354e-05,
148
- "loss": 11.8644,
149
  "step": 400
150
  },
151
  {
152
- "epoch": 0.8030592734225621,
153
- "grad_norm": 3.76938533782959,
154
- "learning_rate": 8.030592734225622e-05,
155
- "loss": 11.8122,
156
  "step": 420
157
  },
158
  {
159
- "epoch": 0.8413001912045889,
160
- "grad_norm": 3.8729827404022217,
161
- "learning_rate": 8.413001912045889e-05,
162
- "loss": 11.7117,
163
  "step": 440
164
  },
165
  {
166
- "epoch": 0.8795411089866156,
167
- "grad_norm": 3.7178924083709717,
168
- "learning_rate": 8.795411089866157e-05,
169
- "loss": 11.6245,
170
  "step": 460
171
  },
172
  {
173
- "epoch": 0.9177820267686424,
174
- "grad_norm": 3.7744827270507812,
175
- "learning_rate": 9.177820267686424e-05,
176
- "loss": 11.547,
177
  "step": 480
178
  },
179
  {
180
- "epoch": 0.9560229445506692,
181
- "grad_norm": 3.6705052852630615,
182
- "learning_rate": 9.560229445506692e-05,
183
- "loss": 11.4699,
184
  "step": 500
185
  },
186
  {
187
- "epoch": 0.994263862332696,
188
- "grad_norm": 3.6992719173431396,
189
- "learning_rate": 9.94263862332696e-05,
190
- "loss": 11.3851,
191
  "step": 520
192
  },
193
  {
194
- "epoch": 1.0,
195
- "eval_accuracy": 0.18062563067608475,
196
- "eval_loss": 11.029301643371582,
197
- "eval_runtime": 592.6353,
198
- "eval_samples_per_second": 25.083,
199
- "eval_steps_per_second": 25.083,
200
- "step": 523
201
- },
202
- {
203
- "epoch": 1.0325047801147227,
204
- "grad_norm": 3.6838159561157227,
205
- "learning_rate": 9.963883577650308e-05,
206
- "loss": 11.2589,
207
  "step": 540
208
  },
209
  {
210
- "epoch": 1.0707456978967496,
211
- "grad_norm": 3.7846293449401855,
212
- "learning_rate": 9.921393669003612e-05,
213
- "loss": 11.1668,
214
  "step": 560
215
  },
216
  {
217
- "epoch": 1.1089866156787762,
218
- "grad_norm": 3.688416004180908,
219
- "learning_rate": 9.878903760356916e-05,
220
- "loss": 11.1053,
 
 
 
 
 
 
 
 
 
221
  "step": 580
222
  },
223
  {
224
- "epoch": 1.147227533460803,
225
- "grad_norm": 3.724273204803467,
226
- "learning_rate": 9.836413851710219e-05,
227
- "loss": 11.019,
228
  "step": 600
229
  },
230
  {
231
- "epoch": 1.1854684512428297,
232
- "grad_norm": 3.840388536453247,
233
- "learning_rate": 9.793923943063523e-05,
234
- "loss": 10.9731,
235
  "step": 620
236
  },
237
  {
238
- "epoch": 1.2237093690248566,
239
- "grad_norm": 3.828228235244751,
240
- "learning_rate": 9.751434034416827e-05,
241
- "loss": 10.875,
242
  "step": 640
243
  },
244
  {
245
- "epoch": 1.2619502868068833,
246
- "grad_norm": 3.891911745071411,
247
- "learning_rate": 9.70894412577013e-05,
248
- "loss": 10.8111,
249
  "step": 660
250
  },
251
  {
252
- "epoch": 1.3001912045889101,
253
- "grad_norm": 3.8076562881469727,
254
- "learning_rate": 9.666454217123433e-05,
255
- "loss": 10.7717,
256
  "step": 680
257
  },
258
  {
259
- "epoch": 1.338432122370937,
260
- "grad_norm": 3.8521881103515625,
261
- "learning_rate": 9.623964308476737e-05,
262
- "loss": 10.6723,
263
  "step": 700
264
  },
265
  {
266
- "epoch": 1.3766730401529637,
267
- "grad_norm": 3.8576488494873047,
268
- "learning_rate": 9.58147439983004e-05,
269
- "loss": 10.5961,
270
  "step": 720
271
  },
272
  {
273
- "epoch": 1.4149139579349903,
274
- "grad_norm": 4.002715587615967,
275
- "learning_rate": 9.538984491183345e-05,
276
- "loss": 10.5392,
277
  "step": 740
278
  },
279
  {
280
- "epoch": 1.4531548757170172,
281
- "grad_norm": 3.8657026290893555,
282
- "learning_rate": 9.496494582536648e-05,
283
- "loss": 10.5018,
284
  "step": 760
285
  },
286
  {
287
- "epoch": 1.491395793499044,
288
- "grad_norm": 3.9424169063568115,
289
- "learning_rate": 9.454004673889951e-05,
290
- "loss": 10.4325,
291
  "step": 780
292
  },
293
  {
294
- "epoch": 1.5296367112810707,
295
- "grad_norm": 3.9783968925476074,
296
- "learning_rate": 9.411514765243256e-05,
297
- "loss": 10.3722,
298
  "step": 800
299
  },
300
  {
301
- "epoch": 1.5678776290630974,
302
- "grad_norm": 4.081951141357422,
303
- "learning_rate": 9.369024856596559e-05,
304
- "loss": 10.3069,
305
  "step": 820
306
  },
307
  {
308
- "epoch": 1.6061185468451242,
309
- "grad_norm": 4.141290187835693,
310
- "learning_rate": 9.326534947949863e-05,
311
- "loss": 10.2527,
312
  "step": 840
313
  },
314
  {
315
- "epoch": 1.644359464627151,
316
- "grad_norm": 4.294083595275879,
317
- "learning_rate": 9.284045039303167e-05,
318
- "loss": 10.2271,
319
  "step": 860
320
  },
321
  {
322
- "epoch": 1.682600382409178,
323
- "grad_norm": 4.727543354034424,
324
- "learning_rate": 9.241555130656469e-05,
325
- "loss": 10.1756,
326
  "step": 880
327
  },
328
  {
329
- "epoch": 1.7208413001912046,
330
- "grad_norm": 4.068965911865234,
331
- "learning_rate": 9.199065222009773e-05,
332
- "loss": 10.0936,
333
  "step": 900
334
  },
335
  {
336
- "epoch": 1.7590822179732313,
337
- "grad_norm": 4.025643825531006,
338
- "learning_rate": 9.156575313363077e-05,
339
- "loss": 10.0937,
340
  "step": 920
341
  },
342
  {
343
- "epoch": 1.7973231357552581,
344
- "grad_norm": 4.317354679107666,
345
- "learning_rate": 9.11408540471638e-05,
346
- "loss": 10.0217,
347
  "step": 940
348
  },
349
  {
350
- "epoch": 1.835564053537285,
351
- "grad_norm": 4.101060390472412,
352
- "learning_rate": 9.071595496069684e-05,
353
- "loss": 9.9743,
354
  "step": 960
355
  },
356
  {
357
- "epoch": 1.8738049713193117,
358
- "grad_norm": 4.225609302520752,
359
- "learning_rate": 9.029105587422988e-05,
360
- "loss": 9.9879,
361
  "step": 980
362
  },
363
  {
364
- "epoch": 1.9120458891013383,
365
- "grad_norm": 4.3140668869018555,
366
- "learning_rate": 8.986615678776292e-05,
367
- "loss": 9.8273,
368
  "step": 1000
369
  },
370
  {
371
- "epoch": 1.9502868068833652,
372
- "grad_norm": 4.199500560760498,
373
- "learning_rate": 8.944125770129594e-05,
374
- "loss": 9.8136,
375
  "step": 1020
376
  },
377
  {
378
- "epoch": 1.988527724665392,
379
- "grad_norm": 4.457912445068359,
380
- "learning_rate": 8.901635861482898e-05,
381
- "loss": 9.7596,
382
  "step": 1040
383
  },
384
  {
385
- "epoch": 2.0,
386
- "eval_accuracy": 0.3849983181971073,
387
- "eval_loss": 9.140138626098633,
388
- "eval_runtime": 461.2724,
389
- "eval_samples_per_second": 32.226,
390
- "eval_steps_per_second": 32.226,
391
- "step": 1046
392
- },
393
- {
394
- "epoch": 2.026768642447419,
395
- "grad_norm": 4.428006172180176,
396
- "learning_rate": 8.859145952836202e-05,
397
- "loss": 9.714,
398
  "step": 1060
399
  },
400
  {
401
- "epoch": 2.0650095602294454,
402
- "grad_norm": 4.372852325439453,
403
- "learning_rate": 8.816656044189505e-05,
404
- "loss": 9.5508,
405
  "step": 1080
406
  },
407
  {
408
- "epoch": 2.1032504780114722,
409
- "grad_norm": 4.381687641143799,
410
- "learning_rate": 8.774166135542809e-05,
411
- "loss": 9.6096,
412
  "step": 1100
413
  },
414
  {
415
- "epoch": 2.141491395793499,
416
- "grad_norm": 4.5865631103515625,
417
- "learning_rate": 8.731676226896113e-05,
418
- "loss": 9.5077,
419
  "step": 1120
420
  },
421
  {
422
- "epoch": 2.179732313575526,
423
- "grad_norm": 4.363910675048828,
424
- "learning_rate": 8.689186318249416e-05,
425
- "loss": 9.5044,
426
  "step": 1140
427
  },
428
  {
429
- "epoch": 2.2179732313575524,
430
- "grad_norm": 4.577084541320801,
431
- "learning_rate": 8.646696409602721e-05,
432
- "loss": 9.4205,
 
 
 
 
 
 
 
 
 
433
  "step": 1160
434
  },
435
  {
436
- "epoch": 2.2562141491395793,
437
- "grad_norm": 4.576254367828369,
438
- "learning_rate": 8.604206500956024e-05,
439
- "loss": 9.4317,
440
  "step": 1180
441
  },
442
  {
443
- "epoch": 2.294455066921606,
444
- "grad_norm": 4.4399847984313965,
445
- "learning_rate": 8.561716592309326e-05,
446
- "loss": 9.3607,
447
  "step": 1200
448
  },
449
  {
450
- "epoch": 2.332695984703633,
451
- "grad_norm": 4.595015525817871,
452
- "learning_rate": 8.51922668366263e-05,
453
- "loss": 9.2533,
454
  "step": 1220
455
  },
456
  {
457
- "epoch": 2.3709369024856595,
458
- "grad_norm": 4.900874614715576,
459
- "learning_rate": 8.476736775015934e-05,
460
- "loss": 9.3384,
461
  "step": 1240
462
  },
463
  {
464
- "epoch": 2.4091778202676863,
465
- "grad_norm": 4.594742774963379,
466
- "learning_rate": 8.434246866369238e-05,
467
- "loss": 9.293,
468
  "step": 1260
469
  },
470
  {
471
- "epoch": 2.447418738049713,
472
- "grad_norm": 4.587216377258301,
473
- "learning_rate": 8.391756957722541e-05,
474
- "loss": 9.1986,
475
  "step": 1280
476
  },
477
  {
478
- "epoch": 2.48565965583174,
479
- "grad_norm": 4.735275745391846,
480
- "learning_rate": 8.349267049075845e-05,
481
- "loss": 9.1358,
482
  "step": 1300
483
  },
484
  {
485
- "epoch": 2.5239005736137665,
486
- "grad_norm": 4.627840995788574,
487
- "learning_rate": 8.306777140429149e-05,
488
- "loss": 9.1284,
489
  "step": 1320
490
  },
491
  {
492
- "epoch": 2.5621414913957934,
493
- "grad_norm": 4.658718585968018,
494
- "learning_rate": 8.264287231782451e-05,
495
- "loss": 9.0949,
496
  "step": 1340
497
  },
498
  {
499
- "epoch": 2.6003824091778203,
500
- "grad_norm": 4.875549793243408,
501
- "learning_rate": 8.221797323135755e-05,
502
- "loss": 9.0312,
503
  "step": 1360
504
  },
505
  {
506
- "epoch": 2.638623326959847,
507
- "grad_norm": 4.683437347412109,
508
- "learning_rate": 8.179307414489059e-05,
509
- "loss": 8.9949,
510
  "step": 1380
511
  },
512
  {
513
- "epoch": 2.676864244741874,
514
- "grad_norm": 4.861114025115967,
515
- "learning_rate": 8.136817505842362e-05,
516
- "loss": 8.9705,
517
  "step": 1400
518
  },
519
  {
520
- "epoch": 2.7151051625239004,
521
- "grad_norm": 4.727562427520752,
522
- "learning_rate": 8.094327597195667e-05,
523
- "loss": 8.9483,
524
  "step": 1420
525
  },
526
  {
527
- "epoch": 2.7533460803059273,
528
- "grad_norm": 4.8202948570251465,
529
- "learning_rate": 8.05183768854897e-05,
530
- "loss": 8.9254,
531
  "step": 1440
532
  },
533
  {
534
- "epoch": 2.791586998087954,
535
- "grad_norm": 4.926464557647705,
536
- "learning_rate": 8.009347779902273e-05,
537
- "loss": 8.8768,
538
  "step": 1460
539
  },
540
  {
541
- "epoch": 2.8298279158699806,
542
- "grad_norm": 4.7756028175354,
543
- "learning_rate": 7.966857871255578e-05,
544
- "loss": 8.8044,
545
  "step": 1480
546
  },
547
  {
548
- "epoch": 2.8680688336520075,
549
- "grad_norm": 4.888403415679932,
550
- "learning_rate": 7.92436796260888e-05,
551
- "loss": 8.7788,
552
  "step": 1500
553
  },
554
  {
555
- "epoch": 2.9063097514340344,
556
- "grad_norm": 4.943230152130127,
557
- "learning_rate": 7.881878053962184e-05,
558
- "loss": 8.8032,
559
  "step": 1520
560
  },
561
  {
562
- "epoch": 2.9445506692160612,
563
- "grad_norm": 5.011119842529297,
564
- "learning_rate": 7.839388145315488e-05,
565
- "loss": 8.7507,
566
  "step": 1540
567
  },
568
  {
569
- "epoch": 2.982791586998088,
570
- "grad_norm": 5.068637847900391,
571
- "learning_rate": 7.796898236668791e-05,
572
- "loss": 8.7136,
573
  "step": 1560
574
  },
575
  {
576
- "epoch": 3.0,
577
- "eval_accuracy": 0.52418432559704,
578
- "eval_loss": 7.882061958312988,
579
- "eval_runtime": 418.5795,
580
- "eval_samples_per_second": 35.513,
581
- "eval_steps_per_second": 35.513,
582
- "step": 1569
583
- },
584
- {
585
- "epoch": 3.0210325047801145,
586
- "grad_norm": 4.895749092102051,
587
- "learning_rate": 7.754408328022095e-05,
588
- "loss": 8.6104,
589
  "step": 1580
590
  },
591
  {
592
- "epoch": 3.0592734225621414,
593
- "grad_norm": 5.138400077819824,
594
- "learning_rate": 7.711918419375399e-05,
595
- "loss": 8.6136,
596
  "step": 1600
597
  },
598
  {
599
- "epoch": 3.0975143403441683,
600
- "grad_norm": 5.270049571990967,
601
- "learning_rate": 7.669428510728702e-05,
602
- "loss": 8.5866,
603
  "step": 1620
604
  },
605
  {
606
- "epoch": 3.135755258126195,
607
- "grad_norm": 5.178355693817139,
608
- "learning_rate": 7.626938602082006e-05,
609
- "loss": 8.492,
610
  "step": 1640
611
  },
612
  {
613
- "epoch": 3.173996175908222,
614
- "grad_norm": 5.312692165374756,
615
- "learning_rate": 7.58444869343531e-05,
616
- "loss": 8.4897,
617
  "step": 1660
618
  },
619
  {
620
- "epoch": 3.2122370936902485,
621
- "grad_norm": 5.227985382080078,
622
- "learning_rate": 7.541958784788614e-05,
623
- "loss": 8.4441,
624
  "step": 1680
625
  },
626
  {
627
- "epoch": 3.2504780114722753,
628
- "grad_norm": 5.042078495025635,
629
- "learning_rate": 7.499468876141916e-05,
630
- "loss": 8.4722,
631
  "step": 1700
632
  },
633
  {
634
- "epoch": 3.288718929254302,
635
- "grad_norm": 5.250526428222656,
636
- "learning_rate": 7.45697896749522e-05,
637
- "loss": 8.3105,
638
  "step": 1720
639
  },
640
  {
641
- "epoch": 3.3269598470363286,
642
- "grad_norm": 5.22187614440918,
643
- "learning_rate": 7.414489058848524e-05,
644
- "loss": 8.3308,
 
 
 
 
 
 
 
 
 
645
  "step": 1740
646
  },
647
  {
648
- "epoch": 3.3652007648183555,
649
- "grad_norm": 5.491254806518555,
650
- "learning_rate": 7.371999150201827e-05,
651
- "loss": 8.2969,
652
  "step": 1760
653
  },
654
  {
655
- "epoch": 3.4034416826003824,
656
- "grad_norm": 5.482990741729736,
657
- "learning_rate": 7.329509241555131e-05,
658
- "loss": 8.2593,
659
  "step": 1780
660
  },
661
  {
662
- "epoch": 3.4416826003824093,
663
- "grad_norm": 5.359766960144043,
664
- "learning_rate": 7.287019332908435e-05,
665
- "loss": 8.3087,
666
  "step": 1800
667
  },
668
  {
669
- "epoch": 3.479923518164436,
670
- "grad_norm": 5.788363456726074,
671
- "learning_rate": 7.244529424261737e-05,
672
- "loss": 8.2664,
673
  "step": 1820
674
  },
675
  {
676
- "epoch": 3.5181644359464626,
677
- "grad_norm": 5.335551738739014,
678
- "learning_rate": 7.202039515615043e-05,
679
- "loss": 8.2543,
680
  "step": 1840
681
  },
682
  {
683
- "epoch": 3.5564053537284894,
684
- "grad_norm": 5.465627193450928,
685
- "learning_rate": 7.159549606968345e-05,
686
- "loss": 8.2604,
687
  "step": 1860
688
  },
689
  {
690
- "epoch": 3.5946462715105163,
691
- "grad_norm": 5.594823837280273,
692
- "learning_rate": 7.117059698321648e-05,
693
- "loss": 8.1616,
694
  "step": 1880
695
  },
696
  {
697
- "epoch": 3.632887189292543,
698
- "grad_norm": 5.58858060836792,
699
- "learning_rate": 7.074569789674953e-05,
700
- "loss": 8.1582,
701
  "step": 1900
702
  },
703
  {
704
- "epoch": 3.67112810707457,
705
- "grad_norm": 5.514508247375488,
706
- "learning_rate": 7.032079881028256e-05,
707
- "loss": 8.1061,
708
  "step": 1920
709
  },
710
  {
711
- "epoch": 3.7093690248565965,
712
- "grad_norm": 5.644900321960449,
713
- "learning_rate": 6.98958997238156e-05,
714
- "loss": 8.0912,
715
  "step": 1940
716
  },
717
  {
718
- "epoch": 3.7476099426386233,
719
- "grad_norm": 5.701168060302734,
720
- "learning_rate": 6.947100063734864e-05,
721
- "loss": 7.9596,
722
  "step": 1960
723
  },
724
  {
725
- "epoch": 3.78585086042065,
726
- "grad_norm": 5.880733013153076,
727
- "learning_rate": 6.904610155088167e-05,
728
- "loss": 8.0403,
729
  "step": 1980
730
  },
731
  {
732
- "epoch": 3.8240917782026767,
733
- "grad_norm": 5.638689994812012,
734
- "learning_rate": 6.86212024644147e-05,
735
- "loss": 7.9666,
736
  "step": 2000
737
  },
738
  {
739
- "epoch": 3.8623326959847035,
740
- "grad_norm": 6.002101421356201,
741
- "learning_rate": 6.819630337794775e-05,
742
- "loss": 7.9633,
743
  "step": 2020
744
  },
745
  {
746
- "epoch": 3.9005736137667304,
747
- "grad_norm": 5.628067493438721,
748
- "learning_rate": 6.777140429148077e-05,
749
- "loss": 7.8817,
750
  "step": 2040
751
  },
752
  {
753
- "epoch": 3.9388145315487573,
754
- "grad_norm": 6.128510475158691,
755
- "learning_rate": 6.734650520501381e-05,
756
- "loss": 7.9118,
757
  "step": 2060
758
  },
759
  {
760
- "epoch": 3.977055449330784,
761
- "grad_norm": 5.620929718017578,
762
- "learning_rate": 6.692160611854685e-05,
763
- "loss": 7.848,
764
  "step": 2080
765
  },
766
  {
767
- "epoch": 4.0,
768
- "eval_accuracy": 0.6143962327615203,
769
- "eval_loss": 6.945113658905029,
770
- "eval_runtime": 367.1966,
771
- "eval_samples_per_second": 40.482,
772
- "eval_steps_per_second": 40.482,
773
- "step": 2092
774
- },
775
- {
776
- "epoch": 4.015296367112811,
777
- "grad_norm": 5.820804595947266,
778
- "learning_rate": 6.649670703207989e-05,
779
- "loss": 7.8607,
780
  "step": 2100
781
  },
782
  {
783
- "epoch": 4.053537284894838,
784
- "grad_norm": 5.6448493003845215,
785
- "learning_rate": 6.607180794561292e-05,
786
- "loss": 7.7072,
787
  "step": 2120
788
  },
789
  {
790
- "epoch": 4.091778202676864,
791
- "grad_norm": 6.283373832702637,
792
- "learning_rate": 6.564690885914596e-05,
793
- "loss": 7.772,
794
  "step": 2140
795
  },
796
  {
797
- "epoch": 4.130019120458891,
798
- "grad_norm": 6.125846862792969,
799
- "learning_rate": 6.5222009772679e-05,
800
- "loss": 7.7211,
801
  "step": 2160
802
  },
803
  {
804
- "epoch": 4.168260038240918,
805
- "grad_norm": 5.701002597808838,
806
- "learning_rate": 6.479711068621202e-05,
807
- "loss": 7.6563,
808
  "step": 2180
809
  },
810
  {
811
- "epoch": 4.2065009560229445,
812
- "grad_norm": 5.910340785980225,
813
- "learning_rate": 6.437221159974506e-05,
814
- "loss": 7.711,
815
  "step": 2200
816
  },
817
  {
818
- "epoch": 4.244741873804971,
819
- "grad_norm": 5.8003082275390625,
820
- "learning_rate": 6.39473125132781e-05,
821
- "loss": 7.7582,
822
  "step": 2220
823
  },
824
  {
825
- "epoch": 4.282982791586998,
826
- "grad_norm": 5.95621395111084,
827
- "learning_rate": 6.352241342681113e-05,
828
- "loss": 7.6215,
829
  "step": 2240
830
  },
831
  {
832
- "epoch": 4.321223709369025,
833
- "grad_norm": 5.836912155151367,
834
- "learning_rate": 6.309751434034417e-05,
835
- "loss": 7.5932,
836
  "step": 2260
837
  },
838
  {
839
- "epoch": 4.359464627151052,
840
- "grad_norm": 6.156320095062256,
841
- "learning_rate": 6.267261525387721e-05,
842
- "loss": 7.5122,
843
  "step": 2280
844
  },
845
  {
846
- "epoch": 4.397705544933078,
847
- "grad_norm": 5.937085151672363,
848
- "learning_rate": 6.224771616741024e-05,
849
- "loss": 7.5488,
 
 
 
 
 
 
 
 
 
850
  "step": 2300
851
  },
852
  {
853
- "epoch": 4.435946462715105,
854
- "grad_norm": 5.949016571044922,
855
- "learning_rate": 6.182281708094328e-05,
856
- "loss": 7.5972,
857
  "step": 2320
858
  },
859
  {
860
- "epoch": 4.474187380497132,
861
- "grad_norm": 6.26347541809082,
862
- "learning_rate": 6.139791799447631e-05,
863
- "loss": 7.4327,
864
  "step": 2340
865
  },
866
  {
867
- "epoch": 4.512428298279159,
868
- "grad_norm": 6.376476287841797,
869
- "learning_rate": 6.097301890800935e-05,
870
- "loss": 7.555,
871
  "step": 2360
872
  },
873
  {
874
- "epoch": 4.550669216061186,
875
- "grad_norm": 6.2988200187683105,
876
- "learning_rate": 6.054811982154238e-05,
877
- "loss": 7.5463,
878
  "step": 2380
879
  },
880
  {
881
- "epoch": 4.588910133843212,
882
- "grad_norm": 5.916903972625732,
883
- "learning_rate": 6.012322073507543e-05,
884
- "loss": 7.4637,
885
  "step": 2400
886
  },
887
  {
888
- "epoch": 4.627151051625239,
889
- "grad_norm": 5.896063327789307,
890
- "learning_rate": 5.969832164860846e-05,
891
- "loss": 7.3857,
892
  "step": 2420
893
  },
894
  {
895
- "epoch": 4.665391969407266,
896
- "grad_norm": 6.14431619644165,
897
- "learning_rate": 5.927342256214149e-05,
898
- "loss": 7.4363,
899
  "step": 2440
900
  },
901
  {
902
- "epoch": 4.7036328871892925,
903
- "grad_norm": 6.2994256019592285,
904
- "learning_rate": 5.8848523475674533e-05,
905
- "loss": 7.406,
906
  "step": 2460
907
  },
908
  {
909
- "epoch": 4.741873804971319,
910
- "grad_norm": 6.134793758392334,
911
- "learning_rate": 5.8423624389207567e-05,
912
- "loss": 7.338,
913
  "step": 2480
914
  },
915
  {
916
- "epoch": 4.780114722753346,
917
- "grad_norm": 6.245213031768799,
918
- "learning_rate": 5.79987253027406e-05,
919
- "loss": 7.3912,
920
  "step": 2500
921
  },
922
  {
923
- "epoch": 4.818355640535373,
924
- "grad_norm": 6.118636131286621,
925
- "learning_rate": 5.757382621627364e-05,
926
- "loss": 7.3548,
927
  "step": 2520
928
  },
929
  {
930
- "epoch": 4.8565965583174,
931
- "grad_norm": 6.391002178192139,
932
- "learning_rate": 5.714892712980667e-05,
933
- "loss": 7.3119,
934
  "step": 2540
935
  },
936
  {
937
- "epoch": 4.894837476099426,
938
- "grad_norm": 6.539446830749512,
939
- "learning_rate": 5.6724028043339705e-05,
940
- "loss": 7.2119,
941
  "step": 2560
942
  },
943
  {
944
- "epoch": 4.933078393881453,
945
- "grad_norm": 6.162653923034668,
946
- "learning_rate": 5.6299128956872745e-05,
947
- "loss": 7.2505,
948
  "step": 2580
949
  },
950
  {
951
- "epoch": 4.97131931166348,
952
- "grad_norm": 6.580591678619385,
953
- "learning_rate": 5.587422987040578e-05,
954
- "loss": 7.1912,
955
  "step": 2600
956
  },
957
  {
958
- "epoch": 5.0,
959
- "eval_accuracy": 0.6821392532795156,
960
- "eval_loss": 6.262951850891113,
961
- "eval_runtime": 76.4531,
962
- "eval_samples_per_second": 194.433,
963
- "eval_steps_per_second": 194.433,
964
- "step": 2615
965
- },
966
- {
967
- "epoch": 5.009560229445507,
968
- "grad_norm": 6.838705062866211,
969
- "learning_rate": 5.544933078393881e-05,
970
- "loss": 7.1863,
971
  "step": 2620
972
  },
973
  {
974
- "epoch": 5.047801147227533,
975
- "grad_norm": 6.260281562805176,
976
- "learning_rate": 5.502443169747186e-05,
977
- "loss": 7.1259,
978
  "step": 2640
979
  },
980
  {
981
- "epoch": 5.08604206500956,
982
- "grad_norm": 6.463006496429443,
983
- "learning_rate": 5.459953261100489e-05,
984
- "loss": 7.1559,
985
  "step": 2660
986
  },
987
  {
988
- "epoch": 5.124282982791587,
989
- "grad_norm": 6.499185562133789,
990
- "learning_rate": 5.4174633524537924e-05,
991
- "loss": 7.1318,
992
  "step": 2680
993
  },
994
  {
995
- "epoch": 5.162523900573614,
996
- "grad_norm": 6.508650302886963,
997
- "learning_rate": 5.3749734438070964e-05,
998
- "loss": 7.0993,
999
  "step": 2700
1000
  },
1001
  {
1002
- "epoch": 5.2007648183556405,
1003
- "grad_norm": 6.573218822479248,
1004
- "learning_rate": 5.3324835351604e-05,
1005
- "loss": 7.0823,
1006
  "step": 2720
1007
  },
1008
  {
1009
- "epoch": 5.239005736137667,
1010
- "grad_norm": 6.863697052001953,
1011
- "learning_rate": 5.289993626513703e-05,
1012
- "loss": 7.0839,
1013
  "step": 2740
1014
  },
1015
  {
1016
- "epoch": 5.277246653919694,
1017
- "grad_norm": 6.305070877075195,
1018
- "learning_rate": 5.247503717867007e-05,
1019
- "loss": 7.0723,
1020
  "step": 2760
1021
  },
1022
  {
1023
- "epoch": 5.315487571701721,
1024
- "grad_norm": 6.715279579162598,
1025
- "learning_rate": 5.20501380922031e-05,
1026
- "loss": 6.9592,
1027
  "step": 2780
1028
  },
1029
  {
1030
- "epoch": 5.353728489483748,
1031
- "grad_norm": 6.625701904296875,
1032
- "learning_rate": 5.1625239005736136e-05,
1033
- "loss": 7.0275,
1034
  "step": 2800
1035
  },
1036
  {
1037
- "epoch": 5.3919694072657744,
1038
- "grad_norm": 6.717496871948242,
1039
- "learning_rate": 5.120033991926918e-05,
1040
- "loss": 6.9146,
1041
  "step": 2820
1042
  },
1043
  {
1044
- "epoch": 5.430210325047801,
1045
- "grad_norm": 6.500243186950684,
1046
- "learning_rate": 5.0775440832802216e-05,
1047
- "loss": 6.9984,
1048
  "step": 2840
1049
  },
1050
  {
1051
- "epoch": 5.468451242829828,
1052
- "grad_norm": 6.41347074508667,
1053
- "learning_rate": 5.035054174633524e-05,
1054
- "loss": 6.9367,
1055
  "step": 2860
1056
  },
1057
  {
1058
- "epoch": 5.506692160611855,
1059
- "grad_norm": 6.83429479598999,
1060
- "learning_rate": 4.992564265986828e-05,
1061
- "loss": 6.9997,
 
 
 
 
 
 
 
 
 
1062
  "step": 2880
1063
  },
1064
  {
1065
- "epoch": 5.544933078393882,
1066
- "grad_norm": 6.565597057342529,
1067
- "learning_rate": 4.950074357340132e-05,
1068
- "loss": 6.9204,
1069
  "step": 2900
1070
  },
1071
  {
1072
- "epoch": 5.583173996175908,
1073
- "grad_norm": 6.9456095695495605,
1074
- "learning_rate": 4.907584448693436e-05,
1075
- "loss": 6.8926,
1076
  "step": 2920
1077
  },
1078
  {
1079
- "epoch": 5.621414913957935,
1080
- "grad_norm": 7.052099704742432,
1081
- "learning_rate": 4.865094540046739e-05,
1082
- "loss": 6.8993,
1083
  "step": 2940
1084
  },
1085
  {
1086
- "epoch": 5.659655831739962,
1087
- "grad_norm": 7.128490924835205,
1088
- "learning_rate": 4.822604631400043e-05,
1089
- "loss": 6.8474,
1090
  "step": 2960
1091
  },
1092
  {
1093
- "epoch": 5.6978967495219885,
1094
- "grad_norm": 6.792144298553467,
1095
- "learning_rate": 4.780114722753346e-05,
1096
- "loss": 6.8509,
1097
  "step": 2980
1098
  },
1099
  {
1100
- "epoch": 5.736137667304015,
1101
- "grad_norm": 6.853285312652588,
1102
- "learning_rate": 4.73762481410665e-05,
1103
- "loss": 6.9141,
1104
  "step": 3000
1105
  },
1106
  {
1107
- "epoch": 5.774378585086042,
1108
- "grad_norm": 7.153258800506592,
1109
- "learning_rate": 4.695134905459953e-05,
1110
- "loss": 6.7391,
1111
  "step": 3020
1112
  },
1113
  {
1114
- "epoch": 5.812619502868069,
1115
- "grad_norm": 6.9271321296691895,
1116
- "learning_rate": 4.6526449968132566e-05,
1117
- "loss": 6.7554,
1118
  "step": 3040
1119
  },
1120
  {
1121
- "epoch": 5.850860420650095,
1122
- "grad_norm": 7.218133926391602,
1123
- "learning_rate": 4.6101550881665606e-05,
1124
- "loss": 6.8172,
1125
  "step": 3060
1126
  },
1127
  {
1128
- "epoch": 5.8891013384321225,
1129
- "grad_norm": 7.0558695793151855,
1130
- "learning_rate": 4.5676651795198646e-05,
1131
- "loss": 6.8442,
1132
  "step": 3080
1133
  },
1134
  {
1135
- "epoch": 5.927342256214149,
1136
- "grad_norm": 6.762065887451172,
1137
- "learning_rate": 4.525175270873168e-05,
1138
- "loss": 6.696,
1139
  "step": 3100
1140
  },
1141
  {
1142
- "epoch": 5.965583173996176,
1143
- "grad_norm": 6.8173604011535645,
1144
- "learning_rate": 4.482685362226471e-05,
1145
- "loss": 6.6763,
1146
  "step": 3120
1147
  },
1148
  {
1149
- "epoch": 6.0,
1150
- "eval_accuracy": 0.7291624621594349,
1151
- "eval_loss": 5.7182440757751465,
1152
- "eval_runtime": 444.003,
1153
- "eval_samples_per_second": 33.48,
1154
- "eval_steps_per_second": 33.48,
1155
- "step": 3138
1156
- },
1157
- {
1158
- "epoch": 6.003824091778203,
1159
- "grad_norm": 7.1014723777771,
1160
- "learning_rate": 4.440195453579775e-05,
1161
- "loss": 6.6927,
1162
  "step": 3140
1163
  },
1164
  {
1165
- "epoch": 6.042065009560229,
1166
- "grad_norm": 6.958450794219971,
1167
- "learning_rate": 4.3977055449330785e-05,
1168
- "loss": 6.6538,
1169
  "step": 3160
1170
  },
1171
  {
1172
- "epoch": 6.080305927342256,
1173
- "grad_norm": 6.920003890991211,
1174
- "learning_rate": 4.3552156362863825e-05,
1175
- "loss": 6.5479,
1176
  "step": 3180
1177
  },
1178
  {
1179
- "epoch": 6.118546845124283,
1180
- "grad_norm": 7.053244113922119,
1181
- "learning_rate": 4.312725727639686e-05,
1182
- "loss": 6.5668,
1183
  "step": 3200
1184
  },
1185
  {
1186
- "epoch": 6.15678776290631,
1187
- "grad_norm": 6.9157185554504395,
1188
- "learning_rate": 4.270235818992989e-05,
1189
- "loss": 6.6722,
1190
  "step": 3220
1191
  },
1192
  {
1193
- "epoch": 6.195028680688337,
1194
- "grad_norm": 7.149935722351074,
1195
- "learning_rate": 4.227745910346293e-05,
1196
- "loss": 6.6397,
1197
  "step": 3240
1198
  },
1199
  {
1200
- "epoch": 6.233269598470363,
1201
- "grad_norm": 7.318164825439453,
1202
- "learning_rate": 4.185256001699597e-05,
1203
- "loss": 6.6041,
1204
  "step": 3260
1205
  },
1206
  {
1207
- "epoch": 6.27151051625239,
1208
- "grad_norm": 7.044018268585205,
1209
- "learning_rate": 4.1427660930529e-05,
1210
- "loss": 6.5492,
1211
  "step": 3280
1212
  },
1213
  {
1214
- "epoch": 6.309751434034417,
1215
- "grad_norm": 7.045164585113525,
1216
- "learning_rate": 4.1002761844062037e-05,
1217
- "loss": 6.5679,
1218
  "step": 3300
1219
  },
1220
  {
1221
- "epoch": 6.347992351816444,
1222
- "grad_norm": 7.092489242553711,
1223
- "learning_rate": 4.0577862757595076e-05,
1224
- "loss": 6.5695,
1225
  "step": 3320
1226
  },
1227
  {
1228
- "epoch": 6.3862332695984705,
1229
- "grad_norm": 6.940147399902344,
1230
- "learning_rate": 4.015296367112811e-05,
1231
- "loss": 6.4842,
1232
  "step": 3340
1233
  },
1234
  {
1235
- "epoch": 6.424474187380497,
1236
- "grad_norm": 7.10172176361084,
1237
- "learning_rate": 3.972806458466114e-05,
1238
- "loss": 6.5317,
1239
  "step": 3360
1240
  },
1241
  {
1242
- "epoch": 6.462715105162524,
1243
- "grad_norm": 7.129051208496094,
1244
- "learning_rate": 3.930316549819418e-05,
1245
- "loss": 6.4702,
1246
  "step": 3380
1247
  },
1248
  {
1249
- "epoch": 6.500956022944551,
1250
- "grad_norm": 7.501070499420166,
1251
- "learning_rate": 3.8878266411727215e-05,
1252
- "loss": 6.3999,
1253
  "step": 3400
1254
  },
1255
  {
1256
- "epoch": 6.539196940726577,
1257
- "grad_norm": 7.325244426727295,
1258
- "learning_rate": 3.8453367325260255e-05,
1259
- "loss": 6.4932,
1260
  "step": 3420
1261
  },
1262
  {
1263
- "epoch": 6.577437858508604,
1264
- "grad_norm": 7.361093521118164,
1265
- "learning_rate": 3.802846823879329e-05,
1266
- "loss": 6.3927,
1267
  "step": 3440
1268
  },
1269
  {
1270
- "epoch": 6.615678776290631,
1271
- "grad_norm": 7.228673458099365,
1272
- "learning_rate": 3.760356915232632e-05,
1273
- "loss": 6.4861,
 
 
 
 
 
 
 
 
 
1274
  "step": 3460
1275
  },
1276
  {
1277
- "epoch": 6.653919694072657,
1278
- "grad_norm": 7.602611064910889,
1279
- "learning_rate": 3.717867006585936e-05,
1280
- "loss": 6.4623,
1281
  "step": 3480
1282
  },
1283
  {
1284
- "epoch": 6.692160611854685,
1285
- "grad_norm": 7.901960372924805,
1286
- "learning_rate": 3.6753770979392394e-05,
1287
- "loss": 6.4282,
1288
  "step": 3500
1289
  },
1290
  {
1291
- "epoch": 6.730401529636711,
1292
- "grad_norm": 7.1125383377075195,
1293
- "learning_rate": 3.6328871892925434e-05,
1294
- "loss": 6.3799,
1295
  "step": 3520
1296
  },
1297
  {
1298
- "epoch": 6.768642447418738,
1299
- "grad_norm": 7.1385884284973145,
1300
- "learning_rate": 3.590397280645847e-05,
1301
- "loss": 6.3707,
1302
  "step": 3540
1303
  },
1304
  {
1305
- "epoch": 6.806883365200765,
1306
- "grad_norm": 7.548192977905273,
1307
- "learning_rate": 3.54790737199915e-05,
1308
- "loss": 6.4388,
1309
  "step": 3560
1310
  },
1311
  {
1312
- "epoch": 6.845124282982791,
1313
- "grad_norm": 7.492359161376953,
1314
- "learning_rate": 3.505417463352454e-05,
1315
- "loss": 6.4223,
1316
  "step": 3580
1317
  },
1318
  {
1319
- "epoch": 6.8833652007648185,
1320
- "grad_norm": 7.575985431671143,
1321
- "learning_rate": 3.462927554705758e-05,
1322
- "loss": 6.3552,
1323
  "step": 3600
1324
  },
1325
  {
1326
- "epoch": 6.921606118546845,
1327
- "grad_norm": 7.351112365722656,
1328
- "learning_rate": 3.4204376460590606e-05,
1329
- "loss": 6.3379,
1330
  "step": 3620
1331
  },
1332
  {
1333
- "epoch": 6.959847036328872,
1334
- "grad_norm": 7.33430290222168,
1335
- "learning_rate": 3.3779477374123646e-05,
1336
- "loss": 6.3429,
1337
  "step": 3640
1338
  },
1339
  {
1340
- "epoch": 6.998087954110899,
1341
- "grad_norm": 7.511825084686279,
1342
- "learning_rate": 3.3354578287656686e-05,
1343
- "loss": 6.3112,
1344
  "step": 3660
1345
  },
1346
  {
1347
- "epoch": 7.0,
1348
- "eval_accuracy": 0.7632021527077026,
1349
- "eval_loss": 5.265278339385986,
1350
- "eval_runtime": 484.395,
1351
- "eval_samples_per_second": 30.688,
1352
- "eval_steps_per_second": 30.688,
1353
- "step": 3661
1354
- },
1355
- {
1356
- "epoch": 7.036328871892925,
1357
- "grad_norm": 7.424711227416992,
1358
- "learning_rate": 3.292967920118972e-05,
1359
- "loss": 6.1764,
1360
  "step": 3680
1361
  },
1362
  {
1363
- "epoch": 7.074569789674952,
1364
- "grad_norm": 7.648799896240234,
1365
- "learning_rate": 3.250478011472275e-05,
1366
- "loss": 6.2389,
1367
  "step": 3700
1368
  },
1369
  {
1370
- "epoch": 7.112810707456979,
1371
- "grad_norm": 7.4450483322143555,
1372
- "learning_rate": 3.207988102825579e-05,
1373
- "loss": 6.2506,
1374
  "step": 3720
1375
  },
1376
  {
1377
- "epoch": 7.151051625239006,
1378
- "grad_norm": 7.422061443328857,
1379
- "learning_rate": 3.1654981941788825e-05,
1380
- "loss": 6.2049,
1381
  "step": 3740
1382
  },
1383
  {
1384
- "epoch": 7.189292543021033,
1385
- "grad_norm": 7.345204830169678,
1386
- "learning_rate": 3.1230082855321864e-05,
1387
- "loss": 6.2906,
1388
  "step": 3760
1389
  },
1390
  {
1391
- "epoch": 7.227533460803059,
1392
- "grad_norm": 7.486473083496094,
1393
- "learning_rate": 3.08051837688549e-05,
1394
- "loss": 6.2644,
1395
  "step": 3780
1396
  },
1397
  {
1398
- "epoch": 7.265774378585086,
1399
- "grad_norm": 7.317290782928467,
1400
- "learning_rate": 3.0380284682387934e-05,
1401
- "loss": 6.2421,
1402
  "step": 3800
1403
  },
1404
  {
1405
- "epoch": 7.304015296367113,
1406
- "grad_norm": 7.4384002685546875,
1407
- "learning_rate": 2.995538559592097e-05,
1408
- "loss": 6.1406,
1409
  "step": 3820
1410
  },
1411
  {
1412
- "epoch": 7.342256214149139,
1413
- "grad_norm": 7.7606000900268555,
1414
- "learning_rate": 2.9530486509454007e-05,
1415
- "loss": 6.2031,
1416
  "step": 3840
1417
  },
1418
  {
1419
- "epoch": 7.3804971319311665,
1420
- "grad_norm": 7.305050373077393,
1421
- "learning_rate": 2.910558742298704e-05,
1422
- "loss": 6.127,
1423
  "step": 3860
1424
  },
1425
  {
1426
- "epoch": 7.418738049713193,
1427
- "grad_norm": 7.713500022888184,
1428
- "learning_rate": 2.868068833652008e-05,
1429
- "loss": 6.1474,
1430
  "step": 3880
1431
  },
1432
  {
1433
- "epoch": 7.45697896749522,
1434
- "grad_norm": 8.028603553771973,
1435
- "learning_rate": 2.8255789250053116e-05,
1436
- "loss": 6.1542,
1437
  "step": 3900
1438
  },
1439
  {
1440
- "epoch": 7.495219885277247,
1441
- "grad_norm": 7.4730329513549805,
1442
- "learning_rate": 2.783089016358615e-05,
1443
- "loss": 6.225,
1444
  "step": 3920
1445
  },
1446
  {
1447
- "epoch": 7.533460803059273,
1448
- "grad_norm": 7.52304220199585,
1449
- "learning_rate": 2.7405991077119186e-05,
1450
- "loss": 6.1674,
1451
  "step": 3940
1452
  },
1453
  {
1454
- "epoch": 7.5717017208413,
1455
- "grad_norm": 7.616427898406982,
1456
- "learning_rate": 2.6981091990652225e-05,
1457
- "loss": 6.1169,
1458
  "step": 3960
1459
  },
1460
  {
1461
- "epoch": 7.609942638623327,
1462
- "grad_norm": 7.784472465515137,
1463
- "learning_rate": 2.6556192904185255e-05,
1464
- "loss": 6.1041,
1465
  "step": 3980
1466
  },
1467
  {
1468
- "epoch": 7.648183556405353,
1469
- "grad_norm": 7.819777011871338,
1470
- "learning_rate": 2.6131293817718295e-05,
1471
- "loss": 6.1069,
1472
  "step": 4000
1473
  },
1474
  {
1475
- "epoch": 7.686424474187381,
1476
- "grad_norm": 7.889120101928711,
1477
- "learning_rate": 2.5706394731251328e-05,
1478
- "loss": 5.9985,
1479
  "step": 4020
1480
  },
1481
  {
1482
- "epoch": 7.724665391969407,
1483
- "grad_norm": 7.858097076416016,
1484
- "learning_rate": 2.5281495644784364e-05,
1485
- "loss": 6.0437,
 
 
 
 
 
 
 
 
 
1486
  "step": 4040
1487
  },
1488
  {
1489
- "epoch": 7.762906309751434,
1490
- "grad_norm": 7.739562511444092,
1491
- "learning_rate": 2.48565965583174e-05,
1492
- "loss": 6.1376,
1493
  "step": 4060
1494
  },
1495
  {
1496
- "epoch": 7.801147227533461,
1497
- "grad_norm": 7.778552532196045,
1498
- "learning_rate": 2.4431697471850437e-05,
1499
- "loss": 6.2084,
1500
  "step": 4080
1501
  },
1502
  {
1503
- "epoch": 7.839388145315487,
1504
- "grad_norm": 7.536991596221924,
1505
- "learning_rate": 2.4006798385383474e-05,
1506
- "loss": 6.0325,
1507
  "step": 4100
1508
  },
1509
  {
1510
- "epoch": 7.8776290630975145,
1511
- "grad_norm": 7.846856594085693,
1512
- "learning_rate": 2.3581899298916507e-05,
1513
- "loss": 6.098,
1514
  "step": 4120
1515
  },
1516
  {
1517
- "epoch": 7.915869980879541,
1518
- "grad_norm": 7.760807991027832,
1519
- "learning_rate": 2.3157000212449547e-05,
1520
- "loss": 5.9765,
1521
  "step": 4140
1522
  },
1523
  {
1524
- "epoch": 7.954110898661568,
1525
- "grad_norm": 7.827345371246338,
1526
- "learning_rate": 2.273210112598258e-05,
1527
- "loss": 5.9915,
1528
  "step": 4160
1529
  },
1530
  {
1531
- "epoch": 7.992351816443595,
1532
- "grad_norm": 8.129748344421387,
1533
- "learning_rate": 2.2307202039515616e-05,
1534
- "loss": 6.0255,
1535
  "step": 4180
1536
  },
1537
  {
1538
- "epoch": 8.0,
1539
- "eval_accuracy": 0.782643794147326,
1540
- "eval_loss": 4.966301918029785,
1541
- "eval_runtime": 260.149,
1542
- "eval_samples_per_second": 57.14,
1543
- "eval_steps_per_second": 57.14,
1544
- "step": 4184
1545
- },
1546
- {
1547
- "epoch": 8.030592734225621,
1548
- "grad_norm": 7.686340808868408,
1549
- "learning_rate": 2.1882302953048652e-05,
1550
- "loss": 6.0763,
1551
  "step": 4200
1552
  },
1553
  {
1554
- "epoch": 8.068833652007648,
1555
- "grad_norm": 7.666318893432617,
1556
- "learning_rate": 2.145740386658169e-05,
1557
- "loss": 5.868,
1558
  "step": 4220
1559
  },
1560
  {
1561
- "epoch": 8.107074569789676,
1562
- "grad_norm": 7.686400890350342,
1563
- "learning_rate": 2.1032504780114722e-05,
1564
- "loss": 5.8964,
1565
  "step": 4240
1566
  },
1567
  {
1568
- "epoch": 8.145315487571702,
1569
- "grad_norm": 7.418490886688232,
1570
- "learning_rate": 2.0607605693647762e-05,
1571
- "loss": 5.8408,
1572
  "step": 4260
1573
  },
1574
  {
1575
- "epoch": 8.183556405353729,
1576
- "grad_norm": 7.769067287445068,
1577
- "learning_rate": 2.0182706607180795e-05,
1578
- "loss": 5.9742,
1579
  "step": 4280
1580
  },
1581
  {
1582
- "epoch": 8.221797323135755,
1583
- "grad_norm": 7.915468215942383,
1584
- "learning_rate": 1.975780752071383e-05,
1585
- "loss": 5.913,
1586
  "step": 4300
1587
  },
1588
  {
1589
- "epoch": 8.260038240917781,
1590
- "grad_norm": 7.884761810302734,
1591
- "learning_rate": 1.9332908434246868e-05,
1592
- "loss": 5.8613,
1593
  "step": 4320
1594
  },
1595
  {
1596
- "epoch": 8.29827915869981,
1597
- "grad_norm": 7.765011787414551,
1598
- "learning_rate": 1.8908009347779904e-05,
1599
- "loss": 5.9791,
1600
  "step": 4340
1601
  },
1602
  {
1603
- "epoch": 8.336520076481836,
1604
- "grad_norm": 8.110984802246094,
1605
- "learning_rate": 1.8483110261312937e-05,
1606
- "loss": 5.9675,
1607
  "step": 4360
1608
  },
1609
  {
1610
- "epoch": 8.374760994263863,
1611
- "grad_norm": 8.114306449890137,
1612
- "learning_rate": 1.8058211174845974e-05,
1613
- "loss": 5.9804,
1614
  "step": 4380
1615
  },
1616
  {
1617
- "epoch": 8.413001912045889,
1618
- "grad_norm": 7.981202125549316,
1619
- "learning_rate": 1.763331208837901e-05,
1620
- "loss": 5.8832,
1621
  "step": 4400
1622
  },
1623
  {
1624
- "epoch": 8.451242829827915,
1625
- "grad_norm": 7.628136157989502,
1626
- "learning_rate": 1.7208413001912046e-05,
1627
- "loss": 5.9301,
1628
  "step": 4420
1629
  },
1630
  {
1631
- "epoch": 8.489483747609942,
1632
- "grad_norm": 7.863382816314697,
1633
- "learning_rate": 1.6783513915445083e-05,
1634
- "loss": 5.8983,
1635
  "step": 4440
1636
  },
1637
  {
1638
- "epoch": 8.52772466539197,
1639
- "grad_norm": 7.82211971282959,
1640
- "learning_rate": 1.635861482897812e-05,
1641
- "loss": 5.8938,
1642
  "step": 4460
1643
  },
1644
  {
1645
- "epoch": 8.565965583173996,
1646
- "grad_norm": 8.038976669311523,
1647
- "learning_rate": 1.5933715742511156e-05,
1648
- "loss": 5.8945,
1649
  "step": 4480
1650
  },
1651
  {
1652
- "epoch": 8.604206500956023,
1653
- "grad_norm": 7.884932518005371,
1654
- "learning_rate": 1.550881665604419e-05,
1655
- "loss": 5.8895,
1656
  "step": 4500
1657
  },
1658
  {
1659
- "epoch": 8.64244741873805,
1660
- "grad_norm": 7.975419521331787,
1661
- "learning_rate": 1.5083917569577227e-05,
1662
- "loss": 5.9617,
1663
  "step": 4520
1664
  },
1665
  {
1666
- "epoch": 8.680688336520076,
1667
- "grad_norm": 7.786068916320801,
1668
- "learning_rate": 1.4659018483110262e-05,
1669
- "loss": 5.8659,
1670
  "step": 4540
1671
  },
1672
  {
1673
- "epoch": 8.718929254302104,
1674
- "grad_norm": 8.130301475524902,
1675
- "learning_rate": 1.4234119396643298e-05,
1676
- "loss": 5.9116,
1677
  "step": 4560
1678
  },
1679
  {
1680
- "epoch": 8.75717017208413,
1681
- "grad_norm": 8.042682647705078,
1682
- "learning_rate": 1.3809220310176335e-05,
1683
- "loss": 5.8536,
1684
  "step": 4580
1685
  },
1686
  {
1687
- "epoch": 8.795411089866157,
1688
- "grad_norm": 8.327803611755371,
1689
- "learning_rate": 1.3384321223709371e-05,
1690
- "loss": 5.9241,
 
 
 
 
 
 
 
 
 
1691
  "step": 4600
1692
  },
1693
  {
1694
- "epoch": 8.833652007648183,
1695
- "grad_norm": 7.880401134490967,
1696
- "learning_rate": 1.2959422137242406e-05,
1697
- "loss": 5.864,
1698
  "step": 4620
1699
  },
1700
  {
1701
- "epoch": 8.87189292543021,
1702
- "grad_norm": 7.6825127601623535,
1703
- "learning_rate": 1.253452305077544e-05,
1704
- "loss": 5.9457,
1705
  "step": 4640
1706
  },
1707
  {
1708
- "epoch": 8.910133843212238,
1709
- "grad_norm": 7.971193313598633,
1710
- "learning_rate": 1.2109623964308479e-05,
1711
- "loss": 5.8329,
1712
  "step": 4660
1713
  },
1714
  {
1715
- "epoch": 8.948374760994264,
1716
- "grad_norm": 8.04354476928711,
1717
- "learning_rate": 1.1684724877841513e-05,
1718
- "loss": 5.8671,
1719
  "step": 4680
1720
  },
1721
  {
1722
- "epoch": 8.98661567877629,
1723
- "grad_norm": 7.942180633544922,
1724
- "learning_rate": 1.125982579137455e-05,
1725
- "loss": 5.8091,
1726
  "step": 4700
1727
  },
1728
  {
1729
- "epoch": 9.0,
1730
- "eval_accuracy": 0.7956945845946855,
1731
- "eval_loss": 4.778744220733643,
1732
- "eval_runtime": 531.1827,
1733
- "eval_samples_per_second": 27.985,
1734
- "eval_steps_per_second": 27.985,
1735
- "step": 4707
1736
- },
1737
- {
1738
- "epoch": 9.024856596558317,
1739
- "grad_norm": 7.77038049697876,
1740
- "learning_rate": 1.0834926704907584e-05,
1741
- "loss": 5.7978,
1742
  "step": 4720
1743
  },
1744
  {
1745
- "epoch": 9.063097514340344,
1746
- "grad_norm": 7.850288391113281,
1747
- "learning_rate": 1.0410027618440621e-05,
1748
- "loss": 5.7849,
1749
  "step": 4740
1750
  },
1751
  {
1752
- "epoch": 9.101338432122372,
1753
- "grad_norm": 8.032878875732422,
1754
- "learning_rate": 9.985128531973657e-06,
1755
- "loss": 5.7891,
1756
  "step": 4760
1757
  },
1758
  {
1759
- "epoch": 9.139579349904398,
1760
- "grad_norm": 7.886658668518066,
1761
- "learning_rate": 9.560229445506692e-06,
1762
- "loss": 5.781,
1763
  "step": 4780
1764
  },
1765
  {
1766
- "epoch": 9.177820267686425,
1767
- "grad_norm": 7.953343868255615,
1768
- "learning_rate": 9.135330359039729e-06,
1769
- "loss": 5.8584,
1770
  "step": 4800
1771
  },
1772
  {
1773
- "epoch": 9.216061185468451,
1774
- "grad_norm": 7.899537563323975,
1775
- "learning_rate": 8.710431272572763e-06,
1776
- "loss": 5.8192,
1777
  "step": 4820
1778
  },
1779
  {
1780
- "epoch": 9.254302103250478,
1781
- "grad_norm": 8.269824028015137,
1782
- "learning_rate": 8.2855321861058e-06,
1783
- "loss": 5.7122,
1784
  "step": 4840
1785
  },
1786
  {
1787
- "epoch": 9.292543021032504,
1788
- "grad_norm": 7.824770450592041,
1789
- "learning_rate": 7.860633099638836e-06,
1790
- "loss": 5.7634,
1791
  "step": 4860
1792
  },
1793
  {
1794
- "epoch": 9.330783938814532,
1795
- "grad_norm": 7.953860759735107,
1796
- "learning_rate": 7.435734013171872e-06,
1797
- "loss": 5.8083,
1798
  "step": 4880
1799
  },
1800
  {
1801
- "epoch": 9.369024856596559,
1802
- "grad_norm": 8.25514030456543,
1803
- "learning_rate": 7.010834926704908e-06,
1804
- "loss": 5.8012,
1805
  "step": 4900
1806
  },
1807
  {
1808
- "epoch": 9.407265774378585,
1809
- "grad_norm": 8.2761869430542,
1810
- "learning_rate": 6.585935840237943e-06,
1811
- "loss": 5.7938,
1812
  "step": 4920
1813
  },
1814
  {
1815
- "epoch": 9.445506692160611,
1816
- "grad_norm": 7.865163803100586,
1817
- "learning_rate": 6.161036753770979e-06,
1818
- "loss": 5.6735,
1819
  "step": 4940
1820
  },
1821
  {
1822
- "epoch": 9.483747609942638,
1823
- "grad_norm": 8.172937393188477,
1824
- "learning_rate": 5.736137667304015e-06,
1825
- "loss": 5.7914,
1826
  "step": 4960
1827
  },
1828
  {
1829
- "epoch": 9.521988527724666,
1830
- "grad_norm": 8.558911323547363,
1831
- "learning_rate": 5.311238580837051e-06,
1832
- "loss": 5.7702,
1833
  "step": 4980
1834
  },
1835
  {
1836
- "epoch": 9.560229445506693,
1837
- "grad_norm": 8.265515327453613,
1838
- "learning_rate": 4.886339494370088e-06,
1839
- "loss": 5.7283,
1840
  "step": 5000
1841
  },
1842
  {
1843
- "epoch": 9.598470363288719,
1844
- "grad_norm": 8.17795467376709,
1845
- "learning_rate": 4.461440407903123e-06,
1846
- "loss": 5.8007,
1847
  "step": 5020
1848
  },
1849
  {
1850
- "epoch": 9.636711281070745,
1851
- "grad_norm": 8.109586715698242,
1852
- "learning_rate": 4.036541321436159e-06,
1853
- "loss": 5.8121,
1854
  "step": 5040
1855
  },
1856
  {
1857
- "epoch": 9.674952198852772,
1858
- "grad_norm": 7.911646842956543,
1859
- "learning_rate": 3.6116422349691954e-06,
1860
- "loss": 5.789,
1861
  "step": 5060
1862
  },
1863
  {
1864
- "epoch": 9.7131931166348,
1865
- "grad_norm": 8.030941009521484,
1866
- "learning_rate": 3.186743148502231e-06,
1867
- "loss": 5.7266,
1868
  "step": 5080
1869
  },
1870
  {
1871
- "epoch": 9.751434034416826,
1872
- "grad_norm": 8.059958457946777,
1873
- "learning_rate": 2.7618440620352666e-06,
1874
- "loss": 5.761,
1875
  "step": 5100
1876
  },
1877
  {
1878
- "epoch": 9.789674952198853,
1879
- "grad_norm": 8.002403259277344,
1880
- "learning_rate": 2.3369449755683026e-06,
1881
- "loss": 5.7338,
1882
  "step": 5120
1883
  },
1884
  {
1885
- "epoch": 9.82791586998088,
1886
- "grad_norm": 8.306962966918945,
1887
- "learning_rate": 1.9120458891013386e-06,
1888
- "loss": 5.7088,
1889
  "step": 5140
1890
  },
1891
  {
1892
- "epoch": 9.866156787762906,
1893
- "grad_norm": 8.018095970153809,
1894
- "learning_rate": 1.4871468026343744e-06,
1895
- "loss": 5.6973,
1896
  "step": 5160
1897
  },
1898
  {
1899
- "epoch": 9.904397705544934,
1900
- "grad_norm": 8.168917655944824,
1901
- "learning_rate": 1.0622477161674104e-06,
1902
- "loss": 5.6422,
 
 
 
 
 
 
 
 
 
1903
  "step": 5180
1904
  },
1905
  {
1906
- "epoch": 9.94263862332696,
1907
- "grad_norm": 7.939206123352051,
1908
- "learning_rate": 6.373486297004462e-07,
1909
- "loss": 5.7399,
1910
  "step": 5200
1911
  },
1912
  {
1913
- "epoch": 9.980879541108987,
1914
- "grad_norm": 7.970940589904785,
1915
- "learning_rate": 2.1244954323348205e-07,
1916
- "loss": 5.7269,
1917
  "step": 5220
1918
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1919
  {
1920
  "epoch": 10.0,
1921
- "eval_accuracy": 0.8030272452068618,
1922
- "eval_loss": 4.700281620025635,
1923
- "eval_runtime": 552.5641,
1924
- "eval_samples_per_second": 26.902,
1925
- "eval_steps_per_second": 26.902,
1926
- "step": 5230
1927
  },
1928
  {
1929
  "epoch": 10.0,
1930
- "step": 5230,
1931
- "total_flos": 2.49073133395968e+18,
1932
- "train_loss": 7.888943860726876,
1933
- "train_runtime": 28748.4048,
1934
- "train_samples_per_second": 46.534,
1935
- "train_steps_per_second": 0.182
1936
  }
1937
  ],
1938
  "logging_steps": 20,
1939
- "max_steps": 5230,
1940
  "num_input_tokens_seen": 0,
1941
  "num_train_epochs": 10,
1942
  "save_steps": 500,
@@ -1952,7 +2134,7 @@
1952
  "attributes": {}
1953
  }
1954
  },
1955
- "total_flos": 2.49073133395968e+18,
1956
  "train_batch_size": 256,
1957
  "trial_name": null,
1958
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.9757901815736382,
3
+ "best_model_checkpoint": "/mnt/data4_HDD_14TB/yang/voxceleb-checkpoints/ecapa-tdnn/voxceleb1/pretrain/c512-aam-len3-bs256-lr5e-4/checkpoint-3450",
4
  "epoch": 10.0,
5
  "eval_steps": 500,
6
+ "global_step": 5750,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.034782608695652174,
13
+ "grad_norm": 6.155358791351318,
14
+ "learning_rate": 1.739130434782609e-05,
15
+ "loss": 13.2026,
16
  "step": 20
17
  },
18
  {
19
+ "epoch": 0.06956521739130435,
20
+ "grad_norm": 5.816741943359375,
21
+ "learning_rate": 3.478260869565218e-05,
22
+ "loss": 13.1252,
23
  "step": 40
24
  },
25
  {
26
+ "epoch": 0.10434782608695652,
27
+ "grad_norm": 5.273156642913818,
28
+ "learning_rate": 5.2173913043478256e-05,
29
+ "loss": 13.0001,
30
  "step": 60
31
  },
32
  {
33
+ "epoch": 0.1391304347826087,
34
+ "grad_norm": 4.86655330657959,
35
+ "learning_rate": 6.956521739130436e-05,
36
+ "loss": 12.8639,
37
  "step": 80
38
  },
39
  {
40
+ "epoch": 0.17391304347826086,
41
+ "grad_norm": 4.438321113586426,
42
+ "learning_rate": 8.695652173913044e-05,
43
+ "loss": 12.7376,
44
  "step": 100
45
  },
46
  {
47
+ "epoch": 0.20869565217391303,
48
+ "grad_norm": 4.164404392242432,
49
+ "learning_rate": 0.00010434782608695651,
50
+ "loss": 12.5722,
51
  "step": 120
52
  },
53
  {
54
+ "epoch": 0.24347826086956523,
55
+ "grad_norm": 3.858990430831909,
56
+ "learning_rate": 0.00012173913043478261,
57
+ "loss": 12.4229,
58
  "step": 140
59
  },
60
  {
61
+ "epoch": 0.2782608695652174,
62
+ "grad_norm": 3.6574394702911377,
63
+ "learning_rate": 0.0001391304347826087,
64
+ "loss": 12.2581,
65
  "step": 160
66
  },
67
  {
68
+ "epoch": 0.3130434782608696,
69
+ "grad_norm": 3.3787951469421387,
70
+ "learning_rate": 0.0001565217391304348,
71
+ "loss": 12.0753,
72
  "step": 180
73
  },
74
  {
75
+ "epoch": 0.34782608695652173,
76
+ "grad_norm": 3.323820114135742,
77
+ "learning_rate": 0.00017391304347826088,
78
+ "loss": 11.9261,
79
  "step": 200
80
  },
81
  {
82
+ "epoch": 0.3826086956521739,
83
+ "grad_norm": 3.247619152069092,
84
+ "learning_rate": 0.00019130434782608697,
85
+ "loss": 11.7417,
86
  "step": 220
87
  },
88
  {
89
+ "epoch": 0.41739130434782606,
90
+ "grad_norm": 3.2254152297973633,
91
+ "learning_rate": 0.00020869565217391303,
92
+ "loss": 11.5771,
93
  "step": 240
94
  },
95
  {
96
+ "epoch": 0.45217391304347826,
97
+ "grad_norm": 3.1803464889526367,
98
+ "learning_rate": 0.00022608695652173914,
99
+ "loss": 11.3969,
100
  "step": 260
101
  },
102
  {
103
+ "epoch": 0.48695652173913045,
104
+ "grad_norm": 3.41034197807312,
105
+ "learning_rate": 0.00024347826086956522,
106
+ "loss": 11.2684,
107
  "step": 280
108
  },
109
  {
110
+ "epoch": 0.5217391304347826,
111
+ "grad_norm": 3.246403217315674,
112
+ "learning_rate": 0.0002608695652173913,
113
+ "loss": 11.0744,
114
  "step": 300
115
  },
116
  {
117
+ "epoch": 0.5565217391304348,
118
+ "grad_norm": 3.202021360397339,
119
+ "learning_rate": 0.0002782608695652174,
120
+ "loss": 10.8929,
121
  "step": 320
122
  },
123
  {
124
+ "epoch": 0.591304347826087,
125
+ "grad_norm": 3.1231367588043213,
126
+ "learning_rate": 0.0002956521739130435,
127
+ "loss": 10.7468,
128
  "step": 340
129
  },
130
  {
131
+ "epoch": 0.6260869565217392,
132
+ "grad_norm": 3.1820390224456787,
133
+ "learning_rate": 0.0003130434782608696,
134
+ "loss": 10.606,
135
  "step": 360
136
  },
137
  {
138
+ "epoch": 0.6608695652173913,
139
+ "grad_norm": 3.2470555305480957,
140
+ "learning_rate": 0.0003304347826086956,
141
+ "loss": 10.4871,
142
  "step": 380
143
  },
144
  {
145
+ "epoch": 0.6956521739130435,
146
+ "grad_norm": 3.2452709674835205,
147
+ "learning_rate": 0.00034782608695652176,
148
+ "loss": 10.2836,
149
  "step": 400
150
  },
151
  {
152
+ "epoch": 0.7304347826086957,
153
+ "grad_norm": 3.203894853591919,
154
+ "learning_rate": 0.00036521739130434785,
155
+ "loss": 10.1154,
156
  "step": 420
157
  },
158
  {
159
+ "epoch": 0.7652173913043478,
160
+ "grad_norm": 3.269970178604126,
161
+ "learning_rate": 0.00038260869565217393,
162
+ "loss": 9.9283,
163
  "step": 440
164
  },
165
  {
166
+ "epoch": 0.8,
167
+ "grad_norm": 3.261357545852661,
168
+ "learning_rate": 0.0004,
169
+ "loss": 9.8674,
170
  "step": 460
171
  },
172
  {
173
+ "epoch": 0.8347826086956521,
174
+ "grad_norm": 3.393953323364258,
175
+ "learning_rate": 0.00041739130434782605,
176
+ "loss": 9.6224,
177
  "step": 480
178
  },
179
  {
180
+ "epoch": 0.8695652173913043,
181
+ "grad_norm": 3.321411609649658,
182
+ "learning_rate": 0.0004347826086956522,
183
+ "loss": 9.524,
184
  "step": 500
185
  },
186
  {
187
+ "epoch": 0.9043478260869565,
188
+ "grad_norm": 3.3886823654174805,
189
+ "learning_rate": 0.0004521739130434783,
190
+ "loss": 9.384,
191
  "step": 520
192
  },
193
  {
194
+ "epoch": 0.9391304347826087,
195
+ "grad_norm": 3.4735491275787354,
196
+ "learning_rate": 0.00046956521739130436,
197
+ "loss": 9.1767,
 
 
 
 
 
 
 
 
 
198
  "step": 540
199
  },
200
  {
201
+ "epoch": 0.9739130434782609,
202
+ "grad_norm": 3.416966676712036,
203
+ "learning_rate": 0.00048695652173913045,
204
+ "loss": 9.047,
205
  "step": 560
206
  },
207
  {
208
+ "epoch": 1.0,
209
+ "eval_accuracy": 0.43039677202420984,
210
+ "eval_loss": 8.366157531738281,
211
+ "eval_runtime": 42.3364,
212
+ "eval_samples_per_second": 35.123,
213
+ "eval_steps_per_second": 35.123,
214
+ "step": 575
215
+ },
216
+ {
217
+ "epoch": 1.008695652173913,
218
+ "grad_norm": 3.446899890899658,
219
+ "learning_rate": 0.0004995169082125604,
220
+ "loss": 8.8835,
221
  "step": 580
222
  },
223
  {
224
+ "epoch": 1.0434782608695652,
225
+ "grad_norm": 3.5842247009277344,
226
+ "learning_rate": 0.0004975845410628019,
227
+ "loss": 8.6436,
228
  "step": 600
229
  },
230
  {
231
+ "epoch": 1.0782608695652174,
232
+ "grad_norm": 3.5029306411743164,
233
+ "learning_rate": 0.0004956521739130435,
234
+ "loss": 8.4775,
235
  "step": 620
236
  },
237
  {
238
+ "epoch": 1.1130434782608696,
239
+ "grad_norm": 3.5451033115386963,
240
+ "learning_rate": 0.0004937198067632851,
241
+ "loss": 8.322,
242
  "step": 640
243
  },
244
  {
245
+ "epoch": 1.1478260869565218,
246
+ "grad_norm": 3.5502634048461914,
247
+ "learning_rate": 0.0004917874396135266,
248
+ "loss": 8.1264,
249
  "step": 660
250
  },
251
  {
252
+ "epoch": 1.182608695652174,
253
+ "grad_norm": 3.607395648956299,
254
+ "learning_rate": 0.0004898550724637681,
255
+ "loss": 7.9905,
256
  "step": 680
257
  },
258
  {
259
+ "epoch": 1.2173913043478262,
260
+ "grad_norm": 3.6438565254211426,
261
+ "learning_rate": 0.0004879227053140097,
262
+ "loss": 7.8252,
263
  "step": 700
264
  },
265
  {
266
+ "epoch": 1.2521739130434781,
267
+ "grad_norm": 3.656705141067505,
268
+ "learning_rate": 0.0004859903381642512,
269
+ "loss": 7.7737,
270
  "step": 720
271
  },
272
  {
273
+ "epoch": 1.2869565217391306,
274
+ "grad_norm": 3.7424328327178955,
275
+ "learning_rate": 0.0004840579710144928,
276
+ "loss": 7.5822,
277
  "step": 740
278
  },
279
  {
280
+ "epoch": 1.3217391304347825,
281
+ "grad_norm": 3.673156261444092,
282
+ "learning_rate": 0.0004821256038647343,
283
+ "loss": 7.4563,
284
  "step": 760
285
  },
286
  {
287
+ "epoch": 1.3565217391304347,
288
+ "grad_norm": 3.6774067878723145,
289
+ "learning_rate": 0.0004801932367149758,
290
+ "loss": 7.3379,
291
  "step": 780
292
  },
293
  {
294
+ "epoch": 1.391304347826087,
295
+ "grad_norm": 3.811283826828003,
296
+ "learning_rate": 0.0004782608695652174,
297
+ "loss": 7.1559,
298
  "step": 800
299
  },
300
  {
301
+ "epoch": 1.4260869565217391,
302
+ "grad_norm": 3.7899839878082275,
303
+ "learning_rate": 0.00047632850241545894,
304
+ "loss": 7.0834,
305
  "step": 820
306
  },
307
  {
308
+ "epoch": 1.4608695652173913,
309
+ "grad_norm": 3.583247423171997,
310
+ "learning_rate": 0.00047439613526570047,
311
+ "loss": 6.9172,
312
  "step": 840
313
  },
314
  {
315
+ "epoch": 1.4956521739130435,
316
+ "grad_norm": 3.8192331790924072,
317
+ "learning_rate": 0.00047246376811594206,
318
+ "loss": 6.7251,
319
  "step": 860
320
  },
321
  {
322
+ "epoch": 1.5304347826086957,
323
+ "grad_norm": 3.8098299503326416,
324
+ "learning_rate": 0.0004705314009661836,
325
+ "loss": 6.7871,
326
  "step": 880
327
  },
328
  {
329
+ "epoch": 1.5652173913043477,
330
+ "grad_norm": 3.7341325283050537,
331
+ "learning_rate": 0.0004685990338164252,
332
+ "loss": 6.6103,
333
  "step": 900
334
  },
335
  {
336
+ "epoch": 1.6,
337
+ "grad_norm": 3.9190495014190674,
338
+ "learning_rate": 0.00046666666666666666,
339
+ "loss": 6.4507,
340
  "step": 920
341
  },
342
  {
343
+ "epoch": 1.634782608695652,
344
+ "grad_norm": 3.9456422328948975,
345
+ "learning_rate": 0.0004647342995169082,
346
+ "loss": 6.3619,
347
  "step": 940
348
  },
349
  {
350
+ "epoch": 1.6695652173913045,
351
+ "grad_norm": 3.899134874343872,
352
+ "learning_rate": 0.0004628019323671498,
353
+ "loss": 6.2957,
354
  "step": 960
355
  },
356
  {
357
+ "epoch": 1.7043478260869565,
358
+ "grad_norm": 3.878810167312622,
359
+ "learning_rate": 0.0004608695652173913,
360
+ "loss": 6.1362,
361
  "step": 980
362
  },
363
  {
364
+ "epoch": 1.7391304347826086,
365
+ "grad_norm": 3.9270784854888916,
366
+ "learning_rate": 0.00045893719806763285,
367
+ "loss": 5.9814,
368
  "step": 1000
369
  },
370
  {
371
+ "epoch": 1.7739130434782608,
372
+ "grad_norm": 3.8247644901275635,
373
+ "learning_rate": 0.00045700483091787444,
374
+ "loss": 5.9095,
375
  "step": 1020
376
  },
377
  {
378
+ "epoch": 1.808695652173913,
379
+ "grad_norm": 3.8870134353637695,
380
+ "learning_rate": 0.000455072463768116,
381
+ "loss": 5.7793,
382
  "step": 1040
383
  },
384
  {
385
+ "epoch": 1.8434782608695652,
386
+ "grad_norm": 3.9533441066741943,
387
+ "learning_rate": 0.00045314009661835745,
388
+ "loss": 5.7754,
 
 
 
 
 
 
 
 
 
389
  "step": 1060
390
  },
391
  {
392
+ "epoch": 1.8782608695652174,
393
+ "grad_norm": 3.9928998947143555,
394
+ "learning_rate": 0.00045120772946859904,
395
+ "loss": 5.5886,
396
  "step": 1080
397
  },
398
  {
399
+ "epoch": 1.9130434782608696,
400
+ "grad_norm": 4.030064582824707,
401
+ "learning_rate": 0.0004492753623188406,
402
+ "loss": 5.5482,
403
  "step": 1100
404
  },
405
  {
406
+ "epoch": 1.9478260869565216,
407
+ "grad_norm": 3.961806297302246,
408
+ "learning_rate": 0.0004473429951690821,
409
+ "loss": 5.4807,
410
  "step": 1120
411
  },
412
  {
413
+ "epoch": 1.982608695652174,
414
+ "grad_norm": 4.003119945526123,
415
+ "learning_rate": 0.0004454106280193237,
416
+ "loss": 5.3508,
417
  "step": 1140
418
  },
419
  {
420
+ "epoch": 2.0,
421
+ "eval_accuracy": 0.8190988567585743,
422
+ "eval_loss": 4.025164604187012,
423
+ "eval_runtime": 42.7144,
424
+ "eval_samples_per_second": 34.813,
425
+ "eval_steps_per_second": 34.813,
426
+ "step": 1150
427
+ },
428
+ {
429
+ "epoch": 2.017391304347826,
430
+ "grad_norm": 3.958116292953491,
431
+ "learning_rate": 0.00044347826086956523,
432
+ "loss": 5.1229,
433
  "step": 1160
434
  },
435
  {
436
+ "epoch": 2.0521739130434784,
437
+ "grad_norm": 3.864279270172119,
438
+ "learning_rate": 0.00044154589371980677,
439
+ "loss": 4.8146,
440
  "step": 1180
441
  },
442
  {
443
+ "epoch": 2.0869565217391304,
444
+ "grad_norm": 4.045077323913574,
445
+ "learning_rate": 0.0004396135265700483,
446
+ "loss": 4.8843,
447
  "step": 1200
448
  },
449
  {
450
+ "epoch": 2.121739130434783,
451
+ "grad_norm": 4.061978816986084,
452
+ "learning_rate": 0.00043768115942028983,
453
+ "loss": 4.8078,
454
  "step": 1220
455
  },
456
  {
457
+ "epoch": 2.1565217391304348,
458
+ "grad_norm": 4.040159225463867,
459
+ "learning_rate": 0.0004357487922705314,
460
+ "loss": 4.6812,
461
  "step": 1240
462
  },
463
  {
464
+ "epoch": 2.1913043478260867,
465
+ "grad_norm": 4.234623908996582,
466
+ "learning_rate": 0.00043381642512077296,
467
+ "loss": 4.6701,
468
  "step": 1260
469
  },
470
  {
471
+ "epoch": 2.226086956521739,
472
+ "grad_norm": 4.030038356781006,
473
+ "learning_rate": 0.0004318840579710145,
474
+ "loss": 4.6221,
475
  "step": 1280
476
  },
477
  {
478
+ "epoch": 2.260869565217391,
479
+ "grad_norm": 3.9954497814178467,
480
+ "learning_rate": 0.0004299516908212561,
481
+ "loss": 4.5647,
482
  "step": 1300
483
  },
484
  {
485
+ "epoch": 2.2956521739130435,
486
+ "grad_norm": 4.188636779785156,
487
+ "learning_rate": 0.0004280193236714976,
488
+ "loss": 4.4502,
489
  "step": 1320
490
  },
491
  {
492
+ "epoch": 2.3304347826086955,
493
+ "grad_norm": 4.185456275939941,
494
+ "learning_rate": 0.00042608695652173915,
495
+ "loss": 4.359,
496
  "step": 1340
497
  },
498
  {
499
+ "epoch": 2.365217391304348,
500
+ "grad_norm": 4.123263359069824,
501
+ "learning_rate": 0.0004241545893719807,
502
+ "loss": 4.2863,
503
  "step": 1360
504
  },
505
  {
506
+ "epoch": 2.4,
507
+ "grad_norm": 4.194387435913086,
508
+ "learning_rate": 0.0004222222222222222,
509
+ "loss": 4.3354,
510
  "step": 1380
511
  },
512
  {
513
+ "epoch": 2.4347826086956523,
514
+ "grad_norm": 4.065763473510742,
515
+ "learning_rate": 0.00042028985507246375,
516
+ "loss": 4.2176,
517
  "step": 1400
518
  },
519
  {
520
+ "epoch": 2.4695652173913043,
521
+ "grad_norm": 4.120363712310791,
522
+ "learning_rate": 0.00041835748792270534,
523
+ "loss": 4.0597,
524
  "step": 1420
525
  },
526
  {
527
+ "epoch": 2.5043478260869563,
528
+ "grad_norm": 4.3197174072265625,
529
+ "learning_rate": 0.00041642512077294687,
530
+ "loss": 4.028,
531
  "step": 1440
532
  },
533
  {
534
+ "epoch": 2.5391304347826087,
535
+ "grad_norm": 4.2683610916137695,
536
+ "learning_rate": 0.0004144927536231884,
537
+ "loss": 3.9833,
538
  "step": 1460
539
  },
540
  {
541
+ "epoch": 2.573913043478261,
542
+ "grad_norm": 4.15448522567749,
543
+ "learning_rate": 0.00041256038647343,
544
+ "loss": 4.0065,
545
  "step": 1480
546
  },
547
  {
548
+ "epoch": 2.608695652173913,
549
+ "grad_norm": 4.348177433013916,
550
+ "learning_rate": 0.0004106280193236715,
551
+ "loss": 3.8134,
552
  "step": 1500
553
  },
554
  {
555
+ "epoch": 2.643478260869565,
556
+ "grad_norm": 4.100021839141846,
557
+ "learning_rate": 0.00040869565217391306,
558
+ "loss": 3.8548,
559
  "step": 1520
560
  },
561
  {
562
+ "epoch": 2.6782608695652175,
563
+ "grad_norm": 4.344174385070801,
564
+ "learning_rate": 0.0004067632850241546,
565
+ "loss": 3.7814,
566
  "step": 1540
567
  },
568
  {
569
+ "epoch": 2.7130434782608694,
570
+ "grad_norm": 4.240079402923584,
571
+ "learning_rate": 0.00040483091787439613,
572
+ "loss": 3.7578,
573
  "step": 1560
574
  },
575
  {
576
+ "epoch": 2.747826086956522,
577
+ "grad_norm": 4.468689918518066,
578
+ "learning_rate": 0.0004028985507246377,
579
+ "loss": 3.7331,
 
 
 
 
 
 
 
 
 
580
  "step": 1580
581
  },
582
  {
583
+ "epoch": 2.782608695652174,
584
+ "grad_norm": 4.28464937210083,
585
+ "learning_rate": 0.00040096618357487925,
586
+ "loss": 3.6396,
587
  "step": 1600
588
  },
589
  {
590
+ "epoch": 2.8173913043478263,
591
+ "grad_norm": 4.166805744171143,
592
+ "learning_rate": 0.0003990338164251208,
593
+ "loss": 3.5799,
594
  "step": 1620
595
  },
596
  {
597
+ "epoch": 2.8521739130434782,
598
+ "grad_norm": 4.237683296203613,
599
+ "learning_rate": 0.0003971014492753624,
600
+ "loss": 3.4734,
601
  "step": 1640
602
  },
603
  {
604
+ "epoch": 2.8869565217391306,
605
+ "grad_norm": 4.153097152709961,
606
+ "learning_rate": 0.00039516908212560385,
607
+ "loss": 3.5183,
608
  "step": 1660
609
  },
610
  {
611
+ "epoch": 2.9217391304347826,
612
+ "grad_norm": 4.2313947677612305,
613
+ "learning_rate": 0.0003932367149758454,
614
+ "loss": 3.3963,
615
  "step": 1680
616
  },
617
  {
618
+ "epoch": 2.9565217391304346,
619
+ "grad_norm": 3.992475748062134,
620
+ "learning_rate": 0.000391304347826087,
621
+ "loss": 3.3081,
622
  "step": 1700
623
  },
624
  {
625
+ "epoch": 2.991304347826087,
626
+ "grad_norm": 4.4731059074401855,
627
+ "learning_rate": 0.0003893719806763285,
628
+ "loss": 3.3124,
629
  "step": 1720
630
  },
631
  {
632
+ "epoch": 3.0,
633
+ "eval_accuracy": 0.9260255548083389,
634
+ "eval_loss": 2.1082653999328613,
635
+ "eval_runtime": 22.1676,
636
+ "eval_samples_per_second": 67.08,
637
+ "eval_steps_per_second": 67.08,
638
+ "step": 1725
639
+ },
640
+ {
641
+ "epoch": 3.026086956521739,
642
+ "grad_norm": 4.272000312805176,
643
+ "learning_rate": 0.00038743961352657004,
644
+ "loss": 3.1247,
645
  "step": 1740
646
  },
647
  {
648
+ "epoch": 3.0608695652173914,
649
+ "grad_norm": 4.102330207824707,
650
+ "learning_rate": 0.00038550724637681163,
651
+ "loss": 3.1064,
652
  "step": 1760
653
  },
654
  {
655
+ "epoch": 3.0956521739130434,
656
+ "grad_norm": 4.381846904754639,
657
+ "learning_rate": 0.00038357487922705317,
658
+ "loss": 2.9371,
659
  "step": 1780
660
  },
661
  {
662
+ "epoch": 3.130434782608696,
663
+ "grad_norm": 4.1588921546936035,
664
+ "learning_rate": 0.00038164251207729465,
665
+ "loss": 2.9355,
666
  "step": 1800
667
  },
668
  {
669
+ "epoch": 3.1652173913043478,
670
+ "grad_norm": 4.279609203338623,
671
+ "learning_rate": 0.00037971014492753623,
672
+ "loss": 2.8545,
673
  "step": 1820
674
  },
675
  {
676
+ "epoch": 3.2,
677
+ "grad_norm": 4.240756988525391,
678
+ "learning_rate": 0.00037777777777777777,
679
+ "loss": 2.8096,
680
  "step": 1840
681
  },
682
  {
683
+ "epoch": 3.234782608695652,
684
+ "grad_norm": 4.11091947555542,
685
+ "learning_rate": 0.00037584541062801936,
686
+ "loss": 2.8138,
687
  "step": 1860
688
  },
689
  {
690
+ "epoch": 3.269565217391304,
691
+ "grad_norm": 4.078794479370117,
692
+ "learning_rate": 0.0003739130434782609,
693
+ "loss": 2.7417,
694
  "step": 1880
695
  },
696
  {
697
+ "epoch": 3.3043478260869565,
698
+ "grad_norm": 4.368116855621338,
699
+ "learning_rate": 0.0003719806763285024,
700
+ "loss": 2.7937,
701
  "step": 1900
702
  },
703
  {
704
+ "epoch": 3.3391304347826085,
705
+ "grad_norm": 4.044319152832031,
706
+ "learning_rate": 0.000370048309178744,
707
+ "loss": 2.7361,
708
  "step": 1920
709
  },
710
  {
711
+ "epoch": 3.373913043478261,
712
+ "grad_norm": 4.314040184020996,
713
+ "learning_rate": 0.0003681159420289855,
714
+ "loss": 2.7054,
715
  "step": 1940
716
  },
717
  {
718
+ "epoch": 3.408695652173913,
719
+ "grad_norm": 4.185855388641357,
720
+ "learning_rate": 0.000366183574879227,
721
+ "loss": 2.6682,
722
  "step": 1960
723
  },
724
  {
725
+ "epoch": 3.4434782608695653,
726
+ "grad_norm": 4.433622360229492,
727
+ "learning_rate": 0.0003642512077294686,
728
+ "loss": 2.6644,
729
  "step": 1980
730
  },
731
  {
732
+ "epoch": 3.4782608695652173,
733
+ "grad_norm": 4.048947811126709,
734
+ "learning_rate": 0.00036231884057971015,
735
+ "loss": 2.618,
736
  "step": 2000
737
  },
738
  {
739
+ "epoch": 3.5130434782608697,
740
+ "grad_norm": 4.145406246185303,
741
+ "learning_rate": 0.0003603864734299517,
742
+ "loss": 2.5982,
743
  "step": 2020
744
  },
745
  {
746
+ "epoch": 3.5478260869565217,
747
+ "grad_norm": 4.2812910079956055,
748
+ "learning_rate": 0.00035845410628019327,
749
+ "loss": 2.6138,
750
  "step": 2040
751
  },
752
  {
753
+ "epoch": 3.5826086956521737,
754
+ "grad_norm": 4.400162220001221,
755
+ "learning_rate": 0.0003565217391304348,
756
+ "loss": 2.5039,
757
  "step": 2060
758
  },
759
  {
760
+ "epoch": 3.617391304347826,
761
+ "grad_norm": 4.217800617218018,
762
+ "learning_rate": 0.0003545893719806763,
763
+ "loss": 2.5249,
764
  "step": 2080
765
  },
766
  {
767
+ "epoch": 3.6521739130434785,
768
+ "grad_norm": 4.076215744018555,
769
+ "learning_rate": 0.0003526570048309179,
770
+ "loss": 2.4547,
 
 
 
 
 
 
 
 
 
771
  "step": 2100
772
  },
773
  {
774
+ "epoch": 3.6869565217391305,
775
+ "grad_norm": 4.139514446258545,
776
+ "learning_rate": 0.0003507246376811594,
777
+ "loss": 2.4315,
778
  "step": 2120
779
  },
780
  {
781
+ "epoch": 3.7217391304347824,
782
+ "grad_norm": 4.118022918701172,
783
+ "learning_rate": 0.00034879227053140094,
784
+ "loss": 2.3836,
785
  "step": 2140
786
  },
787
  {
788
+ "epoch": 3.756521739130435,
789
+ "grad_norm": 4.137601852416992,
790
+ "learning_rate": 0.00034685990338164253,
791
+ "loss": 2.3284,
792
  "step": 2160
793
  },
794
  {
795
+ "epoch": 3.791304347826087,
796
+ "grad_norm": 4.023979663848877,
797
+ "learning_rate": 0.00034492753623188406,
798
+ "loss": 2.3095,
799
  "step": 2180
800
  },
801
  {
802
+ "epoch": 3.8260869565217392,
803
+ "grad_norm": 4.042725086212158,
804
+ "learning_rate": 0.00034299516908212565,
805
+ "loss": 2.305,
806
  "step": 2200
807
  },
808
  {
809
+ "epoch": 3.860869565217391,
810
+ "grad_norm": 4.265875339508057,
811
+ "learning_rate": 0.0003410628019323672,
812
+ "loss": 2.3237,
813
  "step": 2220
814
  },
815
  {
816
+ "epoch": 3.8956521739130436,
817
+ "grad_norm": 4.205041408538818,
818
+ "learning_rate": 0.00033913043478260867,
819
+ "loss": 2.335,
820
  "step": 2240
821
  },
822
  {
823
+ "epoch": 3.9304347826086956,
824
+ "grad_norm": 4.1344709396362305,
825
+ "learning_rate": 0.00033719806763285025,
826
+ "loss": 2.2341,
827
  "step": 2260
828
  },
829
  {
830
+ "epoch": 3.965217391304348,
831
+ "grad_norm": 4.247790813446045,
832
+ "learning_rate": 0.0003352657004830918,
833
+ "loss": 2.251,
834
  "step": 2280
835
  },
836
  {
837
+ "epoch": 4.0,
838
+ "grad_norm": 4.859626770019531,
839
+ "learning_rate": 0.0003333333333333333,
840
+ "loss": 2.3212,
841
+ "step": 2300
842
+ },
843
+ {
844
+ "epoch": 4.0,
845
+ "eval_accuracy": 0.9435104236718225,
846
+ "eval_loss": 1.2223739624023438,
847
+ "eval_runtime": 14.8513,
848
+ "eval_samples_per_second": 100.126,
849
+ "eval_steps_per_second": 100.126,
850
  "step": 2300
851
  },
852
  {
853
+ "epoch": 4.034782608695652,
854
+ "grad_norm": 4.098020553588867,
855
+ "learning_rate": 0.0003314009661835749,
856
+ "loss": 1.9133,
857
  "step": 2320
858
  },
859
  {
860
+ "epoch": 4.069565217391304,
861
+ "grad_norm": 4.198029041290283,
862
+ "learning_rate": 0.00032946859903381644,
863
+ "loss": 1.9814,
864
  "step": 2340
865
  },
866
  {
867
+ "epoch": 4.104347826086957,
868
+ "grad_norm": 3.960844039916992,
869
+ "learning_rate": 0.000327536231884058,
870
+ "loss": 1.9505,
871
  "step": 2360
872
  },
873
  {
874
+ "epoch": 4.139130434782609,
875
+ "grad_norm": 4.0190300941467285,
876
+ "learning_rate": 0.0003256038647342995,
877
+ "loss": 1.8815,
878
  "step": 2380
879
  },
880
  {
881
+ "epoch": 4.173913043478261,
882
+ "grad_norm": 4.040708541870117,
883
+ "learning_rate": 0.00032367149758454105,
884
+ "loss": 1.8365,
885
  "step": 2400
886
  },
887
  {
888
+ "epoch": 4.208695652173913,
889
+ "grad_norm": 4.077364444732666,
890
+ "learning_rate": 0.0003217391304347826,
891
+ "loss": 1.84,
892
  "step": 2420
893
  },
894
  {
895
+ "epoch": 4.243478260869566,
896
+ "grad_norm": 4.267309188842773,
897
+ "learning_rate": 0.0003199033816425121,
898
+ "loss": 1.8864,
899
  "step": 2440
900
  },
901
  {
902
+ "epoch": 4.278260869565218,
903
+ "grad_norm": 3.978663921356201,
904
+ "learning_rate": 0.00031797101449275363,
905
+ "loss": 1.9015,
906
  "step": 2460
907
  },
908
  {
909
+ "epoch": 4.3130434782608695,
910
+ "grad_norm": 4.089256763458252,
911
+ "learning_rate": 0.0003160386473429952,
912
+ "loss": 1.8388,
913
  "step": 2480
914
  },
915
  {
916
+ "epoch": 4.3478260869565215,
917
+ "grad_norm": 3.9317057132720947,
918
+ "learning_rate": 0.0003141062801932367,
919
+ "loss": 1.7845,
920
  "step": 2500
921
  },
922
  {
923
+ "epoch": 4.3826086956521735,
924
+ "grad_norm": 3.9738080501556396,
925
+ "learning_rate": 0.00031217391304347823,
926
+ "loss": 1.7725,
927
  "step": 2520
928
  },
929
  {
930
+ "epoch": 4.417391304347826,
931
+ "grad_norm": 4.232215881347656,
932
+ "learning_rate": 0.0003102415458937198,
933
+ "loss": 1.852,
934
  "step": 2540
935
  },
936
  {
937
+ "epoch": 4.452173913043478,
938
+ "grad_norm": 4.050131797790527,
939
+ "learning_rate": 0.00030830917874396136,
940
+ "loss": 1.8234,
941
  "step": 2560
942
  },
943
  {
944
+ "epoch": 4.48695652173913,
945
+ "grad_norm": 4.217935085296631,
946
+ "learning_rate": 0.0003063768115942029,
947
+ "loss": 1.8148,
948
  "step": 2580
949
  },
950
  {
951
+ "epoch": 4.521739130434782,
952
+ "grad_norm": 3.9807074069976807,
953
+ "learning_rate": 0.0003044444444444445,
954
+ "loss": 1.7134,
955
  "step": 2600
956
  },
957
  {
958
+ "epoch": 4.556521739130435,
959
+ "grad_norm": 4.05940580368042,
960
+ "learning_rate": 0.000302512077294686,
961
+ "loss": 1.6752,
 
 
 
 
 
 
 
 
 
962
  "step": 2620
963
  },
964
  {
965
+ "epoch": 4.591304347826087,
966
+ "grad_norm": 4.454566955566406,
967
+ "learning_rate": 0.00030057971014492755,
968
+ "loss": 1.8413,
969
  "step": 2640
970
  },
971
  {
972
+ "epoch": 4.626086956521739,
973
+ "grad_norm": 4.144088268280029,
974
+ "learning_rate": 0.0002986473429951691,
975
+ "loss": 1.7948,
976
  "step": 2660
977
  },
978
  {
979
+ "epoch": 4.660869565217391,
980
+ "grad_norm": 3.940176010131836,
981
+ "learning_rate": 0.0002967149758454106,
982
+ "loss": 1.7468,
983
  "step": 2680
984
  },
985
  {
986
+ "epoch": 4.695652173913043,
987
+ "grad_norm": 4.198675632476807,
988
+ "learning_rate": 0.0002948792270531401,
989
+ "loss": 1.709,
990
  "step": 2700
991
  },
992
  {
993
+ "epoch": 4.730434782608696,
994
+ "grad_norm": 3.976001501083374,
995
+ "learning_rate": 0.00029294685990338167,
996
+ "loss": 1.6506,
997
  "step": 2720
998
  },
999
  {
1000
+ "epoch": 4.765217391304348,
1001
+ "grad_norm": 4.033059120178223,
1002
+ "learning_rate": 0.0002910144927536232,
1003
+ "loss": 1.7042,
1004
  "step": 2740
1005
  },
1006
  {
1007
+ "epoch": 4.8,
1008
+ "grad_norm": 4.062041759490967,
1009
+ "learning_rate": 0.0002890821256038648,
1010
+ "loss": 1.6795,
1011
  "step": 2760
1012
  },
1013
  {
1014
+ "epoch": 4.834782608695652,
1015
+ "grad_norm": 3.988589286804199,
1016
+ "learning_rate": 0.00028714975845410627,
1017
+ "loss": 1.7029,
1018
  "step": 2780
1019
  },
1020
  {
1021
+ "epoch": 4.869565217391305,
1022
+ "grad_norm": 4.16325044631958,
1023
+ "learning_rate": 0.0002852173913043478,
1024
+ "loss": 1.6641,
1025
  "step": 2800
1026
  },
1027
  {
1028
+ "epoch": 4.904347826086957,
1029
+ "grad_norm": 4.323537349700928,
1030
+ "learning_rate": 0.0002832850241545894,
1031
+ "loss": 1.6953,
1032
  "step": 2820
1033
  },
1034
  {
1035
+ "epoch": 4.939130434782609,
1036
+ "grad_norm": 3.8293144702911377,
1037
+ "learning_rate": 0.0002813526570048309,
1038
+ "loss": 1.5863,
1039
  "step": 2840
1040
  },
1041
  {
1042
+ "epoch": 4.973913043478261,
1043
+ "grad_norm": 3.8955535888671875,
1044
+ "learning_rate": 0.00027942028985507246,
1045
+ "loss": 1.6276,
1046
  "step": 2860
1047
  },
1048
  {
1049
+ "epoch": 5.0,
1050
+ "eval_accuracy": 0.9677202420981843,
1051
+ "eval_loss": 0.8229038715362549,
1052
+ "eval_runtime": 88.6744,
1053
+ "eval_samples_per_second": 16.769,
1054
+ "eval_steps_per_second": 16.769,
1055
+ "step": 2875
1056
+ },
1057
+ {
1058
+ "epoch": 5.008695652173913,
1059
+ "grad_norm": 3.8480091094970703,
1060
+ "learning_rate": 0.00027748792270531405,
1061
+ "loss": 1.5701,
1062
  "step": 2880
1063
  },
1064
  {
1065
+ "epoch": 5.043478260869565,
1066
+ "grad_norm": 3.679872512817383,
1067
+ "learning_rate": 0.0002755555555555556,
1068
+ "loss": 1.3786,
1069
  "step": 2900
1070
  },
1071
  {
1072
+ "epoch": 5.078260869565217,
1073
+ "grad_norm": 4.13381290435791,
1074
+ "learning_rate": 0.00027362318840579706,
1075
+ "loss": 1.3563,
1076
  "step": 2920
1077
  },
1078
  {
1079
+ "epoch": 5.113043478260869,
1080
+ "grad_norm": 3.7467329502105713,
1081
+ "learning_rate": 0.00027169082125603865,
1082
+ "loss": 1.3588,
1083
  "step": 2940
1084
  },
1085
  {
1086
+ "epoch": 5.147826086956521,
1087
+ "grad_norm": 3.5837419033050537,
1088
+ "learning_rate": 0.0002698550724637681,
1089
+ "loss": 1.3782,
1090
  "step": 2960
1091
  },
1092
  {
1093
+ "epoch": 5.182608695652174,
1094
+ "grad_norm": 4.077097415924072,
1095
+ "learning_rate": 0.00026792270531400964,
1096
+ "loss": 1.3969,
1097
  "step": 2980
1098
  },
1099
  {
1100
+ "epoch": 5.217391304347826,
1101
+ "grad_norm": 3.5995211601257324,
1102
+ "learning_rate": 0.00026599033816425123,
1103
+ "loss": 1.3346,
1104
  "step": 3000
1105
  },
1106
  {
1107
+ "epoch": 5.252173913043478,
1108
+ "grad_norm": 3.714010000228882,
1109
+ "learning_rate": 0.00026405797101449277,
1110
+ "loss": 1.3772,
1111
  "step": 3020
1112
  },
1113
  {
1114
+ "epoch": 5.28695652173913,
1115
+ "grad_norm": 3.807094097137451,
1116
+ "learning_rate": 0.00026231884057971016,
1117
+ "loss": 1.3452,
1118
  "step": 3040
1119
  },
1120
  {
1121
+ "epoch": 5.321739130434783,
1122
+ "grad_norm": 4.012477397918701,
1123
+ "learning_rate": 0.0002603864734299517,
1124
+ "loss": 1.3161,
1125
  "step": 3060
1126
  },
1127
  {
1128
+ "epoch": 5.356521739130435,
1129
+ "grad_norm": 3.850520372390747,
1130
+ "learning_rate": 0.0002584541062801932,
1131
+ "loss": 1.3146,
1132
  "step": 3080
1133
  },
1134
  {
1135
+ "epoch": 5.391304347826087,
1136
+ "grad_norm": NaN,
1137
+ "learning_rate": 0.00025661835748792274,
1138
+ "loss": 1.3057,
1139
  "step": 3100
1140
  },
1141
  {
1142
+ "epoch": 5.426086956521739,
1143
+ "grad_norm": 3.697744607925415,
1144
+ "learning_rate": 0.0002546859903381643,
1145
+ "loss": 1.2619,
1146
  "step": 3120
1147
  },
1148
  {
1149
+ "epoch": 5.460869565217392,
1150
+ "grad_norm": 4.125018119812012,
1151
+ "learning_rate": 0.00025275362318840576,
1152
+ "loss": 1.3436,
 
 
 
 
 
 
 
 
 
1153
  "step": 3140
1154
  },
1155
  {
1156
+ "epoch": 5.495652173913044,
1157
+ "grad_norm": 4.1491899490356445,
1158
+ "learning_rate": 0.00025082125603864735,
1159
+ "loss": 1.3289,
1160
  "step": 3160
1161
  },
1162
  {
1163
+ "epoch": 5.530434782608696,
1164
+ "grad_norm": 3.9294846057891846,
1165
+ "learning_rate": 0.0002488888888888889,
1166
+ "loss": 1.218,
1167
  "step": 3180
1168
  },
1169
  {
1170
+ "epoch": 5.565217391304348,
1171
+ "grad_norm": 3.9030706882476807,
1172
+ "learning_rate": 0.00024695652173913047,
1173
+ "loss": 1.3219,
1174
  "step": 3200
1175
  },
1176
  {
1177
+ "epoch": 5.6,
1178
+ "grad_norm": 4.124849319458008,
1179
+ "learning_rate": 0.000245024154589372,
1180
+ "loss": 1.2694,
1181
  "step": 3220
1182
  },
1183
  {
1184
+ "epoch": 5.6347826086956525,
1185
+ "grad_norm": 4.1668500900268555,
1186
+ "learning_rate": 0.0002432850241545894,
1187
+ "loss": 1.2379,
1188
  "step": 3240
1189
  },
1190
  {
1191
+ "epoch": 5.6695652173913045,
1192
+ "grad_norm": 4.098198890686035,
1193
+ "learning_rate": 0.00024135265700483093,
1194
+ "loss": 1.2892,
1195
  "step": 3260
1196
  },
1197
  {
1198
+ "epoch": 5.7043478260869565,
1199
+ "grad_norm": 3.690241813659668,
1200
+ "learning_rate": 0.00023942028985507246,
1201
+ "loss": 1.2742,
1202
  "step": 3280
1203
  },
1204
  {
1205
+ "epoch": 5.739130434782608,
1206
+ "grad_norm": 3.978963613510132,
1207
+ "learning_rate": 0.00023748792270531402,
1208
+ "loss": 1.1755,
1209
  "step": 3300
1210
  },
1211
  {
1212
+ "epoch": 5.773913043478261,
1213
+ "grad_norm": 3.7397215366363525,
1214
+ "learning_rate": 0.00023574879227053139,
1215
+ "loss": 1.2256,
1216
  "step": 3320
1217
  },
1218
  {
1219
+ "epoch": 5.808695652173913,
1220
+ "grad_norm": 3.9201064109802246,
1221
+ "learning_rate": 0.00023391304347826088,
1222
+ "loss": 1.238,
1223
  "step": 3340
1224
  },
1225
  {
1226
+ "epoch": 5.843478260869565,
1227
+ "grad_norm": 3.725389242172241,
1228
+ "learning_rate": 0.0002319806763285024,
1229
+ "loss": 1.1706,
1230
  "step": 3360
1231
  },
1232
  {
1233
+ "epoch": 5.878260869565217,
1234
+ "grad_norm": 3.5844123363494873,
1235
+ "learning_rate": 0.00023004830917874397,
1236
+ "loss": 1.1644,
1237
  "step": 3380
1238
  },
1239
  {
1240
+ "epoch": 5.913043478260869,
1241
+ "grad_norm": 3.79936146736145,
1242
+ "learning_rate": 0.00022821256038647343,
1243
+ "loss": 1.2256,
1244
  "step": 3400
1245
  },
1246
  {
1247
+ "epoch": 5.947826086956522,
1248
+ "grad_norm": 3.5947725772857666,
1249
+ "learning_rate": 0.00022628019323671497,
1250
+ "loss": 1.2488,
1251
  "step": 3420
1252
  },
1253
  {
1254
+ "epoch": 5.982608695652174,
1255
+ "grad_norm": NaN,
1256
+ "learning_rate": 0.00022444444444444446,
1257
+ "loss": 1.1418,
1258
  "step": 3440
1259
  },
1260
  {
1261
+ "epoch": 6.0,
1262
+ "eval_accuracy": 0.9757901815736382,
1263
+ "eval_loss": 0.5840117335319519,
1264
+ "eval_runtime": 97.2696,
1265
+ "eval_samples_per_second": 15.287,
1266
+ "eval_steps_per_second": 15.287,
1267
+ "step": 3450
1268
+ },
1269
+ {
1270
+ "epoch": 6.017391304347826,
1271
+ "grad_norm": 3.5959298610687256,
1272
+ "learning_rate": 0.00022260869565217392,
1273
+ "loss": 1.1254,
1274
  "step": 3460
1275
  },
1276
  {
1277
+ "epoch": 6.052173913043478,
1278
+ "grad_norm": 3.9623775482177734,
1279
+ "learning_rate": 0.00022067632850241545,
1280
+ "loss": 1.0343,
1281
  "step": 3480
1282
  },
1283
  {
1284
+ "epoch": 6.086956521739131,
1285
+ "grad_norm": 3.735102415084839,
1286
+ "learning_rate": 0.00021874396135265702,
1287
+ "loss": 1.0348,
1288
  "step": 3500
1289
  },
1290
  {
1291
+ "epoch": 6.121739130434783,
1292
+ "grad_norm": 3.4255013465881348,
1293
+ "learning_rate": 0.00021681159420289855,
1294
+ "loss": 0.9796,
1295
  "step": 3520
1296
  },
1297
  {
1298
+ "epoch": 6.156521739130435,
1299
+ "grad_norm": 3.981841564178467,
1300
+ "learning_rate": 0.00021497584541062804,
1301
+ "loss": 0.9865,
1302
  "step": 3540
1303
  },
1304
  {
1305
+ "epoch": 6.191304347826087,
1306
+ "grad_norm": 3.9057116508483887,
1307
+ "learning_rate": 0.00021314009661835748,
1308
+ "loss": 1.0054,
1309
  "step": 3560
1310
  },
1311
  {
1312
+ "epoch": 6.226086956521739,
1313
+ "grad_norm": 3.626560688018799,
1314
+ "learning_rate": 0.00021120772946859904,
1315
+ "loss": 1.0012,
1316
  "step": 3580
1317
  },
1318
  {
1319
+ "epoch": 6.260869565217392,
1320
+ "grad_norm": 3.687683582305908,
1321
+ "learning_rate": 0.0002093719806763285,
1322
+ "loss": 1.0129,
1323
  "step": 3600
1324
  },
1325
  {
1326
+ "epoch": 6.2956521739130435,
1327
+ "grad_norm": 3.8632826805114746,
1328
+ "learning_rate": 0.00020763285024154592,
1329
+ "loss": 0.9333,
1330
  "step": 3620
1331
  },
1332
  {
1333
+ "epoch": 6.3304347826086955,
1334
+ "grad_norm": 4.089422702789307,
1335
+ "learning_rate": 0.0002058937198067633,
1336
+ "loss": 1.0259,
1337
  "step": 3640
1338
  },
1339
  {
1340
+ "epoch": 6.3652173913043475,
1341
+ "grad_norm": 4.261268615722656,
1342
+ "learning_rate": 0.00020415458937198067,
1343
+ "loss": 1.0184,
1344
  "step": 3660
1345
  },
1346
  {
1347
+ "epoch": 6.4,
1348
+ "grad_norm": 2.3901586532592773,
1349
+ "learning_rate": 0.0002026086956521739,
1350
+ "loss": 1.0293,
 
 
 
 
 
 
 
 
 
1351
  "step": 3680
1352
  },
1353
  {
1354
+ "epoch": 6.434782608695652,
1355
+ "grad_norm": 2.233633518218994,
1356
+ "learning_rate": 0.00020067632850241546,
1357
+ "loss": 1.0026,
1358
  "step": 3700
1359
  },
1360
  {
1361
+ "epoch": 6.469565217391304,
1362
+ "grad_norm": 2.049773693084717,
1363
+ "learning_rate": 0.00019893719806763285,
1364
+ "loss": 1.0426,
1365
  "step": 3720
1366
  },
1367
  {
1368
+ "epoch": 6.504347826086956,
1369
+ "grad_norm": 2.21939754486084,
1370
+ "learning_rate": 0.0001970048309178744,
1371
+ "loss": 1.0324,
1372
  "step": 3740
1373
  },
1374
  {
1375
+ "epoch": 6.539130434782608,
1376
+ "grad_norm": 2.2138895988464355,
1377
+ "learning_rate": 0.00019516908212560387,
1378
+ "loss": 1.0666,
1379
  "step": 3760
1380
  },
1381
  {
1382
+ "epoch": 6.573913043478261,
1383
+ "grad_norm": 1.9186855554580688,
1384
+ "learning_rate": 0.0001932367149758454,
1385
+ "loss": 1.0724,
1386
  "step": 3780
1387
  },
1388
  {
1389
+ "epoch": 6.608695652173913,
1390
+ "grad_norm": 1.302451729774475,
1391
+ "learning_rate": 0.00019159420289855073,
1392
+ "loss": 1.0867,
1393
  "step": 3800
1394
  },
1395
  {
1396
+ "epoch": 6.643478260869565,
1397
+ "grad_norm": 1.1770459413528442,
1398
+ "learning_rate": 0.00018975845410628022,
1399
+ "loss": 1.0659,
1400
  "step": 3820
1401
  },
1402
  {
1403
+ "epoch": 6.678260869565217,
1404
+ "grad_norm": 0.2651650309562683,
1405
+ "learning_rate": 0.0001881159420289855,
1406
+ "loss": 1.0494,
1407
  "step": 3840
1408
  },
1409
  {
1410
+ "epoch": 6.71304347826087,
1411
+ "grad_norm": 0.0,
1412
+ "learning_rate": 0.0001867632850241546,
1413
+ "loss": 1.0464,
1414
  "step": 3860
1415
  },
1416
  {
1417
+ "epoch": 6.747826086956522,
1418
+ "grad_norm": 0.0,
1419
+ "learning_rate": 0.000185024154589372,
1420
+ "loss": 1.0457,
1421
  "step": 3880
1422
  },
1423
  {
1424
+ "epoch": 6.782608695652174,
1425
+ "grad_norm": 0.0,
1426
+ "learning_rate": 0.00018328502415458937,
1427
+ "loss": 0.9815,
1428
  "step": 3900
1429
  },
1430
  {
1431
+ "epoch": 6.817391304347826,
1432
+ "grad_norm": 0.0,
1433
+ "learning_rate": 0.0001816425120772947,
1434
+ "loss": 1.0094,
1435
  "step": 3920
1436
  },
1437
  {
1438
+ "epoch": 6.852173913043478,
1439
+ "grad_norm": NaN,
1440
+ "learning_rate": 0.00018028985507246377,
1441
+ "loss": 1.0023,
1442
  "step": 3940
1443
  },
1444
  {
1445
+ "epoch": 6.886956521739131,
1446
+ "grad_norm": 0.0,
1447
+ "learning_rate": 0.00017893719806763288,
1448
+ "loss": 1.0278,
1449
  "step": 3960
1450
  },
1451
  {
1452
+ "epoch": 6.921739130434783,
1453
+ "grad_norm": 0.0,
1454
+ "learning_rate": 0.0001771014492753623,
1455
+ "loss": 1.0123,
1456
  "step": 3980
1457
  },
1458
  {
1459
+ "epoch": 6.956521739130435,
1460
+ "grad_norm": 0.0,
1461
+ "learning_rate": 0.00017565217391304346,
1462
+ "loss": 1.0774,
1463
  "step": 4000
1464
  },
1465
  {
1466
+ "epoch": 6.9913043478260875,
1467
+ "grad_norm": 0.0,
1468
+ "learning_rate": 0.00017391304347826088,
1469
+ "loss": 1.0484,
1470
  "step": 4020
1471
  },
1472
  {
1473
+ "epoch": 7.0,
1474
+ "eval_accuracy": 0.9737726967047747,
1475
+ "eval_loss": 0.5780686736106873,
1476
+ "eval_runtime": 118.8154,
1477
+ "eval_samples_per_second": 12.515,
1478
+ "eval_steps_per_second": 12.515,
1479
+ "step": 4025
1480
+ },
1481
+ {
1482
+ "epoch": 7.026086956521739,
1483
+ "grad_norm": 0.0,
1484
+ "learning_rate": 0.0001723671497584541,
1485
+ "loss": 0.9799,
1486
  "step": 4040
1487
  },
1488
  {
1489
+ "epoch": 7.060869565217391,
1490
+ "grad_norm": 0.0,
1491
+ "learning_rate": 0.00017091787439613525,
1492
+ "loss": 0.9588,
1493
  "step": 4060
1494
  },
1495
  {
1496
+ "epoch": 7.095652173913043,
1497
+ "grad_norm": NaN,
1498
+ "learning_rate": 0.00016966183574879226,
1499
+ "loss": 0.9421,
1500
  "step": 4080
1501
  },
1502
  {
1503
+ "epoch": 7.130434782608695,
1504
+ "grad_norm": 0.0,
1505
+ "learning_rate": 0.00016782608695652175,
1506
+ "loss": 0.9551,
1507
  "step": 4100
1508
  },
1509
  {
1510
+ "epoch": 7.165217391304348,
1511
+ "grad_norm": 0.0,
1512
+ "learning_rate": 0.00016618357487922704,
1513
+ "loss": 0.9622,
1514
  "step": 4120
1515
  },
1516
  {
1517
+ "epoch": 7.2,
1518
+ "grad_norm": 0.0,
1519
+ "learning_rate": 0.00016444444444444446,
1520
+ "loss": 0.9712,
1521
  "step": 4140
1522
  },
1523
  {
1524
+ "epoch": 7.234782608695652,
1525
+ "grad_norm": 0.0,
1526
+ "learning_rate": 0.00016299516908212561,
1527
+ "loss": 0.9834,
1528
  "step": 4160
1529
  },
1530
  {
1531
+ "epoch": 7.269565217391304,
1532
+ "grad_norm": NaN,
1533
+ "learning_rate": 0.00016135265700483093,
1534
+ "loss": 0.9968,
1535
  "step": 4180
1536
  },
1537
  {
1538
+ "epoch": 7.304347826086957,
1539
+ "grad_norm": 0.0,
1540
+ "learning_rate": 0.00015961352657004833,
1541
+ "loss": 0.956,
 
 
 
 
 
 
 
 
 
1542
  "step": 4200
1543
  },
1544
  {
1545
+ "epoch": 7.339130434782609,
1546
+ "grad_norm": 0.0,
1547
+ "learning_rate": 0.00015806763285024155,
1548
+ "loss": 0.8981,
1549
  "step": 4220
1550
  },
1551
  {
1552
+ "epoch": 7.373913043478261,
1553
+ "grad_norm": 0.0,
1554
+ "learning_rate": 0.00015642512077294684,
1555
+ "loss": 0.9515,
1556
  "step": 4240
1557
  },
1558
  {
1559
+ "epoch": 7.408695652173913,
1560
+ "grad_norm": 0.0,
1561
+ "learning_rate": 0.0001548792270531401,
1562
+ "loss": 0.9535,
1563
  "step": 4260
1564
  },
1565
  {
1566
+ "epoch": 7.443478260869565,
1567
+ "grad_norm": NaN,
1568
+ "learning_rate": 0.00015333333333333334,
1569
+ "loss": 0.9646,
1570
  "step": 4280
1571
  },
1572
  {
1573
+ "epoch": 7.478260869565218,
1574
+ "grad_norm": 0.0,
1575
+ "learning_rate": 0.00015140096618357487,
1576
+ "loss": 0.9821,
1577
  "step": 4300
1578
  },
1579
  {
1580
+ "epoch": 7.51304347826087,
1581
+ "grad_norm": 0.0,
1582
+ "learning_rate": 0.00015014492753623188,
1583
+ "loss": 0.9259,
1584
  "step": 4320
1585
  },
1586
  {
1587
+ "epoch": 7.547826086956522,
1588
+ "grad_norm": 0.0,
1589
+ "learning_rate": 0.00014869565217391303,
1590
+ "loss": 0.9494,
1591
  "step": 4340
1592
  },
1593
  {
1594
+ "epoch": 7.582608695652174,
1595
+ "grad_norm": 0.0,
1596
+ "learning_rate": 0.00014714975845410628,
1597
+ "loss": 0.9305,
1598
  "step": 4360
1599
  },
1600
  {
1601
+ "epoch": 7.6173913043478265,
1602
+ "grad_norm": 0.0,
1603
+ "learning_rate": 0.0001455072463768116,
1604
+ "loss": 0.8889,
1605
  "step": 4380
1606
  },
1607
  {
1608
+ "epoch": 7.6521739130434785,
1609
+ "grad_norm": 0.0,
1610
+ "learning_rate": 0.00014396135265700482,
1611
+ "loss": 0.9524,
1612
  "step": 4400
1613
  },
1614
  {
1615
+ "epoch": 7.6869565217391305,
1616
+ "grad_norm": 0.0,
1617
+ "learning_rate": 0.00014231884057971014,
1618
+ "loss": 0.9065,
1619
  "step": 4420
1620
  },
1621
  {
1622
+ "epoch": 7.721739130434782,
1623
+ "grad_norm": 0.0,
1624
+ "learning_rate": 0.00014048309178743963,
1625
+ "loss": 0.9153,
1626
  "step": 4440
1627
  },
1628
  {
1629
+ "epoch": 7.756521739130434,
1630
+ "grad_norm": NaN,
1631
+ "learning_rate": 0.0001403864734299517,
1632
+ "loss": 0.6675,
1633
  "step": 4460
1634
  },
1635
  {
1636
+ "epoch": 7.791304347826087,
1637
+ "grad_norm": NaN,
1638
+ "learning_rate": 0.0001403864734299517,
1639
+ "loss": 0.0,
1640
  "step": 4480
1641
  },
1642
  {
1643
+ "epoch": 7.826086956521739,
1644
+ "grad_norm": NaN,
1645
+ "learning_rate": 0.0001403864734299517,
1646
+ "loss": 0.0,
1647
  "step": 4500
1648
  },
1649
  {
1650
+ "epoch": 7.860869565217391,
1651
+ "grad_norm": NaN,
1652
+ "learning_rate": 0.0001403864734299517,
1653
+ "loss": 0.0,
1654
  "step": 4520
1655
  },
1656
  {
1657
+ "epoch": 7.895652173913043,
1658
+ "grad_norm": NaN,
1659
+ "learning_rate": 0.0001403864734299517,
1660
+ "loss": 0.0,
1661
  "step": 4540
1662
  },
1663
  {
1664
+ "epoch": 7.930434782608696,
1665
+ "grad_norm": NaN,
1666
+ "learning_rate": 0.0001403864734299517,
1667
+ "loss": 0.0,
1668
  "step": 4560
1669
  },
1670
  {
1671
+ "epoch": 7.965217391304348,
1672
+ "grad_norm": NaN,
1673
+ "learning_rate": 0.0001403864734299517,
1674
+ "loss": 0.0,
1675
  "step": 4580
1676
  },
1677
  {
1678
+ "epoch": 8.0,
1679
+ "grad_norm": NaN,
1680
+ "learning_rate": 0.0001403864734299517,
1681
+ "loss": 0.0,
1682
+ "step": 4600
1683
+ },
1684
+ {
1685
+ "epoch": 8.0,
1686
+ "eval_accuracy": 0.0006724949562878278,
1687
+ "eval_loss": NaN,
1688
+ "eval_runtime": 129.6238,
1689
+ "eval_samples_per_second": 11.472,
1690
+ "eval_steps_per_second": 11.472,
1691
  "step": 4600
1692
  },
1693
  {
1694
+ "epoch": 8.034782608695652,
1695
+ "grad_norm": NaN,
1696
+ "learning_rate": 0.0001403864734299517,
1697
+ "loss": 0.0,
1698
  "step": 4620
1699
  },
1700
  {
1701
+ "epoch": 8.069565217391304,
1702
+ "grad_norm": NaN,
1703
+ "learning_rate": 0.0001403864734299517,
1704
+ "loss": 0.0,
1705
  "step": 4640
1706
  },
1707
  {
1708
+ "epoch": 8.104347826086956,
1709
+ "grad_norm": NaN,
1710
+ "learning_rate": 0.0001403864734299517,
1711
+ "loss": 0.0,
1712
  "step": 4660
1713
  },
1714
  {
1715
+ "epoch": 8.139130434782608,
1716
+ "grad_norm": NaN,
1717
+ "learning_rate": 0.0001403864734299517,
1718
+ "loss": 0.0,
1719
  "step": 4680
1720
  },
1721
  {
1722
+ "epoch": 8.173913043478262,
1723
+ "grad_norm": NaN,
1724
+ "learning_rate": 0.0001403864734299517,
1725
+ "loss": 0.0,
1726
  "step": 4700
1727
  },
1728
  {
1729
+ "epoch": 8.208695652173914,
1730
+ "grad_norm": NaN,
1731
+ "learning_rate": 0.0001403864734299517,
1732
+ "loss": 0.0,
 
 
 
 
 
 
 
 
 
1733
  "step": 4720
1734
  },
1735
  {
1736
+ "epoch": 8.243478260869566,
1737
+ "grad_norm": NaN,
1738
+ "learning_rate": 0.0001403864734299517,
1739
+ "loss": 0.0,
1740
  "step": 4740
1741
  },
1742
  {
1743
+ "epoch": 8.278260869565218,
1744
+ "grad_norm": NaN,
1745
+ "learning_rate": 0.0001403864734299517,
1746
+ "loss": 0.0,
1747
  "step": 4760
1748
  },
1749
  {
1750
+ "epoch": 8.31304347826087,
1751
+ "grad_norm": NaN,
1752
+ "learning_rate": 0.0001403864734299517,
1753
+ "loss": 0.0,
1754
  "step": 4780
1755
  },
1756
  {
1757
+ "epoch": 8.347826086956522,
1758
+ "grad_norm": NaN,
1759
+ "learning_rate": 0.0001403864734299517,
1760
+ "loss": 0.0,
1761
  "step": 4800
1762
  },
1763
  {
1764
+ "epoch": 8.382608695652173,
1765
+ "grad_norm": NaN,
1766
+ "learning_rate": 0.0001403864734299517,
1767
+ "loss": 0.0,
1768
  "step": 4820
1769
  },
1770
  {
1771
+ "epoch": 8.417391304347825,
1772
+ "grad_norm": NaN,
1773
+ "learning_rate": 0.0001403864734299517,
1774
+ "loss": 0.0,
1775
  "step": 4840
1776
  },
1777
  {
1778
+ "epoch": 8.452173913043477,
1779
+ "grad_norm": NaN,
1780
+ "learning_rate": 0.0001403864734299517,
1781
+ "loss": 0.0,
1782
  "step": 4860
1783
  },
1784
  {
1785
+ "epoch": 8.486956521739131,
1786
+ "grad_norm": NaN,
1787
+ "learning_rate": 0.0001403864734299517,
1788
+ "loss": 0.0,
1789
  "step": 4880
1790
  },
1791
  {
1792
+ "epoch": 8.521739130434783,
1793
+ "grad_norm": NaN,
1794
+ "learning_rate": 0.0001403864734299517,
1795
+ "loss": 0.0,
1796
  "step": 4900
1797
  },
1798
  {
1799
+ "epoch": 8.556521739130435,
1800
+ "grad_norm": NaN,
1801
+ "learning_rate": 0.0001403864734299517,
1802
+ "loss": 0.0,
1803
  "step": 4920
1804
  },
1805
  {
1806
+ "epoch": 8.591304347826087,
1807
+ "grad_norm": NaN,
1808
+ "learning_rate": 0.0001403864734299517,
1809
+ "loss": 0.0,
1810
  "step": 4940
1811
  },
1812
  {
1813
+ "epoch": 8.626086956521739,
1814
+ "grad_norm": NaN,
1815
+ "learning_rate": 0.0001403864734299517,
1816
+ "loss": 0.0,
1817
  "step": 4960
1818
  },
1819
  {
1820
+ "epoch": 8.660869565217391,
1821
+ "grad_norm": NaN,
1822
+ "learning_rate": 0.0001403864734299517,
1823
+ "loss": 0.0,
1824
  "step": 4980
1825
  },
1826
  {
1827
+ "epoch": 8.695652173913043,
1828
+ "grad_norm": NaN,
1829
+ "learning_rate": 0.0001403864734299517,
1830
+ "loss": 0.0,
1831
  "step": 5000
1832
  },
1833
  {
1834
+ "epoch": 8.730434782608695,
1835
+ "grad_norm": NaN,
1836
+ "learning_rate": 0.0001403864734299517,
1837
+ "loss": 0.0,
1838
  "step": 5020
1839
  },
1840
  {
1841
+ "epoch": 8.765217391304347,
1842
+ "grad_norm": NaN,
1843
+ "learning_rate": 0.0001403864734299517,
1844
+ "loss": 0.0,
1845
  "step": 5040
1846
  },
1847
  {
1848
+ "epoch": 8.8,
1849
+ "grad_norm": NaN,
1850
+ "learning_rate": 0.0001403864734299517,
1851
+ "loss": 0.0,
1852
  "step": 5060
1853
  },
1854
  {
1855
+ "epoch": 8.834782608695653,
1856
+ "grad_norm": NaN,
1857
+ "learning_rate": 0.0001403864734299517,
1858
+ "loss": 0.0,
1859
  "step": 5080
1860
  },
1861
  {
1862
+ "epoch": 8.869565217391305,
1863
+ "grad_norm": NaN,
1864
+ "learning_rate": 0.0001403864734299517,
1865
+ "loss": 0.0,
1866
  "step": 5100
1867
  },
1868
  {
1869
+ "epoch": 8.904347826086957,
1870
+ "grad_norm": NaN,
1871
+ "learning_rate": 0.0001403864734299517,
1872
+ "loss": 0.0,
1873
  "step": 5120
1874
  },
1875
  {
1876
+ "epoch": 8.939130434782609,
1877
+ "grad_norm": NaN,
1878
+ "learning_rate": 0.0001403864734299517,
1879
+ "loss": 0.0,
1880
  "step": 5140
1881
  },
1882
  {
1883
+ "epoch": 8.97391304347826,
1884
+ "grad_norm": NaN,
1885
+ "learning_rate": 0.0001403864734299517,
1886
+ "loss": 0.0,
1887
  "step": 5160
1888
  },
1889
  {
1890
+ "epoch": 9.0,
1891
+ "eval_accuracy": 0.0006724949562878278,
1892
+ "eval_loss": NaN,
1893
+ "eval_runtime": 117.1288,
1894
+ "eval_samples_per_second": 12.695,
1895
+ "eval_steps_per_second": 12.695,
1896
+ "step": 5175
1897
+ },
1898
+ {
1899
+ "epoch": 9.008695652173913,
1900
+ "grad_norm": NaN,
1901
+ "learning_rate": 0.0001403864734299517,
1902
+ "loss": 0.0,
1903
  "step": 5180
1904
  },
1905
  {
1906
+ "epoch": 9.043478260869565,
1907
+ "grad_norm": NaN,
1908
+ "learning_rate": 0.0001403864734299517,
1909
+ "loss": 0.0,
1910
  "step": 5200
1911
  },
1912
  {
1913
+ "epoch": 9.078260869565218,
1914
+ "grad_norm": NaN,
1915
+ "learning_rate": 0.0001403864734299517,
1916
+ "loss": 0.0,
1917
  "step": 5220
1918
  },
1919
+ {
1920
+ "epoch": 9.11304347826087,
1921
+ "grad_norm": NaN,
1922
+ "learning_rate": 0.0001403864734299517,
1923
+ "loss": 0.0,
1924
+ "step": 5240
1925
+ },
1926
+ {
1927
+ "epoch": 9.147826086956522,
1928
+ "grad_norm": NaN,
1929
+ "learning_rate": 0.0001403864734299517,
1930
+ "loss": 0.0,
1931
+ "step": 5260
1932
+ },
1933
+ {
1934
+ "epoch": 9.182608695652174,
1935
+ "grad_norm": NaN,
1936
+ "learning_rate": 0.0001403864734299517,
1937
+ "loss": 0.0,
1938
+ "step": 5280
1939
+ },
1940
+ {
1941
+ "epoch": 9.217391304347826,
1942
+ "grad_norm": NaN,
1943
+ "learning_rate": 0.0001403864734299517,
1944
+ "loss": 0.0,
1945
+ "step": 5300
1946
+ },
1947
+ {
1948
+ "epoch": 9.252173913043478,
1949
+ "grad_norm": NaN,
1950
+ "learning_rate": 0.0001403864734299517,
1951
+ "loss": 0.0,
1952
+ "step": 5320
1953
+ },
1954
+ {
1955
+ "epoch": 9.28695652173913,
1956
+ "grad_norm": NaN,
1957
+ "learning_rate": 0.0001403864734299517,
1958
+ "loss": 0.0,
1959
+ "step": 5340
1960
+ },
1961
+ {
1962
+ "epoch": 9.321739130434782,
1963
+ "grad_norm": NaN,
1964
+ "learning_rate": 0.0001403864734299517,
1965
+ "loss": 0.0,
1966
+ "step": 5360
1967
+ },
1968
+ {
1969
+ "epoch": 9.356521739130434,
1970
+ "grad_norm": NaN,
1971
+ "learning_rate": 0.0001403864734299517,
1972
+ "loss": 0.0,
1973
+ "step": 5380
1974
+ },
1975
+ {
1976
+ "epoch": 9.391304347826088,
1977
+ "grad_norm": NaN,
1978
+ "learning_rate": 0.0001403864734299517,
1979
+ "loss": 0.0,
1980
+ "step": 5400
1981
+ },
1982
+ {
1983
+ "epoch": 9.42608695652174,
1984
+ "grad_norm": NaN,
1985
+ "learning_rate": 0.0001403864734299517,
1986
+ "loss": 0.0,
1987
+ "step": 5420
1988
+ },
1989
+ {
1990
+ "epoch": 9.460869565217392,
1991
+ "grad_norm": NaN,
1992
+ "learning_rate": 0.0001403864734299517,
1993
+ "loss": 0.0,
1994
+ "step": 5440
1995
+ },
1996
+ {
1997
+ "epoch": 9.495652173913044,
1998
+ "grad_norm": NaN,
1999
+ "learning_rate": 0.0001403864734299517,
2000
+ "loss": 0.0,
2001
+ "step": 5460
2002
+ },
2003
+ {
2004
+ "epoch": 9.530434782608696,
2005
+ "grad_norm": NaN,
2006
+ "learning_rate": 0.0001403864734299517,
2007
+ "loss": 0.0,
2008
+ "step": 5480
2009
+ },
2010
+ {
2011
+ "epoch": 9.565217391304348,
2012
+ "grad_norm": NaN,
2013
+ "learning_rate": 0.0001403864734299517,
2014
+ "loss": 0.0,
2015
+ "step": 5500
2016
+ },
2017
+ {
2018
+ "epoch": 9.6,
2019
+ "grad_norm": NaN,
2020
+ "learning_rate": 0.0001403864734299517,
2021
+ "loss": 0.0,
2022
+ "step": 5520
2023
+ },
2024
+ {
2025
+ "epoch": 9.634782608695652,
2026
+ "grad_norm": NaN,
2027
+ "learning_rate": 0.0001403864734299517,
2028
+ "loss": 0.0,
2029
+ "step": 5540
2030
+ },
2031
+ {
2032
+ "epoch": 9.669565217391304,
2033
+ "grad_norm": NaN,
2034
+ "learning_rate": 0.0001403864734299517,
2035
+ "loss": 0.0,
2036
+ "step": 5560
2037
+ },
2038
+ {
2039
+ "epoch": 9.704347826086957,
2040
+ "grad_norm": NaN,
2041
+ "learning_rate": 0.0001403864734299517,
2042
+ "loss": 0.0,
2043
+ "step": 5580
2044
+ },
2045
+ {
2046
+ "epoch": 9.73913043478261,
2047
+ "grad_norm": NaN,
2048
+ "learning_rate": 0.0001403864734299517,
2049
+ "loss": 0.0,
2050
+ "step": 5600
2051
+ },
2052
+ {
2053
+ "epoch": 9.773913043478261,
2054
+ "grad_norm": NaN,
2055
+ "learning_rate": 0.0001403864734299517,
2056
+ "loss": 0.0,
2057
+ "step": 5620
2058
+ },
2059
+ {
2060
+ "epoch": 9.808695652173913,
2061
+ "grad_norm": NaN,
2062
+ "learning_rate": 0.0001403864734299517,
2063
+ "loss": 0.0,
2064
+ "step": 5640
2065
+ },
2066
+ {
2067
+ "epoch": 9.843478260869565,
2068
+ "grad_norm": NaN,
2069
+ "learning_rate": 0.0001403864734299517,
2070
+ "loss": 0.0,
2071
+ "step": 5660
2072
+ },
2073
+ {
2074
+ "epoch": 9.878260869565217,
2075
+ "grad_norm": NaN,
2076
+ "learning_rate": 0.0001403864734299517,
2077
+ "loss": 0.0,
2078
+ "step": 5680
2079
+ },
2080
+ {
2081
+ "epoch": 9.91304347826087,
2082
+ "grad_norm": NaN,
2083
+ "learning_rate": 0.0001403864734299517,
2084
+ "loss": 0.0,
2085
+ "step": 5700
2086
+ },
2087
+ {
2088
+ "epoch": 9.947826086956521,
2089
+ "grad_norm": NaN,
2090
+ "learning_rate": 0.0001403864734299517,
2091
+ "loss": 0.0,
2092
+ "step": 5720
2093
+ },
2094
+ {
2095
+ "epoch": 9.982608695652173,
2096
+ "grad_norm": NaN,
2097
+ "learning_rate": 0.0001403864734299517,
2098
+ "loss": 0.0,
2099
+ "step": 5740
2100
+ },
2101
  {
2102
  "epoch": 10.0,
2103
+ "eval_accuracy": 0.0006724949562878278,
2104
+ "eval_loss": NaN,
2105
+ "eval_runtime": 103.3199,
2106
+ "eval_samples_per_second": 14.392,
2107
+ "eval_steps_per_second": 14.392,
2108
+ "step": 5750
2109
  },
2110
  {
2111
  "epoch": 10.0,
2112
+ "step": 5750,
2113
+ "total_flos": 2.7398100529152e+18,
2114
+ "train_loss": 2.9414075751926587,
2115
+ "train_runtime": 59857.6179,
2116
+ "train_samples_per_second": 24.584,
2117
+ "train_steps_per_second": 0.096
2118
  }
2119
  ],
2120
  "logging_steps": 20,
2121
+ "max_steps": 5750,
2122
  "num_input_tokens_seen": 0,
2123
  "num_train_epochs": 10,
2124
  "save_steps": 500,
 
2134
  "attributes": {}
2135
  }
2136
  },
2137
+ "total_flos": 2.7398100529152e+18,
2138
  "train_batch_size": 256,
2139
  "trial_name": null,
2140
  "trial_params": null