redbioma committed on
Commit
ad5d69f
·
verified ·
1 Parent(s): 05aa423

🍻 cheers

Browse files
README.md CHANGED
@@ -3,6 +3,7 @@ library_name: transformers
3
  license: apache-2.0
4
  base_model: facebook/deit-base-distilled-patch16-224
5
  tags:
 
6
  - generated_from_trainer
7
  metrics:
8
  - accuracy
@@ -17,11 +18,11 @@ should probably proofread and complete it, then remove this comment. -->
17
 
18
  # deit-CEMEDE
19
 
20
- This model is a fine-tuned version of [facebook/deit-base-distilled-patch16-224](https://huggingface.co/facebook/deit-base-distilled-patch16-224) on an unknown dataset.
21
  It achieves the following results on the evaluation set:
22
- - Loss: 0.9955
23
- - Accuracy: 0.8008
24
- - F1: 0.8183
25
 
26
  ## Model description
27
 
 
3
  license: apache-2.0
4
  base_model: facebook/deit-base-distilled-patch16-224
5
  tags:
6
+ - image-classification
7
  - generated_from_trainer
8
  metrics:
9
  - accuracy
 
18
 
19
  # deit-CEMEDE
20
 
21
+ This model is a fine-tuned version of [facebook/deit-base-distilled-patch16-224](https://huggingface.co/facebook/deit-base-distilled-patch16-224) on the cemede dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 0.8585
24
+ - Accuracy: 0.7884
25
+ - F1: 0.7973
26
 
27
  ## Model description
28
 
all_results.json CHANGED
@@ -1,14 +1,14 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_accuracy": 0.8155016642891108,
4
- "eval_f1": 0.7820920610319438,
5
- "eval_loss": 0.8849401473999023,
6
- "eval_runtime": 12.4391,
7
- "eval_samples_per_second": 169.064,
8
- "eval_steps_per_second": 21.143,
9
- "total_flos": 1.611433851126866e+18,
10
- "train_loss": 0.3103342184309776,
11
- "train_runtime": 732.7839,
12
- "train_samples_per_second": 70.928,
13
- "train_steps_per_second": 8.87
14
  }
 
1
  {
2
+ "epoch": 1.6923076923076923,
3
+ "eval_accuracy": 0.7883975273418925,
4
+ "eval_f1": 0.7973086083953446,
5
+ "eval_loss": 0.8585250973701477,
6
+ "eval_runtime": 12.5738,
7
+ "eval_samples_per_second": 167.252,
8
+ "eval_steps_per_second": 20.916,
9
+ "total_flos": 1.3637892549580186e+18,
10
+ "train_loss": 0.3320726641470736,
11
+ "train_runtime": 627.5337,
12
+ "train_samples_per_second": 82.824,
13
+ "train_steps_per_second": 10.358
14
  }
eval_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 2.0,
3
- "eval_accuracy": 0.8155016642891108,
4
- "eval_f1": 0.7820920610319438,
5
- "eval_loss": 0.8849401473999023,
6
- "eval_runtime": 12.4391,
7
- "eval_samples_per_second": 169.064,
8
- "eval_steps_per_second": 21.143
9
  }
 
1
  {
2
+ "epoch": 1.6923076923076923,
3
+ "eval_accuracy": 0.7883975273418925,
4
+ "eval_f1": 0.7973086083953446,
5
+ "eval_loss": 0.8585250973701477,
6
+ "eval_runtime": 12.5738,
7
+ "eval_samples_per_second": 167.252,
8
+ "eval_steps_per_second": 20.916
9
  }
runs/Aug10_02-56-03_instance-camaras/events.out.tfevents.1754795221.instance-camaras ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b42c9ea95e051e5bc8d73a868483da7b840faf76679b42cce2f3e9cbfbe3051
3
+ size 40
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 2.0,
3
- "total_flos": 1.611433851126866e+18,
4
- "train_loss": 0.3103342184309776,
5
- "train_runtime": 732.7839,
6
- "train_samples_per_second": 70.928,
7
- "train_steps_per_second": 8.87
8
  }
 
1
  {
2
+ "epoch": 1.6923076923076923,
3
+ "total_flos": 1.3637892549580186e+18,
4
+ "train_loss": 0.3320726641470736,
5
+ "train_runtime": 627.5337,
6
+ "train_samples_per_second": 82.824,
7
+ "train_steps_per_second": 10.358
8
  }
trainer_state.json CHANGED
@@ -1,2102 +1,1782 @@
1
  {
2
- "best_global_step": 1600,
3
- "best_metric": 0.8849401473999023,
4
- "best_model_checkpoint": "./deit-CEMEDE/checkpoint-1600",
5
- "epoch": 2.0,
6
  "eval_steps": 100,
7
- "global_step": 2600,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.007692307692307693,
14
- "grad_norm": 14.492234230041504,
15
- "learning_rate": 0.0001997846153846154,
16
- "loss": 2.7483,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.015384615384615385,
21
- "grad_norm": 9.443517684936523,
22
- "learning_rate": 0.00019947692307692308,
23
- "loss": 1.914,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.023076923076923078,
28
- "grad_norm": 7.049248218536377,
29
- "learning_rate": 0.00019916923076923078,
30
- "loss": 1.3368,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.03076923076923077,
35
- "grad_norm": 14.415254592895508,
36
- "learning_rate": 0.00019886153846153848,
37
- "loss": 1.4175,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.038461538461538464,
42
- "grad_norm": 4.933824062347412,
43
- "learning_rate": 0.00019855384615384615,
44
- "loss": 0.8836,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.046153846153846156,
49
- "grad_norm": 8.808599472045898,
50
- "learning_rate": 0.00019824615384615385,
51
- "loss": 0.8803,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.05384615384615385,
56
- "grad_norm": 9.65746784210205,
57
- "learning_rate": 0.00019793846153846154,
58
- "loss": 0.9991,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.06153846153846154,
63
- "grad_norm": 19.919265747070312,
64
- "learning_rate": 0.00019763076923076924,
65
- "loss": 1.039,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.06923076923076923,
70
- "grad_norm": 8.035650253295898,
71
- "learning_rate": 0.0001973230769230769,
72
- "loss": 0.6571,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.07692307692307693,
77
- "grad_norm": 11.841368675231934,
78
- "learning_rate": 0.00019701538461538464,
79
- "loss": 0.8285,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.07692307692307693,
84
- "eval_accuracy": 0.5468378506894912,
85
- "eval_f1": 0.35243936126182984,
86
- "eval_loss": 1.523919701576233,
87
- "eval_runtime": 12.5719,
88
- "eval_samples_per_second": 167.277,
89
- "eval_steps_per_second": 20.92,
90
  "step": 100
91
  },
92
  {
93
  "epoch": 0.08461538461538462,
94
- "grad_norm": 8.712408065795898,
95
- "learning_rate": 0.00019670769230769233,
96
- "loss": 0.9902,
97
  "step": 110
98
  },
99
  {
100
  "epoch": 0.09230769230769231,
101
- "grad_norm": 23.094526290893555,
102
- "learning_rate": 0.0001964,
103
- "loss": 0.7044,
104
  "step": 120
105
  },
106
  {
107
  "epoch": 0.1,
108
- "grad_norm": 15.830307960510254,
109
- "learning_rate": 0.0001960923076923077,
110
- "loss": 0.5335,
111
  "step": 130
112
  },
113
  {
114
  "epoch": 0.1076923076923077,
115
- "grad_norm": 14.117691040039062,
116
- "learning_rate": 0.0001957846153846154,
117
- "loss": 0.69,
118
  "step": 140
119
  },
120
  {
121
  "epoch": 0.11538461538461539,
122
- "grad_norm": 9.931026458740234,
123
- "learning_rate": 0.0001954769230769231,
124
- "loss": 0.7231,
125
  "step": 150
126
  },
127
  {
128
  "epoch": 0.12307692307692308,
129
- "grad_norm": 4.095664978027344,
130
- "learning_rate": 0.00019516923076923077,
131
- "loss": 0.5577,
132
  "step": 160
133
  },
134
  {
135
  "epoch": 0.13076923076923078,
136
- "grad_norm": 7.839579105377197,
137
- "learning_rate": 0.00019486153846153846,
138
- "loss": 0.2998,
139
  "step": 170
140
  },
141
  {
142
  "epoch": 0.13846153846153847,
143
- "grad_norm": 9.043660163879395,
144
- "learning_rate": 0.00019455384615384616,
145
- "loss": 0.5577,
146
  "step": 180
147
  },
148
  {
149
  "epoch": 0.14615384615384616,
150
- "grad_norm": 14.179769515991211,
151
- "learning_rate": 0.00019424615384615386,
152
- "loss": 0.8946,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 0.15384615384615385,
157
- "grad_norm": 5.835270881652832,
158
- "learning_rate": 0.00019393846153846155,
159
- "loss": 0.6238,
160
  "step": 200
161
  },
162
  {
163
  "epoch": 0.15384615384615385,
164
- "eval_accuracy": 0.6880646695197337,
165
- "eval_f1": 0.6395354639429841,
166
- "eval_loss": 1.2642189264297485,
167
- "eval_runtime": 12.5255,
168
- "eval_samples_per_second": 167.898,
169
- "eval_steps_per_second": 20.997,
170
  "step": 200
171
  },
172
  {
173
  "epoch": 0.16153846153846155,
174
- "grad_norm": 9.955816268920898,
175
- "learning_rate": 0.00019363076923076923,
176
- "loss": 0.2387,
177
  "step": 210
178
  },
179
  {
180
  "epoch": 0.16923076923076924,
181
- "grad_norm": 18.43828773498535,
182
- "learning_rate": 0.00019332307692307695,
183
- "loss": 0.9951,
184
  "step": 220
185
  },
186
  {
187
  "epoch": 0.17692307692307693,
188
- "grad_norm": 12.050088882446289,
189
- "learning_rate": 0.00019301538461538462,
190
- "loss": 0.4487,
191
  "step": 230
192
  },
193
  {
194
  "epoch": 0.18461538461538463,
195
- "grad_norm": 4.2351155281066895,
196
- "learning_rate": 0.00019270769230769232,
197
- "loss": 0.6186,
198
  "step": 240
199
  },
200
  {
201
  "epoch": 0.19230769230769232,
202
- "grad_norm": 3.6490843296051025,
203
- "learning_rate": 0.00019240000000000001,
204
- "loss": 0.7701,
205
  "step": 250
206
  },
207
  {
208
  "epoch": 0.2,
209
- "grad_norm": 8.663690567016602,
210
- "learning_rate": 0.0001920923076923077,
211
- "loss": 0.3304,
212
  "step": 260
213
  },
214
  {
215
  "epoch": 0.2076923076923077,
216
- "grad_norm": 6.940672874450684,
217
- "learning_rate": 0.00019178461538461538,
218
- "loss": 0.622,
219
  "step": 270
220
  },
221
  {
222
  "epoch": 0.2153846153846154,
223
- "grad_norm": 8.70635986328125,
224
- "learning_rate": 0.00019147692307692308,
225
- "loss": 0.5354,
226
  "step": 280
227
  },
228
  {
229
  "epoch": 0.2230769230769231,
230
- "grad_norm": 1.5922820568084717,
231
- "learning_rate": 0.00019116923076923078,
232
- "loss": 0.681,
233
  "step": 290
234
  },
235
  {
236
  "epoch": 0.23076923076923078,
237
- "grad_norm": 10.358661651611328,
238
- "learning_rate": 0.00019086153846153847,
239
- "loss": 0.3715,
240
  "step": 300
241
  },
242
  {
243
  "epoch": 0.23076923076923078,
244
- "eval_accuracy": 0.6723728007608178,
245
- "eval_f1": 0.5919683394883382,
246
- "eval_loss": 1.1914079189300537,
247
- "eval_runtime": 12.5128,
248
- "eval_samples_per_second": 168.067,
249
- "eval_steps_per_second": 21.018,
250
  "step": 300
251
  },
252
  {
253
  "epoch": 0.23846153846153847,
254
- "grad_norm": 12.635391235351562,
255
- "learning_rate": 0.00019055384615384617,
256
- "loss": 0.2776,
257
  "step": 310
258
  },
259
  {
260
  "epoch": 0.24615384615384617,
261
- "grad_norm": 15.975918769836426,
262
- "learning_rate": 0.00019024615384615384,
263
- "loss": 0.469,
264
  "step": 320
265
  },
266
  {
267
  "epoch": 0.25384615384615383,
268
- "grad_norm": 21.361492156982422,
269
- "learning_rate": 0.00018993846153846157,
270
- "loss": 0.3723,
271
  "step": 330
272
  },
273
  {
274
  "epoch": 0.26153846153846155,
275
- "grad_norm": 6.521085739135742,
276
- "learning_rate": 0.00018963076923076924,
277
- "loss": 0.7115,
278
  "step": 340
279
  },
280
  {
281
  "epoch": 0.2692307692307692,
282
- "grad_norm": 8.805971145629883,
283
- "learning_rate": 0.00018932307692307693,
284
- "loss": 0.4367,
285
  "step": 350
286
  },
287
  {
288
  "epoch": 0.27692307692307694,
289
- "grad_norm": 8.943021774291992,
290
- "learning_rate": 0.0001890153846153846,
291
- "loss": 0.737,
292
  "step": 360
293
  },
294
  {
295
  "epoch": 0.2846153846153846,
296
- "grad_norm": 11.837237358093262,
297
- "learning_rate": 0.00018870769230769233,
298
- "loss": 0.6266,
299
  "step": 370
300
  },
301
  {
302
  "epoch": 0.2923076923076923,
303
- "grad_norm": 6.2347235679626465,
304
- "learning_rate": 0.0001884,
305
- "loss": 0.5612,
306
  "step": 380
307
  },
308
  {
309
  "epoch": 0.3,
310
- "grad_norm": 4.176232814788818,
311
- "learning_rate": 0.0001880923076923077,
312
- "loss": 0.5348,
313
  "step": 390
314
  },
315
  {
316
  "epoch": 0.3076923076923077,
317
- "grad_norm": 7.420060634613037,
318
- "learning_rate": 0.0001877846153846154,
319
- "loss": 0.4736,
320
  "step": 400
321
  },
322
  {
323
  "epoch": 0.3076923076923077,
324
- "eval_accuracy": 0.7137422729434142,
325
- "eval_f1": 0.6850347529661042,
326
- "eval_loss": 1.140069603919983,
327
- "eval_runtime": 12.5717,
328
- "eval_samples_per_second": 167.281,
329
- "eval_steps_per_second": 20.92,
330
  "step": 400
331
  },
332
  {
333
  "epoch": 0.3153846153846154,
334
- "grad_norm": 21.14431381225586,
335
- "learning_rate": 0.0001874769230769231,
336
- "loss": 0.3025,
337
  "step": 410
338
  },
339
  {
340
  "epoch": 0.3230769230769231,
341
- "grad_norm": 4.992921829223633,
342
- "learning_rate": 0.0001871692307692308,
343
- "loss": 0.4013,
344
  "step": 420
345
  },
346
  {
347
  "epoch": 0.33076923076923076,
348
- "grad_norm": 2.906620740890503,
349
- "learning_rate": 0.00018686153846153846,
350
- "loss": 0.6153,
351
  "step": 430
352
  },
353
  {
354
  "epoch": 0.3384615384615385,
355
- "grad_norm": 8.505741119384766,
356
- "learning_rate": 0.00018655384615384616,
357
- "loss": 0.4048,
358
  "step": 440
359
  },
360
  {
361
  "epoch": 0.34615384615384615,
362
- "grad_norm": 1.9161285161972046,
363
- "learning_rate": 0.00018624615384615385,
364
- "loss": 0.36,
365
  "step": 450
366
  },
367
  {
368
  "epoch": 0.35384615384615387,
369
- "grad_norm": 14.800432205200195,
370
- "learning_rate": 0.00018593846153846155,
371
- "loss": 0.557,
372
  "step": 460
373
  },
374
  {
375
  "epoch": 0.36153846153846153,
376
- "grad_norm": 11.860761642456055,
377
- "learning_rate": 0.00018563076923076922,
378
- "loss": 0.3601,
379
  "step": 470
380
  },
381
  {
382
  "epoch": 0.36923076923076925,
383
- "grad_norm": 1.1859639883041382,
384
- "learning_rate": 0.00018532307692307694,
385
- "loss": 0.2586,
386
  "step": 480
387
  },
388
  {
389
  "epoch": 0.3769230769230769,
390
- "grad_norm": 6.978835582733154,
391
- "learning_rate": 0.00018501538461538464,
392
- "loss": 0.2885,
393
  "step": 490
394
  },
395
  {
396
  "epoch": 0.38461538461538464,
397
- "grad_norm": 0.2397751659154892,
398
- "learning_rate": 0.0001847076923076923,
399
- "loss": 0.382,
400
  "step": 500
401
  },
402
  {
403
  "epoch": 0.38461538461538464,
404
- "eval_accuracy": 0.6504992867332382,
405
- "eval_f1": 0.6265039413087525,
406
- "eval_loss": 1.5841457843780518,
407
- "eval_runtime": 12.5458,
408
- "eval_samples_per_second": 167.625,
409
- "eval_steps_per_second": 20.963,
410
  "step": 500
411
  },
412
  {
413
  "epoch": 0.3923076923076923,
414
- "grad_norm": 14.071270942687988,
415
- "learning_rate": 0.0001844,
416
- "loss": 0.4412,
417
  "step": 510
418
  },
419
  {
420
  "epoch": 0.4,
421
- "grad_norm": 2.5233845710754395,
422
- "learning_rate": 0.0001840923076923077,
423
- "loss": 0.3486,
424
  "step": 520
425
  },
426
  {
427
  "epoch": 0.4076923076923077,
428
- "grad_norm": 8.451884269714355,
429
- "learning_rate": 0.0001837846153846154,
430
- "loss": 0.3113,
431
  "step": 530
432
  },
433
  {
434
  "epoch": 0.4153846153846154,
435
- "grad_norm": 6.481016159057617,
436
- "learning_rate": 0.00018347692307692307,
437
- "loss": 0.4551,
438
  "step": 540
439
  },
440
  {
441
  "epoch": 0.4230769230769231,
442
- "grad_norm": 9.866789817810059,
443
- "learning_rate": 0.00018316923076923077,
444
- "loss": 0.5613,
445
  "step": 550
446
  },
447
  {
448
  "epoch": 0.4307692307692308,
449
- "grad_norm": 0.32447025179862976,
450
- "learning_rate": 0.00018286153846153847,
451
- "loss": 0.4739,
452
  "step": 560
453
  },
454
  {
455
  "epoch": 0.43846153846153846,
456
- "grad_norm": 0.5857470631599426,
457
- "learning_rate": 0.00018255384615384617,
458
- "loss": 0.4318,
459
  "step": 570
460
  },
461
  {
462
  "epoch": 0.4461538461538462,
463
- "grad_norm": 9.131505966186523,
464
- "learning_rate": 0.00018224615384615384,
465
- "loss": 0.2337,
466
  "step": 580
467
  },
468
  {
469
  "epoch": 0.45384615384615384,
470
- "grad_norm": 7.286001205444336,
471
- "learning_rate": 0.00018193846153846153,
472
- "loss": 0.2116,
473
  "step": 590
474
  },
475
  {
476
  "epoch": 0.46153846153846156,
477
- "grad_norm": 2.5808300971984863,
478
- "learning_rate": 0.00018163076923076926,
479
- "loss": 0.5738,
480
  "step": 600
481
  },
482
  {
483
  "epoch": 0.46153846153846156,
484
- "eval_accuracy": 0.6932952924393724,
485
- "eval_f1": 0.6330824959687575,
486
- "eval_loss": 1.3199223279953003,
487
- "eval_runtime": 12.5804,
488
- "eval_samples_per_second": 167.165,
489
- "eval_steps_per_second": 20.906,
490
  "step": 600
491
  },
492
  {
493
  "epoch": 0.46923076923076923,
494
- "grad_norm": 1.627287745475769,
495
- "learning_rate": 0.00018132307692307693,
496
- "loss": 0.1588,
497
  "step": 610
498
  },
499
  {
500
  "epoch": 0.47692307692307695,
501
- "grad_norm": 15.70780086517334,
502
- "learning_rate": 0.00018101538461538463,
503
- "loss": 0.2552,
504
  "step": 620
505
  },
506
  {
507
  "epoch": 0.4846153846153846,
508
- "grad_norm": 11.650376319885254,
509
- "learning_rate": 0.00018070769230769232,
510
- "loss": 0.3532,
511
  "step": 630
512
  },
513
  {
514
  "epoch": 0.49230769230769234,
515
- "grad_norm": 9.004545211791992,
516
- "learning_rate": 0.00018040000000000002,
517
- "loss": 0.3774,
518
  "step": 640
519
  },
520
  {
521
  "epoch": 0.5,
522
- "grad_norm": 6.5288801193237305,
523
- "learning_rate": 0.0001800923076923077,
524
- "loss": 0.4177,
525
  "step": 650
526
  },
527
  {
528
  "epoch": 0.5076923076923077,
529
- "grad_norm": 6.529111385345459,
530
- "learning_rate": 0.0001797846153846154,
531
- "loss": 0.3681,
532
  "step": 660
533
  },
534
  {
535
  "epoch": 0.5153846153846153,
536
- "grad_norm": 16.29365348815918,
537
- "learning_rate": 0.00017947692307692309,
538
- "loss": 0.4766,
539
  "step": 670
540
  },
541
  {
542
  "epoch": 0.5230769230769231,
543
- "grad_norm": 2.9953908920288086,
544
- "learning_rate": 0.00017916923076923078,
545
- "loss": 0.217,
546
  "step": 680
547
  },
548
  {
549
  "epoch": 0.5307692307692308,
550
- "grad_norm": 9.404020309448242,
551
- "learning_rate": 0.00017886153846153848,
552
- "loss": 0.3664,
553
  "step": 690
554
  },
555
  {
556
  "epoch": 0.5384615384615384,
557
- "grad_norm": 0.16106829047203064,
558
- "learning_rate": 0.00017855384615384615,
559
- "loss": 0.2276,
560
  "step": 700
561
  },
562
  {
563
  "epoch": 0.5384615384615384,
564
- "eval_accuracy": 0.7517831669044223,
565
- "eval_f1": 0.7259228279152882,
566
- "eval_loss": 1.0970804691314697,
567
- "eval_runtime": 12.6544,
568
- "eval_samples_per_second": 166.187,
569
- "eval_steps_per_second": 20.783,
570
  "step": 700
571
  },
572
  {
573
  "epoch": 0.5461538461538461,
574
- "grad_norm": 14.301560401916504,
575
- "learning_rate": 0.00017824615384615388,
576
- "loss": 0.4534,
577
  "step": 710
578
  },
579
  {
580
  "epoch": 0.5538461538461539,
581
- "grad_norm": 10.59310245513916,
582
- "learning_rate": 0.00017793846153846155,
583
- "loss": 0.2536,
584
  "step": 720
585
  },
586
  {
587
  "epoch": 0.5615384615384615,
588
- "grad_norm": 6.413036346435547,
589
- "learning_rate": 0.00017763076923076924,
590
- "loss": 0.4327,
591
  "step": 730
592
  },
593
  {
594
  "epoch": 0.5692307692307692,
595
- "grad_norm": 0.8140048384666443,
596
- "learning_rate": 0.0001773230769230769,
597
- "loss": 0.3722,
598
  "step": 740
599
  },
600
  {
601
  "epoch": 0.5769230769230769,
602
- "grad_norm": 5.8807597160339355,
603
- "learning_rate": 0.00017701538461538464,
604
- "loss": 0.355,
605
  "step": 750
606
  },
607
  {
608
  "epoch": 0.5846153846153846,
609
- "grad_norm": 3.7453725337982178,
610
- "learning_rate": 0.0001767076923076923,
611
- "loss": 0.3793,
612
  "step": 760
613
  },
614
  {
615
  "epoch": 0.5923076923076923,
616
- "grad_norm": 28.321508407592773,
617
- "learning_rate": 0.0001764,
618
- "loss": 0.2802,
619
  "step": 770
620
  },
621
  {
622
  "epoch": 0.6,
623
- "grad_norm": 10.241598129272461,
624
- "learning_rate": 0.0001760923076923077,
625
- "loss": 0.7753,
626
  "step": 780
627
  },
628
  {
629
  "epoch": 0.6076923076923076,
630
- "grad_norm": 17.217174530029297,
631
- "learning_rate": 0.0001757846153846154,
632
- "loss": 0.2884,
633
  "step": 790
634
  },
635
  {
636
  "epoch": 0.6153846153846154,
637
- "grad_norm": 11.713438987731934,
638
- "learning_rate": 0.0001754769230769231,
639
- "loss": 0.3142,
640
  "step": 800
641
  },
642
  {
643
  "epoch": 0.6153846153846154,
644
- "eval_accuracy": 0.7351402757964812,
645
- "eval_f1": 0.7117245960074927,
646
- "eval_loss": 1.2919145822525024,
647
- "eval_runtime": 12.6264,
648
- "eval_samples_per_second": 166.555,
649
- "eval_steps_per_second": 20.829,
650
  "step": 800
651
  },
652
  {
653
  "epoch": 0.6230769230769231,
654
- "grad_norm": 9.485101699829102,
655
- "learning_rate": 0.00017516923076923077,
656
- "loss": 0.3256,
657
  "step": 810
658
  },
659
  {
660
  "epoch": 0.6307692307692307,
661
- "grad_norm": 4.645462512969971,
662
- "learning_rate": 0.00017486153846153846,
663
- "loss": 0.1156,
664
  "step": 820
665
  },
666
  {
667
  "epoch": 0.6384615384615384,
668
- "grad_norm": 8.31661319732666,
669
- "learning_rate": 0.00017455384615384616,
670
- "loss": 0.2883,
671
  "step": 830
672
  },
673
  {
674
  "epoch": 0.6461538461538462,
675
- "grad_norm": 9.54610824584961,
676
- "learning_rate": 0.00017424615384615386,
677
- "loss": 0.4546,
678
  "step": 840
679
  },
680
  {
681
  "epoch": 0.6538461538461539,
682
- "grad_norm": 0.08736976981163025,
683
- "learning_rate": 0.00017393846153846153,
684
- "loss": 0.5806,
685
  "step": 850
686
  },
687
  {
688
  "epoch": 0.6615384615384615,
689
- "grad_norm": 3.294229507446289,
690
- "learning_rate": 0.00017363076923076925,
691
- "loss": 0.4676,
692
  "step": 860
693
  },
694
  {
695
  "epoch": 0.6692307692307692,
696
- "grad_norm": 2.508976936340332,
697
- "learning_rate": 0.00017332307692307692,
698
- "loss": 0.3833,
699
  "step": 870
700
  },
701
  {
702
  "epoch": 0.676923076923077,
703
- "grad_norm": 4.155116558074951,
704
- "learning_rate": 0.00017301538461538462,
705
- "loss": 0.5317,
706
  "step": 880
707
  },
708
  {
709
  "epoch": 0.6846153846153846,
710
- "grad_norm": 1.9500211477279663,
711
- "learning_rate": 0.00017270769230769232,
712
- "loss": 0.3115,
713
  "step": 890
714
  },
715
  {
716
  "epoch": 0.6923076923076923,
717
- "grad_norm": 8.204526901245117,
718
- "learning_rate": 0.00017240000000000002,
719
- "loss": 0.1997,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 0.6923076923076923,
724
- "eval_accuracy": 0.698525915359011,
725
- "eval_f1": 0.6896796828544773,
726
- "eval_loss": 1.143184781074524,
727
- "eval_runtime": 12.5409,
728
- "eval_samples_per_second": 167.691,
729
- "eval_steps_per_second": 20.971,
730
  "step": 900
731
  },
732
  {
733
  "epoch": 0.7,
734
- "grad_norm": 0.08845722675323486,
735
- "learning_rate": 0.00017209230769230771,
736
- "loss": 0.3255,
737
  "step": 910
738
  },
739
  {
740
  "epoch": 0.7076923076923077,
741
- "grad_norm": 0.24324318766593933,
742
- "learning_rate": 0.00017178461538461538,
743
- "loss": 0.1837,
744
  "step": 920
745
  },
746
  {
747
  "epoch": 0.7153846153846154,
748
- "grad_norm": 9.8903169631958,
749
- "learning_rate": 0.00017147692307692308,
750
- "loss": 0.1985,
751
  "step": 930
752
  },
753
  {
754
  "epoch": 0.7230769230769231,
755
- "grad_norm": 7.903842449188232,
756
- "learning_rate": 0.00017116923076923078,
757
- "loss": 0.237,
758
  "step": 940
759
  },
760
  {
761
  "epoch": 0.7307692307692307,
762
- "grad_norm": 6.531442642211914,
763
- "learning_rate": 0.00017086153846153848,
764
- "loss": 0.2567,
765
  "step": 950
766
  },
767
  {
768
  "epoch": 0.7384615384615385,
769
- "grad_norm": 10.636625289916992,
770
- "learning_rate": 0.00017055384615384615,
771
- "loss": 0.214,
772
  "step": 960
773
  },
774
  {
775
  "epoch": 0.7461538461538462,
776
- "grad_norm": 0.12718407809734344,
777
- "learning_rate": 0.00017024615384615384,
778
- "loss": 0.2414,
779
  "step": 970
780
  },
781
  {
782
  "epoch": 0.7538461538461538,
783
- "grad_norm": 15.172039031982422,
784
- "learning_rate": 0.00016993846153846157,
785
- "loss": 0.2235,
786
  "step": 980
787
  },
788
  {
789
  "epoch": 0.7615384615384615,
790
- "grad_norm": 0.3109220862388611,
791
- "learning_rate": 0.00016963076923076924,
792
- "loss": 0.3238,
793
  "step": 990
794
  },
795
  {
796
  "epoch": 0.7692307692307693,
797
- "grad_norm": 17.5101261138916,
798
- "learning_rate": 0.00016932307692307694,
799
- "loss": 0.4917,
800
  "step": 1000
801
  },
802
  {
803
  "epoch": 0.7692307692307693,
804
- "eval_accuracy": 0.7574893009985735,
805
- "eval_f1": 0.7479705301391664,
806
- "eval_loss": 1.0517505407333374,
807
- "eval_runtime": 13.879,
808
- "eval_samples_per_second": 151.524,
809
- "eval_steps_per_second": 18.949,
810
  "step": 1000
811
  },
812
  {
813
  "epoch": 0.7769230769230769,
814
- "grad_norm": 0.8310131430625916,
815
- "learning_rate": 0.00016901538461538463,
816
- "loss": 0.3775,
817
  "step": 1010
818
  },
819
  {
820
  "epoch": 0.7846153846153846,
821
- "grad_norm": 6.707934856414795,
822
- "learning_rate": 0.00016870769230769233,
823
- "loss": 0.2104,
824
  "step": 1020
825
  },
826
  {
827
  "epoch": 0.7923076923076923,
828
- "grad_norm": 22.98932647705078,
829
- "learning_rate": 0.0001684,
830
- "loss": 0.3316,
831
  "step": 1030
832
  },
833
  {
834
  "epoch": 0.8,
835
- "grad_norm": 9.406743049621582,
836
- "learning_rate": 0.0001680923076923077,
837
- "loss": 0.3291,
838
  "step": 1040
839
  },
840
  {
841
  "epoch": 0.8076923076923077,
842
- "grad_norm": 9.376912117004395,
843
- "learning_rate": 0.0001677846153846154,
844
- "loss": 0.1759,
845
  "step": 1050
846
  },
847
  {
848
  "epoch": 0.8153846153846154,
849
- "grad_norm": 3.6835243701934814,
850
- "learning_rate": 0.0001674769230769231,
851
- "loss": 0.2441,
852
  "step": 1060
853
  },
854
  {
855
  "epoch": 0.823076923076923,
856
- "grad_norm": 13.692422866821289,
857
- "learning_rate": 0.00016716923076923076,
858
- "loss": 0.4536,
859
  "step": 1070
860
  },
861
  {
862
  "epoch": 0.8307692307692308,
863
- "grad_norm": 1.0375890731811523,
864
- "learning_rate": 0.00016686153846153846,
865
- "loss": 0.4734,
866
  "step": 1080
867
  },
868
  {
869
  "epoch": 0.8384615384615385,
870
- "grad_norm": 15.942821502685547,
871
- "learning_rate": 0.00016655384615384618,
872
- "loss": 0.1703,
873
  "step": 1090
874
  },
875
  {
876
  "epoch": 0.8461538461538461,
877
- "grad_norm": 4.353820323944092,
878
- "learning_rate": 0.00016624615384615385,
879
- "loss": 0.1029,
880
  "step": 1100
881
  },
882
  {
883
  "epoch": 0.8461538461538461,
884
- "eval_accuracy": 0.792677127912506,
885
- "eval_f1": 0.7361843876143177,
886
- "eval_loss": 0.9429498314857483,
887
- "eval_runtime": 12.6397,
888
- "eval_samples_per_second": 166.381,
889
- "eval_steps_per_second": 20.808,
890
  "step": 1100
891
  },
892
  {
893
  "epoch": 0.8538461538461538,
894
- "grad_norm": 2.139413595199585,
895
- "learning_rate": 0.00016593846153846155,
896
- "loss": 0.3831,
897
  "step": 1110
898
  },
899
  {
900
  "epoch": 0.8615384615384616,
901
- "grad_norm": 10.040633201599121,
902
- "learning_rate": 0.00016563076923076922,
903
- "loss": 0.3734,
904
  "step": 1120
905
  },
906
  {
907
  "epoch": 0.8692307692307693,
908
- "grad_norm": 1.0804554224014282,
909
- "learning_rate": 0.00016532307692307695,
910
- "loss": 0.0746,
911
  "step": 1130
912
  },
913
  {
914
  "epoch": 0.8769230769230769,
915
- "grad_norm": 0.1748315691947937,
916
- "learning_rate": 0.00016501538461538462,
917
- "loss": 0.4497,
918
  "step": 1140
919
  },
920
  {
921
  "epoch": 0.8846153846153846,
922
- "grad_norm": 14.415447235107422,
923
- "learning_rate": 0.00016470769230769231,
924
- "loss": 0.2803,
925
  "step": 1150
926
  },
927
  {
928
  "epoch": 0.8923076923076924,
929
- "grad_norm": 1.3445711135864258,
930
- "learning_rate": 0.0001644,
931
- "loss": 0.4024,
932
  "step": 1160
933
  },
934
  {
935
  "epoch": 0.9,
936
- "grad_norm": 1.1939899921417236,
937
- "learning_rate": 0.0001640923076923077,
938
- "loss": 0.2185,
939
  "step": 1170
940
  },
941
  {
942
  "epoch": 0.9076923076923077,
943
- "grad_norm": 5.012731075286865,
944
- "learning_rate": 0.0001637846153846154,
945
- "loss": 0.1712,
946
  "step": 1180
947
  },
948
  {
949
  "epoch": 0.9153846153846154,
950
- "grad_norm": 0.46578100323677063,
951
- "learning_rate": 0.00016347692307692308,
952
- "loss": 0.2235,
953
  "step": 1190
954
  },
955
  {
956
  "epoch": 0.9230769230769231,
957
- "grad_norm": 8.191433906555176,
958
- "learning_rate": 0.00016316923076923077,
959
- "loss": 0.3715,
960
  "step": 1200
961
  },
962
  {
963
  "epoch": 0.9230769230769231,
964
- "eval_accuracy": 0.7051830718021873,
965
- "eval_f1": 0.6973363124749459,
966
- "eval_loss": 1.2021244764328003,
967
- "eval_runtime": 12.6278,
968
- "eval_samples_per_second": 166.537,
969
- "eval_steps_per_second": 20.827,
970
  "step": 1200
971
  },
972
  {
973
  "epoch": 0.9307692307692308,
974
- "grad_norm": 0.06772993505001068,
975
- "learning_rate": 0.00016286153846153847,
976
- "loss": 0.2099,
977
  "step": 1210
978
  },
979
  {
980
  "epoch": 0.9384615384615385,
981
- "grad_norm": 13.804479598999023,
982
- "learning_rate": 0.00016255384615384617,
983
- "loss": 0.4226,
984
  "step": 1220
985
  },
986
  {
987
  "epoch": 0.9461538461538461,
988
- "grad_norm": 9.784256935119629,
989
- "learning_rate": 0.00016224615384615384,
990
- "loss": 0.4287,
991
  "step": 1230
992
  },
993
  {
994
  "epoch": 0.9538461538461539,
995
- "grad_norm": 0.5529341101646423,
996
- "learning_rate": 0.00016193846153846156,
997
- "loss": 0.3275,
998
  "step": 1240
999
  },
1000
  {
1001
  "epoch": 0.9615384615384616,
1002
- "grad_norm": 1.4057235717773438,
1003
- "learning_rate": 0.00016163076923076923,
1004
- "loss": 0.3983,
1005
  "step": 1250
1006
  },
1007
  {
1008
  "epoch": 0.9692307692307692,
1009
- "grad_norm": 2.183885335922241,
1010
- "learning_rate": 0.00016132307692307693,
1011
- "loss": 0.1525,
1012
  "step": 1260
1013
  },
1014
  {
1015
  "epoch": 0.9769230769230769,
1016
- "grad_norm": 4.444169521331787,
1017
- "learning_rate": 0.0001610153846153846,
1018
- "loss": 0.3299,
1019
  "step": 1270
1020
  },
1021
  {
1022
  "epoch": 0.9846153846153847,
1023
- "grad_norm": 7.490978240966797,
1024
- "learning_rate": 0.00016070769230769233,
1025
- "loss": 0.3253,
1026
  "step": 1280
1027
  },
1028
  {
1029
  "epoch": 0.9923076923076923,
1030
- "grad_norm": 7.95802116394043,
1031
- "learning_rate": 0.00016040000000000002,
1032
- "loss": 0.1808,
1033
  "step": 1290
1034
  },
1035
  {
1036
  "epoch": 1.0,
1037
- "grad_norm": 0.03983300179243088,
1038
- "learning_rate": 0.0001600923076923077,
1039
- "loss": 0.5432,
1040
  "step": 1300
1041
  },
1042
  {
1043
  "epoch": 1.0,
1044
- "eval_accuracy": 0.7051830718021873,
1045
- "eval_f1": 0.6667419004227463,
1046
- "eval_loss": 1.296899676322937,
1047
- "eval_runtime": 12.4667,
1048
- "eval_samples_per_second": 168.69,
1049
- "eval_steps_per_second": 21.096,
1050
  "step": 1300
1051
  },
1052
  {
1053
  "epoch": 1.0076923076923077,
1054
- "grad_norm": 1.2856322526931763,
1055
- "learning_rate": 0.0001597846153846154,
1056
- "loss": 0.3966,
1057
  "step": 1310
1058
  },
1059
  {
1060
  "epoch": 1.0153846153846153,
1061
- "grad_norm": 0.08759118616580963,
1062
- "learning_rate": 0.0001594769230769231,
1063
- "loss": 0.1879,
1064
  "step": 1320
1065
  },
1066
  {
1067
  "epoch": 1.023076923076923,
1068
- "grad_norm": 1.4174461364746094,
1069
- "learning_rate": 0.00015916923076923079,
1070
- "loss": 0.1145,
1071
  "step": 1330
1072
  },
1073
  {
1074
  "epoch": 1.0307692307692307,
1075
- "grad_norm": 1.3266290426254272,
1076
- "learning_rate": 0.00015886153846153846,
1077
- "loss": 0.0771,
1078
  "step": 1340
1079
  },
1080
  {
1081
  "epoch": 1.0384615384615385,
1082
- "grad_norm": 12.582904815673828,
1083
- "learning_rate": 0.00015855384615384615,
1084
- "loss": 0.3993,
1085
  "step": 1350
1086
  },
1087
  {
1088
  "epoch": 1.0461538461538462,
1089
- "grad_norm": 0.2921277582645416,
1090
- "learning_rate": 0.00015824615384615385,
1091
- "loss": 0.1994,
1092
  "step": 1360
1093
  },
1094
  {
1095
  "epoch": 1.0538461538461539,
1096
- "grad_norm": 8.869648933410645,
1097
- "learning_rate": 0.00015793846153846155,
1098
- "loss": 0.2719,
1099
  "step": 1370
1100
  },
1101
  {
1102
  "epoch": 1.0615384615384615,
1103
- "grad_norm": 0.026608692482113838,
1104
- "learning_rate": 0.00015763076923076924,
1105
- "loss": 0.1551,
1106
  "step": 1380
1107
  },
1108
  {
1109
  "epoch": 1.0692307692307692,
1110
- "grad_norm": 0.24334125220775604,
1111
- "learning_rate": 0.00015732307692307694,
1112
- "loss": 0.1558,
1113
  "step": 1390
1114
  },
1115
  {
1116
  "epoch": 1.0769230769230769,
1117
- "grad_norm": 2.7597334384918213,
1118
- "learning_rate": 0.00015701538461538464,
1119
- "loss": 0.1645,
1120
  "step": 1400
1121
  },
1122
  {
1123
  "epoch": 1.0769230769230769,
1124
- "eval_accuracy": 0.750832144555397,
1125
- "eval_f1": 0.7742107003596032,
1126
- "eval_loss": 1.1917133331298828,
1127
- "eval_runtime": 12.6031,
1128
- "eval_samples_per_second": 166.864,
1129
- "eval_steps_per_second": 20.868,
1130
  "step": 1400
1131
  },
1132
  {
1133
  "epoch": 1.0846153846153845,
1134
- "grad_norm": 7.81436824798584,
1135
- "learning_rate": 0.0001567076923076923,
1136
- "loss": 0.3721,
1137
  "step": 1410
1138
  },
1139
  {
1140
  "epoch": 1.0923076923076924,
1141
- "grad_norm": 6.642779350280762,
1142
- "learning_rate": 0.0001564,
1143
- "loss": 0.2039,
1144
  "step": 1420
1145
  },
1146
  {
1147
  "epoch": 1.1,
1148
- "grad_norm": 1.3150064945220947,
1149
- "learning_rate": 0.0001560923076923077,
1150
- "loss": 0.0774,
1151
  "step": 1430
1152
  },
1153
  {
1154
  "epoch": 1.1076923076923078,
1155
- "grad_norm": 0.05881008133292198,
1156
- "learning_rate": 0.0001557846153846154,
1157
- "loss": 0.0806,
1158
  "step": 1440
1159
  },
1160
  {
1161
  "epoch": 1.1153846153846154,
1162
- "grad_norm": 0.20943497121334076,
1163
- "learning_rate": 0.00015547692307692307,
1164
- "loss": 0.0983,
1165
  "step": 1450
1166
  },
1167
  {
1168
  "epoch": 1.123076923076923,
1169
- "grad_norm": 0.2612471282482147,
1170
- "learning_rate": 0.00015516923076923077,
1171
- "loss": 0.2339,
1172
  "step": 1460
1173
  },
1174
  {
1175
  "epoch": 1.1307692307692307,
1176
- "grad_norm": 0.4519173502922058,
1177
- "learning_rate": 0.00015486153846153847,
1178
- "loss": 0.043,
1179
  "step": 1470
1180
  },
1181
  {
1182
  "epoch": 1.1384615384615384,
1183
- "grad_norm": 5.028959274291992,
1184
- "learning_rate": 0.00015455384615384616,
1185
- "loss": 0.1735,
1186
  "step": 1480
1187
  },
1188
  {
1189
  "epoch": 1.146153846153846,
1190
- "grad_norm": 6.712516784667969,
1191
- "learning_rate": 0.00015424615384615386,
1192
- "loss": 0.1878,
1193
  "step": 1490
1194
  },
1195
  {
1196
  "epoch": 1.1538461538461537,
1197
- "grad_norm": 0.013659660704433918,
1198
- "learning_rate": 0.00015393846153846153,
1199
- "loss": 0.1584,
1200
  "step": 1500
1201
  },
1202
  {
1203
  "epoch": 1.1538461538461537,
1204
- "eval_accuracy": 0.7689015691868759,
1205
- "eval_f1": 0.7645259140256611,
1206
- "eval_loss": 1.0244289636611938,
1207
- "eval_runtime": 12.573,
1208
- "eval_samples_per_second": 167.263,
1209
- "eval_steps_per_second": 20.918,
1210
  "step": 1500
1211
  },
1212
  {
1213
  "epoch": 1.1615384615384616,
1214
- "grad_norm": 0.012179275043308735,
1215
- "learning_rate": 0.00015363076923076926,
1216
- "loss": 0.0284,
1217
  "step": 1510
1218
  },
1219
  {
1220
  "epoch": 1.1692307692307693,
1221
- "grad_norm": 6.988743782043457,
1222
- "learning_rate": 0.00015332307692307693,
1223
- "loss": 0.1996,
1224
  "step": 1520
1225
  },
1226
  {
1227
  "epoch": 1.176923076923077,
1228
- "grad_norm": 1.0078998804092407,
1229
- "learning_rate": 0.00015301538461538462,
1230
- "loss": 0.1034,
1231
  "step": 1530
1232
  },
1233
  {
1234
  "epoch": 1.1846153846153846,
1235
- "grad_norm": 6.753546237945557,
1236
- "learning_rate": 0.0001527076923076923,
1237
- "loss": 0.2781,
1238
  "step": 1540
1239
  },
1240
  {
1241
  "epoch": 1.1923076923076923,
1242
- "grad_norm": 8.350870132446289,
1243
- "learning_rate": 0.00015240000000000002,
1244
- "loss": 0.1236,
1245
  "step": 1550
1246
  },
1247
  {
1248
  "epoch": 1.2,
1249
- "grad_norm": 0.2813374102115631,
1250
- "learning_rate": 0.0001520923076923077,
1251
- "loss": 0.1246,
1252
  "step": 1560
1253
  },
1254
  {
1255
  "epoch": 1.2076923076923076,
1256
- "grad_norm": 0.10124306380748749,
1257
- "learning_rate": 0.00015178461538461539,
1258
- "loss": 0.1692,
1259
  "step": 1570
1260
  },
1261
  {
1262
  "epoch": 1.2153846153846155,
1263
- "grad_norm": 6.230766773223877,
1264
- "learning_rate": 0.00015147692307692308,
1265
- "loss": 0.2269,
1266
  "step": 1580
1267
  },
1268
  {
1269
  "epoch": 1.2230769230769232,
1270
- "grad_norm": 0.22881397604942322,
1271
- "learning_rate": 0.00015116923076923078,
1272
- "loss": 0.1601,
1273
  "step": 1590
1274
  },
1275
  {
1276
  "epoch": 1.2307692307692308,
1277
- "grad_norm": 3.512399196624756,
1278
- "learning_rate": 0.00015086153846153848,
1279
- "loss": 0.0873,
1280
  "step": 1600
1281
  },
1282
  {
1283
  "epoch": 1.2307692307692308,
1284
- "eval_accuracy": 0.8155016642891108,
1285
- "eval_f1": 0.7820920610319438,
1286
- "eval_loss": 0.8849401473999023,
1287
- "eval_runtime": 12.3964,
1288
- "eval_samples_per_second": 169.647,
1289
- "eval_steps_per_second": 21.216,
1290
  "step": 1600
1291
  },
1292
  {
1293
  "epoch": 1.2384615384615385,
1294
- "grad_norm": 0.2611481249332428,
1295
- "learning_rate": 0.00015055384615384615,
1296
- "loss": 0.2379,
1297
  "step": 1610
1298
  },
1299
  {
1300
  "epoch": 1.2461538461538462,
1301
- "grad_norm": 0.1190253347158432,
1302
- "learning_rate": 0.00015024615384615385,
1303
- "loss": 0.0495,
1304
  "step": 1620
1305
  },
1306
  {
1307
  "epoch": 1.2538461538461538,
1308
- "grad_norm": 0.01619214005768299,
1309
- "learning_rate": 0.00014993846153846154,
1310
- "loss": 0.0515,
1311
  "step": 1630
1312
  },
1313
  {
1314
  "epoch": 1.2615384615384615,
1315
- "grad_norm": 0.008227836340665817,
1316
- "learning_rate": 0.00014963076923076924,
1317
- "loss": 0.0454,
1318
  "step": 1640
1319
  },
1320
  {
1321
  "epoch": 1.2692307692307692,
1322
- "grad_norm": 6.544041633605957,
1323
- "learning_rate": 0.0001493230769230769,
1324
- "loss": 0.1042,
1325
  "step": 1650
1326
  },
1327
  {
1328
  "epoch": 1.2769230769230768,
1329
- "grad_norm": 11.813011169433594,
1330
- "learning_rate": 0.00014901538461538463,
1331
- "loss": 0.3294,
1332
  "step": 1660
1333
  },
1334
  {
1335
  "epoch": 1.2846153846153845,
1336
- "grad_norm": 0.6227591633796692,
1337
- "learning_rate": 0.00014870769230769233,
1338
- "loss": 0.2643,
1339
  "step": 1670
1340
  },
1341
  {
1342
  "epoch": 1.2923076923076924,
1343
- "grad_norm": 5.635829925537109,
1344
- "learning_rate": 0.0001484,
1345
- "loss": 0.1771,
1346
  "step": 1680
1347
  },
1348
  {
1349
  "epoch": 1.3,
1350
- "grad_norm": 2.552438259124756,
1351
- "learning_rate": 0.0001480923076923077,
1352
- "loss": 0.0987,
1353
  "step": 1690
1354
  },
1355
  {
1356
  "epoch": 1.3076923076923077,
1357
- "grad_norm": 0.07186438143253326,
1358
- "learning_rate": 0.0001477846153846154,
1359
- "loss": 0.3077,
1360
  "step": 1700
1361
  },
1362
  {
1363
  "epoch": 1.3076923076923077,
1364
- "eval_accuracy": 0.7822158820732287,
1365
- "eval_f1": 0.725669962539516,
1366
- "eval_loss": 0.9735142588615417,
1367
- "eval_runtime": 12.5089,
1368
- "eval_samples_per_second": 168.12,
1369
- "eval_steps_per_second": 21.025,
1370
  "step": 1700
1371
  },
1372
  {
1373
  "epoch": 1.3153846153846154,
1374
- "grad_norm": 2.4193100929260254,
1375
- "learning_rate": 0.0001474769230769231,
1376
- "loss": 0.2109,
1377
  "step": 1710
1378
  },
1379
  {
1380
  "epoch": 1.323076923076923,
1381
- "grad_norm": 0.001091918908059597,
1382
- "learning_rate": 0.00014716923076923076,
1383
- "loss": 0.1222,
1384
  "step": 1720
1385
  },
1386
  {
1387
  "epoch": 1.3307692307692307,
1388
- "grad_norm": 15.755866050720215,
1389
- "learning_rate": 0.00014686153846153846,
1390
- "loss": 0.3006,
1391
  "step": 1730
1392
  },
1393
  {
1394
  "epoch": 1.3384615384615386,
1395
- "grad_norm": 16.329692840576172,
1396
- "learning_rate": 0.00014655384615384616,
1397
- "loss": 0.1837,
1398
  "step": 1740
1399
  },
1400
  {
1401
  "epoch": 1.3461538461538463,
1402
- "grad_norm": 6.158926963806152,
1403
- "learning_rate": 0.00014624615384615386,
1404
- "loss": 0.1682,
1405
  "step": 1750
1406
  },
1407
  {
1408
  "epoch": 1.353846153846154,
1409
- "grad_norm": 8.750590324401855,
1410
- "learning_rate": 0.00014593846153846153,
1411
- "loss": 0.2625,
1412
  "step": 1760
1413
  },
1414
  {
1415
  "epoch": 1.3615384615384616,
1416
- "grad_norm": 2.014807939529419,
1417
- "learning_rate": 0.00014563076923076922,
1418
- "loss": 0.242,
1419
  "step": 1770
1420
  },
1421
  {
1422
  "epoch": 1.3692307692307693,
1423
- "grad_norm": 0.013187545351684093,
1424
- "learning_rate": 0.00014532307692307695,
1425
- "loss": 0.1397,
1426
  "step": 1780
1427
  },
1428
  {
1429
  "epoch": 1.376923076923077,
1430
- "grad_norm": 2.364464044570923,
1431
- "learning_rate": 0.00014501538461538462,
1432
- "loss": 0.2673,
1433
  "step": 1790
1434
  },
1435
  {
1436
  "epoch": 1.3846153846153846,
1437
- "grad_norm": 1.5568692684173584,
1438
- "learning_rate": 0.00014470769230769232,
1439
- "loss": 0.0167,
1440
  "step": 1800
1441
  },
1442
  {
1443
  "epoch": 1.3846153846153846,
1444
- "eval_accuracy": 0.7622444127436995,
1445
- "eval_f1": 0.7753703057047507,
1446
- "eval_loss": 1.1507965326309204,
1447
- "eval_runtime": 12.5241,
1448
- "eval_samples_per_second": 167.916,
1449
- "eval_steps_per_second": 20.999,
1450
  "step": 1800
1451
  },
1452
  {
1453
  "epoch": 1.3923076923076922,
1454
- "grad_norm": 4.34848690032959,
1455
- "learning_rate": 0.0001444,
1456
- "loss": 0.0429,
1457
  "step": 1810
1458
  },
1459
  {
1460
  "epoch": 1.4,
1461
- "grad_norm": 0.049348413944244385,
1462
- "learning_rate": 0.0001440923076923077,
1463
- "loss": 0.1613,
1464
  "step": 1820
1465
  },
1466
  {
1467
  "epoch": 1.4076923076923076,
1468
- "grad_norm": 6.783623695373535,
1469
- "learning_rate": 0.00014378461538461538,
1470
- "loss": 0.316,
1471
  "step": 1830
1472
  },
1473
  {
1474
  "epoch": 1.4153846153846155,
1475
- "grad_norm": 21.144811630249023,
1476
- "learning_rate": 0.00014347692307692308,
1477
- "loss": 0.2016,
1478
  "step": 1840
1479
  },
1480
  {
1481
  "epoch": 1.4230769230769231,
1482
- "grad_norm": 2.688338041305542,
1483
- "learning_rate": 0.00014316923076923078,
1484
- "loss": 0.1888,
1485
  "step": 1850
1486
  },
1487
  {
1488
  "epoch": 1.4307692307692308,
1489
- "grad_norm": 0.6251327395439148,
1490
- "learning_rate": 0.00014286153846153847,
1491
- "loss": 0.1055,
1492
  "step": 1860
1493
  },
1494
  {
1495
  "epoch": 1.4384615384615385,
1496
- "grad_norm": 0.1399720460176468,
1497
- "learning_rate": 0.00014255384615384617,
1498
- "loss": 0.1032,
1499
  "step": 1870
1500
  },
1501
  {
1502
  "epoch": 1.4461538461538461,
1503
- "grad_norm": 0.18685415387153625,
1504
- "learning_rate": 0.00014224615384615384,
1505
- "loss": 0.2334,
1506
  "step": 1880
1507
  },
1508
  {
1509
  "epoch": 1.4538461538461538,
1510
- "grad_norm": 14.955801010131836,
1511
- "learning_rate": 0.00014193846153846156,
1512
- "loss": 0.0856,
1513
  "step": 1890
1514
  },
1515
  {
1516
  "epoch": 1.4615384615384617,
1517
- "grad_norm": 0.0011694247368723154,
1518
- "learning_rate": 0.00014163076923076924,
1519
- "loss": 0.0593,
1520
  "step": 1900
1521
  },
1522
  {
1523
  "epoch": 1.4615384615384617,
1524
- "eval_accuracy": 0.7727056585829767,
1525
- "eval_f1": 0.7687850851462801,
1526
- "eval_loss": 1.259105920791626,
1527
- "eval_runtime": 12.4837,
1528
- "eval_samples_per_second": 168.46,
1529
- "eval_steps_per_second": 21.068,
1530
  "step": 1900
1531
  },
1532
  {
1533
  "epoch": 1.4692307692307693,
1534
- "grad_norm": 0.0023549695033580065,
1535
- "learning_rate": 0.00014132307692307693,
1536
- "loss": 0.2332,
1537
  "step": 1910
1538
  },
1539
  {
1540
  "epoch": 1.476923076923077,
1541
- "grad_norm": 0.06210291385650635,
1542
- "learning_rate": 0.0001410153846153846,
1543
- "loss": 0.0498,
1544
  "step": 1920
1545
  },
1546
  {
1547
  "epoch": 1.4846153846153847,
1548
- "grad_norm": 0.007511141709983349,
1549
- "learning_rate": 0.00014070769230769233,
1550
- "loss": 0.0331,
1551
  "step": 1930
1552
  },
1553
  {
1554
  "epoch": 1.4923076923076923,
1555
- "grad_norm": 6.702728748321533,
1556
- "learning_rate": 0.0001404,
1557
- "loss": 0.182,
1558
  "step": 1940
1559
  },
1560
  {
1561
  "epoch": 1.5,
1562
- "grad_norm": 3.9564318656921387,
1563
- "learning_rate": 0.0001400923076923077,
1564
- "loss": 0.1,
1565
  "step": 1950
1566
  },
1567
  {
1568
  "epoch": 1.5076923076923077,
1569
- "grad_norm": 0.6039676070213318,
1570
- "learning_rate": 0.0001397846153846154,
1571
- "loss": 0.0499,
1572
  "step": 1960
1573
  },
1574
  {
1575
  "epoch": 1.5153846153846153,
1576
- "grad_norm": 0.0027682275976985693,
1577
- "learning_rate": 0.0001394769230769231,
1578
- "loss": 0.0046,
1579
  "step": 1970
1580
  },
1581
  {
1582
  "epoch": 1.523076923076923,
1583
- "grad_norm": 13.363993644714355,
1584
- "learning_rate": 0.0001391692307692308,
1585
- "loss": 0.1627,
1586
  "step": 1980
1587
  },
1588
  {
1589
  "epoch": 1.5307692307692307,
1590
- "grad_norm": 0.38019949197769165,
1591
- "learning_rate": 0.00013886153846153846,
1592
- "loss": 0.1781,
1593
  "step": 1990
1594
  },
1595
  {
1596
  "epoch": 1.5384615384615383,
1597
- "grad_norm": 0.7138615846633911,
1598
- "learning_rate": 0.00013855384615384615,
1599
- "loss": 0.3321,
1600
  "step": 2000
1601
  },
1602
  {
1603
  "epoch": 1.5384615384615383,
1604
- "eval_accuracy": 0.7836424155967665,
1605
- "eval_f1": 0.8206327189662326,
1606
- "eval_loss": 1.0946073532104492,
1607
- "eval_runtime": 12.5872,
1608
- "eval_samples_per_second": 167.074,
1609
- "eval_steps_per_second": 20.894,
1610
  "step": 2000
1611
  },
1612
  {
1613
  "epoch": 1.546153846153846,
1614
- "grad_norm": 0.12101047486066818,
1615
- "learning_rate": 0.00013824615384615385,
1616
- "loss": 0.0212,
1617
  "step": 2010
1618
  },
1619
  {
1620
  "epoch": 1.5538461538461539,
1621
- "grad_norm": 0.27666324377059937,
1622
- "learning_rate": 0.00013793846153846155,
1623
- "loss": 0.0318,
1624
  "step": 2020
1625
  },
1626
  {
1627
  "epoch": 1.5615384615384615,
1628
- "grad_norm": 0.14969737827777863,
1629
- "learning_rate": 0.00013763076923076922,
1630
- "loss": 0.1535,
1631
  "step": 2030
1632
  },
1633
  {
1634
  "epoch": 1.5692307692307692,
1635
- "grad_norm": 0.11491697281599045,
1636
  "learning_rate": 0.00013732307692307694,
1637
- "loss": 0.0937,
1638
  "step": 2040
1639
  },
1640
  {
1641
  "epoch": 1.5769230769230769,
1642
- "grad_norm": 0.007214740384370089,
1643
  "learning_rate": 0.00013701538461538461,
1644
- "loss": 0.1818,
1645
  "step": 2050
1646
  },
1647
  {
1648
  "epoch": 1.5846153846153848,
1649
- "grad_norm": 0.01648704707622528,
1650
  "learning_rate": 0.0001367076923076923,
1651
- "loss": 0.1719,
1652
  "step": 2060
1653
  },
1654
  {
1655
  "epoch": 1.5923076923076924,
1656
- "grad_norm": 0.04609803482890129,
1657
  "learning_rate": 0.0001364,
1658
- "loss": 0.2252,
1659
  "step": 2070
1660
  },
1661
  {
1662
  "epoch": 1.6,
1663
- "grad_norm": 0.1964152753353119,
1664
  "learning_rate": 0.0001360923076923077,
1665
- "loss": 0.0656,
1666
  "step": 2080
1667
  },
1668
  {
1669
  "epoch": 1.6076923076923078,
1670
- "grad_norm": 15.397407531738281,
1671
  "learning_rate": 0.0001357846153846154,
1672
- "loss": 0.2084,
1673
  "step": 2090
1674
  },
1675
  {
1676
  "epoch": 1.6153846153846154,
1677
- "grad_norm": 15.927462577819824,
1678
  "learning_rate": 0.00013547692307692307,
1679
- "loss": 0.1713,
1680
  "step": 2100
1681
  },
1682
  {
1683
  "epoch": 1.6153846153846154,
1684
- "eval_accuracy": 0.7546362339514978,
1685
- "eval_f1": 0.7967976441768924,
1686
- "eval_loss": 1.5009040832519531,
1687
- "eval_runtime": 12.5576,
1688
- "eval_samples_per_second": 167.468,
1689
- "eval_steps_per_second": 20.943,
1690
  "step": 2100
1691
  },
1692
  {
1693
  "epoch": 1.623076923076923,
1694
- "grad_norm": 0.049132537096738815,
1695
  "learning_rate": 0.00013516923076923077,
1696
- "loss": 0.0538,
1697
  "step": 2110
1698
  },
1699
  {
1700
  "epoch": 1.6307692307692307,
1701
- "grad_norm": 0.4282131791114807,
1702
  "learning_rate": 0.00013486153846153847,
1703
- "loss": 0.1949,
1704
  "step": 2120
1705
  },
1706
  {
1707
  "epoch": 1.6384615384615384,
1708
- "grad_norm": 0.4743361175060272,
1709
  "learning_rate": 0.00013455384615384617,
1710
- "loss": 0.1656,
1711
  "step": 2130
1712
  },
1713
  {
1714
  "epoch": 1.646153846153846,
1715
- "grad_norm": 0.011881379410624504,
1716
  "learning_rate": 0.00013424615384615384,
1717
- "loss": 0.2567,
1718
  "step": 2140
1719
  },
1720
  {
1721
  "epoch": 1.6538461538461537,
1722
- "grad_norm": 0.5166123509407043,
1723
  "learning_rate": 0.00013393846153846153,
1724
- "loss": 0.1373,
1725
  "step": 2150
1726
  },
1727
  {
1728
  "epoch": 1.6615384615384614,
1729
- "grad_norm": 7.127758026123047,
1730
  "learning_rate": 0.00013363076923076926,
1731
- "loss": 0.113,
1732
  "step": 2160
1733
  },
1734
  {
1735
  "epoch": 1.669230769230769,
1736
- "grad_norm": 13.634576797485352,
1737
  "learning_rate": 0.00013332307692307693,
1738
- "loss": 0.2203,
1739
  "step": 2170
1740
  },
1741
  {
1742
  "epoch": 1.676923076923077,
1743
- "grad_norm": 3.4795382022857666,
1744
  "learning_rate": 0.00013301538461538463,
1745
- "loss": 0.0577,
1746
  "step": 2180
1747
  },
1748
  {
1749
  "epoch": 1.6846153846153846,
1750
- "grad_norm": 0.1704314947128296,
1751
  "learning_rate": 0.00013270769230769232,
1752
- "loss": 0.3477,
1753
  "step": 2190
1754
  },
1755
  {
1756
  "epoch": 1.6923076923076923,
1757
- "grad_norm": 4.465692520141602,
1758
  "learning_rate": 0.00013240000000000002,
1759
- "loss": 0.3072,
1760
  "step": 2200
1761
  },
1762
  {
1763
  "epoch": 1.6923076923076923,
1764
- "eval_accuracy": 0.797432239657632,
1765
- "eval_f1": 0.8187976499019674,
1766
- "eval_loss": 1.1053212881088257,
1767
- "eval_runtime": 12.422,
1768
- "eval_samples_per_second": 169.297,
1769
- "eval_steps_per_second": 21.172,
1770
  "step": 2200
1771
  },
1772
  {
1773
- "epoch": 1.7,
1774
- "grad_norm": 0.04337165132164955,
1775
- "learning_rate": 0.0001320923076923077,
1776
- "loss": 0.1805,
1777
- "step": 2210
1778
- },
1779
- {
1780
- "epoch": 1.7076923076923078,
1781
- "grad_norm": 0.1556902378797531,
1782
- "learning_rate": 0.0001317846153846154,
1783
- "loss": 0.0158,
1784
- "step": 2220
1785
- },
1786
- {
1787
- "epoch": 1.7153846153846155,
1788
- "grad_norm": 7.362657070159912,
1789
- "learning_rate": 0.00013147692307692308,
1790
- "loss": 0.294,
1791
- "step": 2230
1792
- },
1793
- {
1794
- "epoch": 1.7230769230769232,
1795
- "grad_norm": 0.4276955723762512,
1796
- "learning_rate": 0.00013116923076923078,
1797
- "loss": 0.1368,
1798
- "step": 2240
1799
- },
1800
- {
1801
- "epoch": 1.7307692307692308,
1802
- "grad_norm": 7.7706427574157715,
1803
- "learning_rate": 0.00013086153846153845,
1804
- "loss": 0.0855,
1805
- "step": 2250
1806
- },
1807
- {
1808
- "epoch": 1.7384615384615385,
1809
- "grad_norm": 0.010043763555586338,
1810
- "learning_rate": 0.00013055384615384615,
1811
- "loss": 0.1108,
1812
- "step": 2260
1813
- },
1814
- {
1815
- "epoch": 1.7461538461538462,
1816
- "grad_norm": 0.09377148002386093,
1817
- "learning_rate": 0.00013024615384615387,
1818
- "loss": 0.3113,
1819
- "step": 2270
1820
- },
1821
- {
1822
- "epoch": 1.7538461538461538,
1823
- "grad_norm": 3.6055455207824707,
1824
- "learning_rate": 0.00012993846153846154,
1825
- "loss": 0.128,
1826
- "step": 2280
1827
- },
1828
- {
1829
- "epoch": 1.7615384615384615,
1830
- "grad_norm": 2.1152894496917725,
1831
- "learning_rate": 0.00012963076923076924,
1832
- "loss": 0.1954,
1833
- "step": 2290
1834
- },
1835
- {
1836
- "epoch": 1.7692307692307692,
1837
- "grad_norm": 0.0025315466336905956,
1838
- "learning_rate": 0.0001293230769230769,
1839
- "loss": 0.1457,
1840
- "step": 2300
1841
- },
1842
- {
1843
- "epoch": 1.7692307692307692,
1844
- "eval_accuracy": 0.7841179267712791,
1845
- "eval_f1": 0.8178538109762332,
1846
- "eval_loss": 1.0606149435043335,
1847
- "eval_runtime": 12.6141,
1848
- "eval_samples_per_second": 166.718,
1849
- "eval_steps_per_second": 20.85,
1850
- "step": 2300
1851
- },
1852
- {
1853
- "epoch": 1.7769230769230768,
1854
- "grad_norm": 0.02707645110785961,
1855
- "learning_rate": 0.00012901538461538464,
1856
- "loss": 0.0758,
1857
- "step": 2310
1858
- },
1859
- {
1860
- "epoch": 1.7846153846153845,
1861
- "grad_norm": 0.06029946357011795,
1862
- "learning_rate": 0.0001287076923076923,
1863
- "loss": 0.1275,
1864
- "step": 2320
1865
- },
1866
- {
1867
- "epoch": 1.7923076923076922,
1868
- "grad_norm": 0.21329770982265472,
1869
- "learning_rate": 0.0001284,
1870
- "loss": 0.0112,
1871
- "step": 2330
1872
- },
1873
- {
1874
- "epoch": 1.8,
1875
- "grad_norm": 0.9335693120956421,
1876
- "learning_rate": 0.0001280923076923077,
1877
- "loss": 0.2588,
1878
- "step": 2340
1879
- },
1880
- {
1881
- "epoch": 1.8076923076923077,
1882
- "grad_norm": 6.8204474449157715,
1883
- "learning_rate": 0.0001277846153846154,
1884
- "loss": 0.1611,
1885
- "step": 2350
1886
- },
1887
- {
1888
- "epoch": 1.8153846153846154,
1889
- "grad_norm": 0.023739751428365707,
1890
- "learning_rate": 0.0001274769230769231,
1891
- "loss": 0.0495,
1892
- "step": 2360
1893
- },
1894
- {
1895
- "epoch": 1.823076923076923,
1896
- "grad_norm": 15.699667930603027,
1897
- "learning_rate": 0.00012716923076923077,
1898
- "loss": 0.1245,
1899
- "step": 2370
1900
- },
1901
- {
1902
- "epoch": 1.830769230769231,
1903
- "grad_norm": 0.019991083070635796,
1904
- "learning_rate": 0.00012686153846153846,
1905
- "loss": 0.0221,
1906
- "step": 2380
1907
- },
1908
- {
1909
- "epoch": 1.8384615384615386,
1910
- "grad_norm": 0.24979303777217865,
1911
- "learning_rate": 0.00012655384615384616,
1912
- "loss": 0.1426,
1913
- "step": 2390
1914
- },
1915
- {
1916
- "epoch": 1.8461538461538463,
1917
- "grad_norm": 0.05418672040104866,
1918
- "learning_rate": 0.00012624615384615386,
1919
- "loss": 0.1211,
1920
- "step": 2400
1921
- },
1922
- {
1923
- "epoch": 1.8461538461538463,
1924
- "eval_accuracy": 0.8145506419400856,
1925
- "eval_f1": 0.8214275708995336,
1926
- "eval_loss": 0.933445394039154,
1927
- "eval_runtime": 12.5308,
1928
- "eval_samples_per_second": 167.826,
1929
- "eval_steps_per_second": 20.988,
1930
- "step": 2400
1931
- },
1932
- {
1933
- "epoch": 1.853846153846154,
1934
- "grad_norm": 0.018888354301452637,
1935
- "learning_rate": 0.00012593846153846153,
1936
- "loss": 0.0065,
1937
- "step": 2410
1938
- },
1939
- {
1940
- "epoch": 1.8615384615384616,
1941
- "grad_norm": 0.14981931447982788,
1942
- "learning_rate": 0.00012563076923076925,
1943
- "loss": 0.0036,
1944
- "step": 2420
1945
- },
1946
- {
1947
- "epoch": 1.8692307692307693,
1948
- "grad_norm": 0.1521558165550232,
1949
- "learning_rate": 0.00012532307692307692,
1950
- "loss": 0.0123,
1951
- "step": 2430
1952
- },
1953
- {
1954
- "epoch": 1.876923076923077,
1955
- "grad_norm": 0.6288071870803833,
1956
- "learning_rate": 0.00012501538461538462,
1957
- "loss": 0.0052,
1958
- "step": 2440
1959
- },
1960
- {
1961
- "epoch": 1.8846153846153846,
1962
- "grad_norm": 8.094133377075195,
1963
- "learning_rate": 0.0001247076923076923,
1964
- "loss": 0.2472,
1965
- "step": 2450
1966
- },
1967
- {
1968
- "epoch": 1.8923076923076922,
1969
- "grad_norm": 0.015556249767541885,
1970
- "learning_rate": 0.00012440000000000002,
1971
- "loss": 0.0531,
1972
- "step": 2460
1973
- },
1974
- {
1975
- "epoch": 1.9,
1976
- "grad_norm": 0.05205320566892624,
1977
- "learning_rate": 0.0001240923076923077,
1978
- "loss": 0.1907,
1979
- "step": 2470
1980
- },
1981
- {
1982
- "epoch": 1.9076923076923076,
1983
- "grad_norm": 6.338665962219238,
1984
- "learning_rate": 0.00012378461538461538,
1985
- "loss": 0.1587,
1986
- "step": 2480
1987
- },
1988
- {
1989
- "epoch": 1.9153846153846152,
1990
- "grad_norm": 0.4923330545425415,
1991
- "learning_rate": 0.00012347692307692308,
1992
- "loss": 0.0978,
1993
- "step": 2490
1994
- },
1995
- {
1996
- "epoch": 1.9230769230769231,
1997
- "grad_norm": 2.8340303897857666,
1998
- "learning_rate": 0.00012316923076923078,
1999
- "loss": 0.0175,
2000
- "step": 2500
2001
- },
2002
- {
2003
- "epoch": 1.9230769230769231,
2004
- "eval_accuracy": 0.734189253447456,
2005
- "eval_f1": 0.77245031199361,
2006
- "eval_loss": 1.9268134832382202,
2007
- "eval_runtime": 12.5739,
2008
- "eval_samples_per_second": 167.251,
2009
- "eval_steps_per_second": 20.916,
2010
- "step": 2500
2011
- },
2012
- {
2013
- "epoch": 1.9307692307692308,
2014
- "grad_norm": 7.629446029663086,
2015
- "learning_rate": 0.00012286153846153847,
2016
- "loss": 0.2534,
2017
- "step": 2510
2018
- },
2019
- {
2020
- "epoch": 1.9384615384615385,
2021
- "grad_norm": 0.5628572702407837,
2022
- "learning_rate": 0.00012255384615384614,
2023
- "loss": 0.1828,
2024
- "step": 2520
2025
- },
2026
- {
2027
- "epoch": 1.9461538461538461,
2028
- "grad_norm": 0.007969832979142666,
2029
- "learning_rate": 0.00012224615384615384,
2030
- "loss": 0.1046,
2031
- "step": 2530
2032
- },
2033
- {
2034
- "epoch": 1.953846153846154,
2035
- "grad_norm": 4.393219947814941,
2036
- "learning_rate": 0.00012193846153846154,
2037
- "loss": 0.221,
2038
- "step": 2540
2039
- },
2040
- {
2041
- "epoch": 1.9615384615384617,
2042
- "grad_norm": 2.58774995803833,
2043
- "learning_rate": 0.00012163076923076924,
2044
- "loss": 0.0651,
2045
- "step": 2550
2046
- },
2047
- {
2048
- "epoch": 1.9692307692307693,
2049
- "grad_norm": 0.01533615030348301,
2050
- "learning_rate": 0.00012132307692307693,
2051
- "loss": 0.3142,
2052
- "step": 2560
2053
- },
2054
- {
2055
- "epoch": 1.976923076923077,
2056
- "grad_norm": 0.04511953145265579,
2057
- "learning_rate": 0.00012101538461538462,
2058
- "loss": 0.0437,
2059
- "step": 2570
2060
- },
2061
- {
2062
- "epoch": 1.9846153846153847,
2063
- "grad_norm": 4.066844940185547,
2064
- "learning_rate": 0.00012070769230769232,
2065
- "loss": 0.1857,
2066
- "step": 2580
2067
- },
2068
- {
2069
- "epoch": 1.9923076923076923,
2070
- "grad_norm": 0.030109547078609467,
2071
- "learning_rate": 0.0001204,
2072
- "loss": 0.2628,
2073
- "step": 2590
2074
- },
2075
- {
2076
- "epoch": 2.0,
2077
- "grad_norm": 13.797858238220215,
2078
- "learning_rate": 0.00012009230769230771,
2079
- "loss": 0.0539,
2080
- "step": 2600
2081
- },
2082
- {
2083
- "epoch": 2.0,
2084
- "eval_accuracy": 0.81169757489301,
2085
- "eval_f1": 0.7984725776961168,
2086
- "eval_loss": 0.9400935769081116,
2087
- "eval_runtime": 12.4311,
2088
- "eval_samples_per_second": 169.173,
2089
- "eval_steps_per_second": 21.157,
2090
- "step": 2600
2091
- },
2092
- {
2093
- "epoch": 2.0,
2094
- "step": 2600,
2095
- "total_flos": 1.611433851126866e+18,
2096
- "train_loss": 0.3103342184309776,
2097
- "train_runtime": 732.7839,
2098
- "train_samples_per_second": 70.928,
2099
- "train_steps_per_second": 8.87
2100
  }
2101
  ],
2102
  "logging_steps": 10,
@@ -2108,7 +1788,7 @@
2108
  "EarlyStoppingCallback": {
2109
  "args": {
2110
  "early_stopping_patience": 10,
2111
- "early_stopping_threshold": 0.0002
2112
  },
2113
  "attributes": {
2114
  "early_stopping_patience_counter": 10
@@ -2125,7 +1805,7 @@
2125
  "attributes": {}
2126
  }
2127
  },
2128
- "total_flos": 1.611433851126866e+18,
2129
  "train_batch_size": 8,
2130
  "trial_name": null,
2131
  "trial_params": null
 
1
  {
2
+ "best_global_step": 1200,
3
+ "best_metric": 0.8585250973701477,
4
+ "best_model_checkpoint": "./deit-CEMEDE/checkpoint-1200",
5
+ "epoch": 1.6923076923076923,
6
  "eval_steps": 100,
7
+ "global_step": 2200,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.007692307692307693,
14
+ "grad_norm": 18.197193145751953,
15
+ "learning_rate": 0.00019975384615384615,
16
+ "loss": 2.5816,
17
  "step": 10
18
  },
19
  {
20
  "epoch": 0.015384615384615385,
21
+ "grad_norm": 8.680727005004883,
22
+ "learning_rate": 0.00019944615384615385,
23
+ "loss": 1.8162,
24
  "step": 20
25
  },
26
  {
27
  "epoch": 0.023076923076923078,
28
+ "grad_norm": 12.35321044921875,
29
+ "learning_rate": 0.00019913846153846155,
30
+ "loss": 1.3586,
31
  "step": 30
32
  },
33
  {
34
  "epoch": 0.03076923076923077,
35
+ "grad_norm": 16.556589126586914,
36
+ "learning_rate": 0.00019883076923076924,
37
+ "loss": 1.3773,
38
  "step": 40
39
  },
40
  {
41
  "epoch": 0.038461538461538464,
42
+ "grad_norm": 7.4731059074401855,
43
+ "learning_rate": 0.0001985230769230769,
44
+ "loss": 0.833,
45
  "step": 50
46
  },
47
  {
48
  "epoch": 0.046153846153846156,
49
+ "grad_norm": 16.327470779418945,
50
+ "learning_rate": 0.00019821538461538464,
51
+ "loss": 0.861,
52
  "step": 60
53
  },
54
  {
55
  "epoch": 0.05384615384615385,
56
+ "grad_norm": 6.821540832519531,
57
+ "learning_rate": 0.00019790769230769234,
58
+ "loss": 1.0562,
59
  "step": 70
60
  },
61
  {
62
  "epoch": 0.06153846153846154,
63
+ "grad_norm": 11.482409477233887,
64
+ "learning_rate": 0.0001976,
65
+ "loss": 1.0553,
66
  "step": 80
67
  },
68
  {
69
  "epoch": 0.06923076923076923,
70
+ "grad_norm": 7.535261154174805,
71
+ "learning_rate": 0.0001972923076923077,
72
+ "loss": 0.5367,
73
  "step": 90
74
  },
75
  {
76
  "epoch": 0.07692307692307693,
77
+ "grad_norm": 8.011478424072266,
78
+ "learning_rate": 0.0001969846153846154,
79
+ "loss": 0.5433,
80
  "step": 100
81
  },
82
  {
83
  "epoch": 0.07692307692307693,
84
+ "eval_accuracy": 0.6338563956252972,
85
+ "eval_f1": 0.43161314039771254,
86
+ "eval_loss": 1.4162613153457642,
87
+ "eval_runtime": 12.991,
88
+ "eval_samples_per_second": 161.882,
89
+ "eval_steps_per_second": 20.245,
90
  "step": 100
91
  },
92
  {
93
  "epoch": 0.08461538461538462,
94
+ "grad_norm": 6.010300159454346,
95
+ "learning_rate": 0.0001966769230769231,
96
+ "loss": 0.9265,
97
  "step": 110
98
  },
99
  {
100
  "epoch": 0.09230769230769231,
101
+ "grad_norm": 11.949559211730957,
102
+ "learning_rate": 0.00019636923076923077,
103
+ "loss": 0.4836,
104
  "step": 120
105
  },
106
  {
107
  "epoch": 0.1,
108
+ "grad_norm": 12.648119926452637,
109
+ "learning_rate": 0.00019606153846153847,
110
+ "loss": 0.7105,
111
  "step": 130
112
  },
113
  {
114
  "epoch": 0.1076923076923077,
115
+ "grad_norm": 11.041376113891602,
116
+ "learning_rate": 0.00019575384615384616,
117
+ "loss": 0.7821,
118
  "step": 140
119
  },
120
  {
121
  "epoch": 0.11538461538461539,
122
+ "grad_norm": 21.592378616333008,
123
+ "learning_rate": 0.00019544615384615386,
124
+ "loss": 0.5134,
125
  "step": 150
126
  },
127
  {
128
  "epoch": 0.12307692307692308,
129
+ "grad_norm": 9.74736499786377,
130
+ "learning_rate": 0.00019513846153846156,
131
+ "loss": 0.5469,
132
  "step": 160
133
  },
134
  {
135
  "epoch": 0.13076923076923078,
136
+ "grad_norm": 12.262529373168945,
137
+ "learning_rate": 0.00019483076923076923,
138
+ "loss": 0.392,
139
  "step": 170
140
  },
141
  {
142
  "epoch": 0.13846153846153847,
143
+ "grad_norm": 9.946588516235352,
144
+ "learning_rate": 0.00019452307692307695,
145
+ "loss": 0.5524,
146
  "step": 180
147
  },
148
  {
149
  "epoch": 0.14615384615384616,
150
+ "grad_norm": 18.688274383544922,
151
+ "learning_rate": 0.00019421538461538462,
152
+ "loss": 0.6801,
153
  "step": 190
154
  },
155
  {
156
  "epoch": 0.15384615384615385,
157
+ "grad_norm": 3.1866862773895264,
158
+ "learning_rate": 0.00019390769230769232,
159
+ "loss": 0.8153,
160
  "step": 200
161
  },
162
  {
163
  "epoch": 0.15384615384615385,
164
+ "eval_accuracy": 0.7013789824060865,
165
+ "eval_f1": 0.6079592746324705,
166
+ "eval_loss": 1.2107399702072144,
167
+ "eval_runtime": 12.8177,
168
+ "eval_samples_per_second": 164.07,
169
+ "eval_steps_per_second": 20.518,
170
  "step": 200
171
  },
172
  {
173
  "epoch": 0.16153846153846155,
174
+ "grad_norm": 10.409411430358887,
175
+ "learning_rate": 0.00019360000000000002,
176
+ "loss": 0.3139,
177
  "step": 210
178
  },
179
  {
180
  "epoch": 0.16923076923076924,
181
+ "grad_norm": 10.94472599029541,
182
+ "learning_rate": 0.00019329230769230771,
183
+ "loss": 0.7899,
184
  "step": 220
185
  },
186
  {
187
  "epoch": 0.17692307692307693,
188
+ "grad_norm": 13.132795333862305,
189
+ "learning_rate": 0.00019298461538461538,
190
+ "loss": 0.6232,
191
  "step": 230
192
  },
193
  {
194
  "epoch": 0.18461538461538463,
195
+ "grad_norm": 5.178615093231201,
196
+ "learning_rate": 0.00019267692307692308,
197
+ "loss": 0.6606,
198
  "step": 240
199
  },
200
  {
201
  "epoch": 0.19230769230769232,
202
+ "grad_norm": 3.8530232906341553,
203
+ "learning_rate": 0.00019236923076923078,
204
+ "loss": 0.648,
205
  "step": 250
206
  },
207
  {
208
  "epoch": 0.2,
209
+ "grad_norm": 9.361835479736328,
210
+ "learning_rate": 0.00019206153846153848,
211
+ "loss": 0.3533,
212
  "step": 260
213
  },
214
  {
215
  "epoch": 0.2076923076923077,
216
+ "grad_norm": 7.445398330688477,
217
+ "learning_rate": 0.00019175384615384617,
218
+ "loss": 0.5113,
219
  "step": 270
220
  },
221
  {
222
  "epoch": 0.2153846153846154,
223
+ "grad_norm": 10.053826332092285,
224
+ "learning_rate": 0.00019144615384615384,
225
+ "loss": 0.6366,
226
  "step": 280
227
  },
228
  {
229
  "epoch": 0.2230769230769231,
230
+ "grad_norm": 9.240071296691895,
231
+ "learning_rate": 0.00019113846153846157,
232
+ "loss": 0.7002,
233
  "step": 290
234
  },
235
  {
236
  "epoch": 0.23076923076923078,
237
+ "grad_norm": 2.5356545448303223,
238
+ "learning_rate": 0.00019083076923076924,
239
+ "loss": 0.4572,
240
  "step": 300
241
  },
242
  {
243
  "epoch": 0.23076923076923078,
244
+ "eval_accuracy": 0.7513076557299097,
245
+ "eval_f1": 0.675000963253108,
246
+ "eval_loss": 0.9055613279342651,
247
+ "eval_runtime": 12.7209,
248
+ "eval_samples_per_second": 165.319,
249
+ "eval_steps_per_second": 20.675,
250
  "step": 300
251
  },
252
  {
253
  "epoch": 0.23846153846153847,
254
+ "grad_norm": 5.946889400482178,
255
+ "learning_rate": 0.00019052307692307694,
256
+ "loss": 0.1694,
257
  "step": 310
258
  },
259
  {
260
  "epoch": 0.24615384615384617,
261
+ "grad_norm": 3.9482967853546143,
262
+ "learning_rate": 0.0001902153846153846,
263
+ "loss": 0.1883,
264
  "step": 320
265
  },
266
  {
267
  "epoch": 0.25384615384615383,
268
+ "grad_norm": 10.08955192565918,
269
+ "learning_rate": 0.00018990769230769233,
270
+ "loss": 0.2619,
271
  "step": 330
272
  },
273
  {
274
  "epoch": 0.26153846153846155,
275
+ "grad_norm": 6.082069396972656,
276
+ "learning_rate": 0.0001896,
277
+ "loss": 0.3425,
278
  "step": 340
279
  },
280
  {
281
  "epoch": 0.2692307692307692,
282
+ "grad_norm": 21.109539031982422,
283
+ "learning_rate": 0.0001892923076923077,
284
+ "loss": 0.4289,
285
  "step": 350
286
  },
287
  {
288
  "epoch": 0.27692307692307694,
289
+ "grad_norm": 12.561626434326172,
290
+ "learning_rate": 0.0001889846153846154,
291
+ "loss": 0.4526,
292
  "step": 360
293
  },
294
  {
295
  "epoch": 0.2846153846153846,
296
+ "grad_norm": 18.23844337463379,
297
+ "learning_rate": 0.0001886769230769231,
298
+ "loss": 0.6775,
299
  "step": 370
300
  },
301
  {
302
  "epoch": 0.2923076923076923,
303
+ "grad_norm": 14.852152824401855,
304
+ "learning_rate": 0.0001883692307692308,
305
+ "loss": 0.6215,
306
  "step": 380
307
  },
308
  {
309
  "epoch": 0.3,
310
+ "grad_norm": 4.879684925079346,
311
+ "learning_rate": 0.00018806153846153846,
312
+ "loss": 0.5646,
313
  "step": 390
314
  },
315
  {
316
  "epoch": 0.3076923076923077,
317
+ "grad_norm": 11.373760223388672,
318
+ "learning_rate": 0.00018775384615384616,
319
+ "loss": 0.5055,
320
  "step": 400
321
  },
322
  {
323
  "epoch": 0.3076923076923077,
324
+ "eval_accuracy": 0.6571564431764146,
325
+ "eval_f1": 0.5795112999251144,
326
+ "eval_loss": 1.2328877449035645,
327
+ "eval_runtime": 12.9793,
328
+ "eval_samples_per_second": 162.027,
329
+ "eval_steps_per_second": 20.263,
330
  "step": 400
331
  },
332
  {
333
  "epoch": 0.3153846153846154,
334
+ "grad_norm": 17.832473754882812,
335
+ "learning_rate": 0.00018744615384615386,
336
+ "loss": 0.5483,
337
  "step": 410
338
  },
339
  {
340
  "epoch": 0.3230769230769231,
341
+ "grad_norm": 7.938933372497559,
342
+ "learning_rate": 0.00018713846153846155,
343
+ "loss": 0.4665,
344
  "step": 420
345
  },
346
  {
347
  "epoch": 0.33076923076923076,
348
+ "grad_norm": 0.892392098903656,
349
+ "learning_rate": 0.00018683076923076922,
350
+ "loss": 0.4319,
351
  "step": 430
352
  },
353
  {
354
  "epoch": 0.3384615384615385,
355
+ "grad_norm": 12.021994590759277,
356
+ "learning_rate": 0.00018652307692307695,
357
+ "loss": 0.2883,
358
  "step": 440
359
  },
360
  {
361
  "epoch": 0.34615384615384615,
362
+ "grad_norm": 8.725289344787598,
363
+ "learning_rate": 0.00018621538461538462,
364
+ "loss": 0.3389,
365
  "step": 450
366
  },
367
  {
368
  "epoch": 0.35384615384615387,
369
+ "grad_norm": 13.443779945373535,
370
+ "learning_rate": 0.00018590769230769231,
371
+ "loss": 0.5416,
372
  "step": 460
373
  },
374
  {
375
  "epoch": 0.36153846153846153,
376
+ "grad_norm": 5.893988132476807,
377
+ "learning_rate": 0.0001856,
378
+ "loss": 0.3084,
379
  "step": 470
380
  },
381
  {
382
  "epoch": 0.36923076923076925,
383
+ "grad_norm": 2.6127381324768066,
384
+ "learning_rate": 0.0001852923076923077,
385
+ "loss": 0.3881,
386
  "step": 480
387
  },
388
  {
389
  "epoch": 0.3769230769230769,
390
+ "grad_norm": 4.684244155883789,
391
+ "learning_rate": 0.0001849846153846154,
392
+ "loss": 0.2972,
393
  "step": 490
394
  },
395
  {
396
  "epoch": 0.38461538461538464,
397
+ "grad_norm": 3.0813324451446533,
398
+ "learning_rate": 0.00018467692307692308,
399
+ "loss": 0.5404,
400
  "step": 500
401
  },
402
  {
403
  "epoch": 0.38461538461538464,
404
+ "eval_accuracy": 0.7156443176414645,
405
+ "eval_f1": 0.656711194356937,
406
+ "eval_loss": 1.1346296072006226,
407
+ "eval_runtime": 12.7812,
408
+ "eval_samples_per_second": 164.538,
409
+ "eval_steps_per_second": 20.577,
410
  "step": 500
411
  },
412
  {
413
  "epoch": 0.3923076923076923,
414
+ "grad_norm": 7.389800548553467,
415
+ "learning_rate": 0.00018436923076923077,
416
+ "loss": 0.3531,
417
  "step": 510
418
  },
419
  {
420
  "epoch": 0.4,
421
+ "grad_norm": 6.2316060066223145,
422
+ "learning_rate": 0.00018406153846153847,
423
+ "loss": 0.4253,
424
  "step": 520
425
  },
426
  {
427
  "epoch": 0.4076923076923077,
428
+ "grad_norm": 13.33460807800293,
429
+ "learning_rate": 0.00018375384615384617,
430
+ "loss": 0.23,
431
  "step": 530
432
  },
433
  {
434
  "epoch": 0.4153846153846154,
435
+ "grad_norm": 6.275513172149658,
436
+ "learning_rate": 0.00018344615384615384,
437
+ "loss": 0.6893,
438
  "step": 540
439
  },
440
  {
441
  "epoch": 0.4230769230769231,
442
+ "grad_norm": 7.9323649406433105,
443
+ "learning_rate": 0.00018313846153846154,
444
+ "loss": 0.5152,
445
  "step": 550
446
  },
447
  {
448
  "epoch": 0.4307692307692308,
449
+ "grad_norm": 0.08034035563468933,
450
+ "learning_rate": 0.00018283076923076926,
451
+ "loss": 0.3934,
452
  "step": 560
453
  },
454
  {
455
  "epoch": 0.43846153846153846,
456
+ "grad_norm": 0.45259109139442444,
457
+ "learning_rate": 0.00018252307692307693,
458
+ "loss": 0.332,
459
  "step": 570
460
  },
461
  {
462
  "epoch": 0.4461538461538462,
463
+ "grad_norm": 8.854157447814941,
464
+ "learning_rate": 0.00018221538461538463,
465
+ "loss": 0.321,
466
  "step": 580
467
  },
468
  {
469
  "epoch": 0.45384615384615384,
470
+ "grad_norm": 6.054807662963867,
471
+ "learning_rate": 0.00018190769230769233,
472
+ "loss": 0.2757,
473
  "step": 590
474
  },
475
  {
476
  "epoch": 0.46153846153846156,
477
+ "grad_norm": 0.04787035658955574,
478
+ "learning_rate": 0.00018160000000000002,
479
+ "loss": 0.3872,
480
  "step": 600
481
  },
482
  {
483
  "epoch": 0.46153846153846156,
484
+ "eval_accuracy": 0.7689015691868759,
485
+ "eval_f1": 0.6869204779737752,
486
+ "eval_loss": 1.0171802043914795,
487
+ "eval_runtime": 12.8764,
488
+ "eval_samples_per_second": 163.322,
489
+ "eval_steps_per_second": 20.425,
490
  "step": 600
491
  },
492
  {
493
  "epoch": 0.46923076923076923,
494
+ "grad_norm": 1.1864789724349976,
495
+ "learning_rate": 0.0001812923076923077,
496
+ "loss": 0.0565,
497
  "step": 610
498
  },
499
  {
500
  "epoch": 0.47692307692307695,
501
+ "grad_norm": 0.6733745336532593,
502
+ "learning_rate": 0.0001809846153846154,
503
+ "loss": 0.2029,
504
  "step": 620
505
  },
506
  {
507
  "epoch": 0.4846153846153846,
508
+ "grad_norm": 5.898662090301514,
509
+ "learning_rate": 0.0001806769230769231,
510
+ "loss": 0.3849,
511
  "step": 630
512
  },
513
  {
514
  "epoch": 0.49230769230769234,
515
+ "grad_norm": 11.095335006713867,
516
+ "learning_rate": 0.00018036923076923079,
517
+ "loss": 0.3647,
518
  "step": 640
519
  },
520
  {
521
  "epoch": 0.5,
522
+ "grad_norm": 1.633670687675476,
523
+ "learning_rate": 0.00018006153846153846,
524
+ "loss": 0.3256,
525
  "step": 650
526
  },
527
  {
528
  "epoch": 0.5076923076923077,
529
+ "grad_norm": 0.15434664487838745,
530
+ "learning_rate": 0.00017975384615384615,
531
+ "loss": 0.1804,
532
  "step": 660
533
  },
534
  {
535
  "epoch": 0.5153846153846153,
536
+ "grad_norm": 21.246501922607422,
537
+ "learning_rate": 0.00017944615384615385,
538
+ "loss": 0.5879,
539
  "step": 670
540
  },
541
  {
542
  "epoch": 0.5230769230769231,
543
+ "grad_norm": 4.079148769378662,
544
+ "learning_rate": 0.00017913846153846155,
545
+ "loss": 0.406,
546
  "step": 680
547
  },
548
  {
549
  "epoch": 0.5307692307692308,
550
+ "grad_norm": 14.820666313171387,
551
+ "learning_rate": 0.00017883076923076924,
552
+ "loss": 0.3732,
553
  "step": 690
554
  },
555
  {
556
  "epoch": 0.5384615384615384,
557
+ "grad_norm": 2.049006223678589,
558
+ "learning_rate": 0.00017852307692307692,
559
+ "loss": 0.2404,
560
  "step": 700
561
  },
562
  {
563
  "epoch": 0.5384615384615384,
564
+ "eval_accuracy": 0.7299096528768426,
565
+ "eval_f1": 0.7135465992080591,
566
+ "eval_loss": 1.1938942670822144,
567
+ "eval_runtime": 13.2482,
568
+ "eval_samples_per_second": 158.739,
569
+ "eval_steps_per_second": 19.852,
570
  "step": 700
571
  },
572
  {
573
  "epoch": 0.5461538461538461,
574
+ "grad_norm": 0.3764440715312958,
575
+ "learning_rate": 0.00017821538461538464,
576
+ "loss": 0.4883,
577
  "step": 710
578
  },
579
  {
580
  "epoch": 0.5538461538461539,
581
+ "grad_norm": 10.493640899658203,
582
+ "learning_rate": 0.0001779076923076923,
583
+ "loss": 0.3606,
584
  "step": 720
585
  },
586
  {
587
  "epoch": 0.5615384615384615,
588
+ "grad_norm": 5.019975662231445,
589
+ "learning_rate": 0.0001776,
590
+ "loss": 0.5277,
591
  "step": 730
592
  },
593
  {
594
  "epoch": 0.5692307692307692,
595
+ "grad_norm": 1.2601099014282227,
596
+ "learning_rate": 0.00017729230769230768,
597
+ "loss": 0.2193,
598
  "step": 740
599
  },
600
  {
601
  "epoch": 0.5769230769230769,
602
+ "grad_norm": 5.463764667510986,
603
+ "learning_rate": 0.0001769846153846154,
604
+ "loss": 0.3176,
605
  "step": 750
606
  },
607
  {
608
  "epoch": 0.5846153846153846,
609
+ "grad_norm": 3.235102653503418,
610
+ "learning_rate": 0.0001766769230769231,
611
+ "loss": 0.413,
612
  "step": 760
613
  },
614
  {
615
  "epoch": 0.5923076923076923,
616
+ "grad_norm": 2.2732224464416504,
617
+ "learning_rate": 0.00017636923076923077,
618
+ "loss": 0.2794,
619
  "step": 770
620
  },
621
  {
622
  "epoch": 0.6,
623
+ "grad_norm": 13.89130973815918,
624
+ "learning_rate": 0.00017606153846153847,
625
+ "loss": 0.7099,
626
  "step": 780
627
  },
628
  {
629
  "epoch": 0.6076923076923076,
630
+ "grad_norm": 11.57028865814209,
631
+ "learning_rate": 0.00017575384615384616,
632
+ "loss": 0.2651,
633
  "step": 790
634
  },
635
  {
636
  "epoch": 0.6153846153846154,
637
+ "grad_norm": 8.592397689819336,
638
+ "learning_rate": 0.00017544615384615386,
639
+ "loss": 0.3426,
640
  "step": 800
641
  },
642
  {
643
  "epoch": 0.6153846153846154,
644
+ "eval_accuracy": 0.7123157394198764,
645
+ "eval_f1": 0.7012828617998541,
646
+ "eval_loss": 1.3790241479873657,
647
+ "eval_runtime": 13.256,
648
+ "eval_samples_per_second": 158.645,
649
+ "eval_steps_per_second": 19.84,
650
  "step": 800
651
  },
652
  {
653
  "epoch": 0.6230769230769231,
654
+ "grad_norm": 3.83967924118042,
655
+ "learning_rate": 0.00017513846153846153,
656
+ "loss": 0.2772,
657
  "step": 810
658
  },
659
  {
660
  "epoch": 0.6307692307692307,
661
+ "grad_norm": 7.842036247253418,
662
+ "learning_rate": 0.00017483076923076923,
663
+ "loss": 0.0811,
664
  "step": 820
665
  },
666
  {
667
  "epoch": 0.6384615384615384,
668
+ "grad_norm": 8.102109909057617,
669
+ "learning_rate": 0.00017452307692307693,
670
+ "loss": 0.284,
671
  "step": 830
672
  },
673
  {
674
  "epoch": 0.6461538461538462,
675
+ "grad_norm": 3.7149875164031982,
676
+ "learning_rate": 0.00017421538461538462,
677
+ "loss": 0.2836,
678
  "step": 840
679
  },
680
  {
681
  "epoch": 0.6538461538461539,
682
+ "grad_norm": 1.5214999914169312,
683
+ "learning_rate": 0.00017390769230769232,
684
+ "loss": 0.5645,
685
  "step": 850
686
  },
687
  {
688
  "epoch": 0.6615384615384615,
689
+ "grad_norm": 2.1466846466064453,
690
+ "learning_rate": 0.00017360000000000002,
691
+ "loss": 0.2513,
692
  "step": 860
693
  },
694
  {
695
  "epoch": 0.6692307692307692,
696
+ "grad_norm": 3.5235519409179688,
697
+ "learning_rate": 0.00017329230769230772,
698
+ "loss": 0.2663,
699
  "step": 870
700
  },
701
  {
702
  "epoch": 0.676923076923077,
703
+ "grad_norm": 14.82608699798584,
704
+ "learning_rate": 0.00017298461538461539,
705
+ "loss": 0.6103,
706
  "step": 880
707
  },
708
  {
709
  "epoch": 0.6846153846153846,
710
+ "grad_norm": 6.010519504547119,
711
+ "learning_rate": 0.00017267692307692308,
712
+ "loss": 0.3315,
713
  "step": 890
714
  },
715
  {
716
  "epoch": 0.6923076923076923,
717
+ "grad_norm": 4.174232006072998,
718
+ "learning_rate": 0.00017236923076923078,
719
+ "loss": 0.3455,
720
  "step": 900
721
  },
722
  {
723
  "epoch": 0.6923076923076923,
724
+ "eval_accuracy": 0.722301474084641,
725
+ "eval_f1": 0.6945099209124845,
726
+ "eval_loss": 1.2071079015731812,
727
+ "eval_runtime": 13.1782,
728
+ "eval_samples_per_second": 159.582,
729
+ "eval_steps_per_second": 19.957,
730
  "step": 900
731
  },
732
  {
733
  "epoch": 0.7,
734
+ "grad_norm": 0.17157310247421265,
735
+ "learning_rate": 0.00017206153846153848,
736
+ "loss": 0.5492,
737
  "step": 910
738
  },
739
  {
740
  "epoch": 0.7076923076923077,
741
+ "grad_norm": 1.049086570739746,
742
+ "learning_rate": 0.00017175384615384615,
743
+ "loss": 0.1558,
744
  "step": 920
745
  },
746
  {
747
  "epoch": 0.7153846153846154,
748
+ "grad_norm": 9.643935203552246,
749
+ "learning_rate": 0.00017144615384615385,
750
+ "loss": 0.2683,
751
  "step": 930
752
  },
753
  {
754
  "epoch": 0.7230769230769231,
755
+ "grad_norm": 4.239430904388428,
756
+ "learning_rate": 0.00017113846153846154,
757
+ "loss": 0.0683,
758
  "step": 940
759
  },
760
  {
761
  "epoch": 0.7307692307692307,
762
+ "grad_norm": 12.830389976501465,
763
+ "learning_rate": 0.00017083076923076924,
764
+ "loss": 0.3568,
765
  "step": 950
766
  },
767
  {
768
  "epoch": 0.7384615384615385,
769
+ "grad_norm": 11.543557167053223,
770
+ "learning_rate": 0.00017052307692307694,
771
+ "loss": 0.37,
772
  "step": 960
773
  },
774
  {
775
  "epoch": 0.7461538461538462,
776
+ "grad_norm": 0.12294139713048935,
777
+ "learning_rate": 0.0001702153846153846,
778
+ "loss": 0.2944,
779
  "step": 970
780
  },
781
  {
782
  "epoch": 0.7538461538461538,
783
+ "grad_norm": 3.925886631011963,
784
+ "learning_rate": 0.00016990769230769233,
785
+ "loss": 0.1135,
786
  "step": 980
787
  },
788
  {
789
  "epoch": 0.7615384615384615,
790
+ "grad_norm": 0.4756164848804474,
791
+ "learning_rate": 0.0001696,
792
+ "loss": 0.3048,
793
  "step": 990
794
  },
795
  {
796
  "epoch": 0.7692307692307693,
797
+ "grad_norm": 9.28410816192627,
798
+ "learning_rate": 0.0001692923076923077,
799
+ "loss": 0.3843,
800
  "step": 1000
801
  },
802
  {
803
  "epoch": 0.7692307692307693,
804
+ "eval_accuracy": 0.5981930575368521,
805
+ "eval_f1": 0.6507281627404139,
806
+ "eval_loss": 2.4214253425598145,
807
+ "eval_runtime": 13.0632,
808
+ "eval_samples_per_second": 160.986,
809
+ "eval_steps_per_second": 20.133,
810
  "step": 1000
811
  },
812
  {
813
  "epoch": 0.7769230769230769,
814
+ "grad_norm": 2.1016364097595215,
815
+ "learning_rate": 0.0001689846153846154,
816
+ "loss": 0.5929,
817
  "step": 1010
818
  },
819
  {
820
  "epoch": 0.7846153846153846,
821
+ "grad_norm": 13.423405647277832,
822
+ "learning_rate": 0.0001686769230769231,
823
+ "loss": 0.2179,
824
  "step": 1020
825
  },
826
  {
827
  "epoch": 0.7923076923076923,
828
+ "grad_norm": 15.832393646240234,
829
+ "learning_rate": 0.00016836923076923076,
830
+ "loss": 0.3779,
831
  "step": 1030
832
  },
833
  {
834
  "epoch": 0.8,
835
+ "grad_norm": 4.272373199462891,
836
+ "learning_rate": 0.00016806153846153846,
837
+ "loss": 0.2062,
838
  "step": 1040
839
  },
840
  {
841
  "epoch": 0.8076923076923077,
842
+ "grad_norm": 19.82012939453125,
843
+ "learning_rate": 0.00016775384615384616,
844
+ "loss": 0.3419,
845
  "step": 1050
846
  },
847
  {
848
  "epoch": 0.8153846153846154,
849
+ "grad_norm": 1.0584157705307007,
850
+ "learning_rate": 0.00016744615384615386,
851
+ "loss": 0.2643,
852
  "step": 1060
853
  },
854
  {
855
  "epoch": 0.823076923076923,
856
+ "grad_norm": 8.123514175415039,
857
+ "learning_rate": 0.00016713846153846155,
858
+ "loss": 0.3832,
859
  "step": 1070
860
  },
861
  {
862
  "epoch": 0.8307692307692308,
863
+ "grad_norm": 0.30591386556625366,
864
+ "learning_rate": 0.00016683076923076922,
865
+ "loss": 0.3147,
866
  "step": 1080
867
  },
868
  {
869
  "epoch": 0.8384615384615385,
870
+ "grad_norm": 11.121192932128906,
871
+ "learning_rate": 0.00016652307692307695,
872
+ "loss": 0.2886,
873
  "step": 1090
874
  },
875
  {
876
  "epoch": 0.8461538461538461,
877
+ "grad_norm": 2.2156481742858887,
878
+ "learning_rate": 0.00016621538461538462,
879
+ "loss": 0.1851,
880
  "step": 1100
881
  },
882
  {
883
  "epoch": 0.8461538461538461,
884
+ "eval_accuracy": 0.7617689015691869,
885
+ "eval_f1": 0.7712739901548827,
886
+ "eval_loss": 0.9814907312393188,
887
+ "eval_runtime": 12.9429,
888
+ "eval_samples_per_second": 162.483,
889
+ "eval_steps_per_second": 20.32,
890
  "step": 1100
891
  },
892
  {
893
  "epoch": 0.8538461538461538,
894
+ "grad_norm": 4.73284912109375,
895
+ "learning_rate": 0.00016590769230769232,
896
+ "loss": 0.4132,
897
  "step": 1110
898
  },
899
  {
900
  "epoch": 0.8615384615384616,
901
+ "grad_norm": 8.023744583129883,
902
+ "learning_rate": 0.0001656,
903
+ "loss": 0.2097,
904
  "step": 1120
905
  },
906
  {
907
  "epoch": 0.8692307692307693,
908
+ "grad_norm": 0.5264157652854919,
909
+ "learning_rate": 0.0001652923076923077,
910
+ "loss": 0.2644,
911
  "step": 1130
912
  },
913
  {
914
  "epoch": 0.8769230769230769,
915
+ "grad_norm": 0.09222064912319183,
916
+ "learning_rate": 0.00016498461538461538,
917
+ "loss": 0.3584,
918
  "step": 1140
919
  },
920
  {
921
  "epoch": 0.8846153846153846,
922
+ "grad_norm": 5.8107757568359375,
923
+ "learning_rate": 0.00016467692307692308,
924
+ "loss": 0.1727,
925
  "step": 1150
926
  },
927
  {
928
  "epoch": 0.8923076923076924,
929
+ "grad_norm": 0.7490300536155701,
930
+ "learning_rate": 0.00016436923076923078,
931
+ "loss": 0.3538,
932
  "step": 1160
933
  },
934
  {
935
  "epoch": 0.9,
936
+ "grad_norm": 12.119208335876465,
937
+ "learning_rate": 0.00016406153846153847,
938
+ "loss": 0.1684,
939
  "step": 1170
940
  },
941
  {
942
  "epoch": 0.9076923076923077,
943
+ "grad_norm": 6.949498176574707,
944
+ "learning_rate": 0.00016375384615384617,
945
+ "loss": 0.2705,
946
  "step": 1180
947
  },
948
  {
949
  "epoch": 0.9153846153846154,
950
+ "grad_norm": 7.305801868438721,
951
+ "learning_rate": 0.00016344615384615384,
952
+ "loss": 0.226,
953
  "step": 1190
954
  },
955
  {
956
  "epoch": 0.9230769230769231,
957
+ "grad_norm": 5.5069684982299805,
958
+ "learning_rate": 0.00016313846153846154,
959
+ "loss": 0.1783,
960
  "step": 1200
961
  },
962
  {
963
  "epoch": 0.9230769230769231,
964
+ "eval_accuracy": 0.7883975273418925,
965
+ "eval_f1": 0.7973086083953446,
966
+ "eval_loss": 0.8585250973701477,
967
+ "eval_runtime": 12.7217,
968
+ "eval_samples_per_second": 165.309,
969
+ "eval_steps_per_second": 20.673,
970
  "step": 1200
971
  },
972
  {
973
  "epoch": 0.9307692307692308,
974
+ "grad_norm": 0.19160787761211395,
975
+ "learning_rate": 0.00016283076923076924,
976
+ "loss": 0.2238,
977
  "step": 1210
978
  },
979
  {
980
  "epoch": 0.9384615384615385,
981
+ "grad_norm": 8.888996124267578,
982
+ "learning_rate": 0.00016252307692307693,
983
+ "loss": 0.3376,
984
  "step": 1220
985
  },
986
  {
987
  "epoch": 0.9461538461538461,
988
+ "grad_norm": 0.8629558086395264,
989
+ "learning_rate": 0.0001622153846153846,
990
+ "loss": 0.1931,
991
  "step": 1230
992
  },
993
  {
994
  "epoch": 0.9538461538461539,
995
+ "grad_norm": 9.519225120544434,
996
+ "learning_rate": 0.00016190769230769233,
997
+ "loss": 0.4684,
998
  "step": 1240
999
  },
1000
  {
1001
  "epoch": 0.9615384615384616,
1002
+ "grad_norm": 14.200179100036621,
1003
+ "learning_rate": 0.00016160000000000002,
1004
+ "loss": 0.4139,
1005
  "step": 1250
1006
  },
1007
  {
1008
  "epoch": 0.9692307692307692,
1009
+ "grad_norm": 7.553689479827881,
1010
+ "learning_rate": 0.0001612923076923077,
1011
+ "loss": 0.0796,
1012
  "step": 1260
1013
  },
1014
  {
1015
  "epoch": 0.9769230769230769,
1016
+ "grad_norm": 0.355137437582016,
1017
+ "learning_rate": 0.0001609846153846154,
1018
+ "loss": 0.2785,
1019
  "step": 1270
1020
  },
1021
  {
1022
  "epoch": 0.9846153846153847,
1023
+ "grad_norm": 10.454266548156738,
1024
+ "learning_rate": 0.0001606769230769231,
1025
+ "loss": 0.2214,
1026
  "step": 1280
1027
  },
1028
  {
1029
  "epoch": 0.9923076923076923,
1030
+ "grad_norm": 4.790037631988525,
1031
+ "learning_rate": 0.0001603692307692308,
1032
+ "loss": 0.2865,
1033
  "step": 1290
1034
  },
1035
  {
1036
  "epoch": 1.0,
1037
+ "grad_norm": 0.6874844431877136,
1038
+ "learning_rate": 0.00016006153846153846,
1039
+ "loss": 0.2812,
1040
  "step": 1300
1041
  },
1042
  {
1043
  "epoch": 1.0,
1044
+ "eval_accuracy": 0.7394198763670946,
1045
+ "eval_f1": 0.7475294511131121,
1046
+ "eval_loss": 1.416084885597229,
1047
+ "eval_runtime": 12.6074,
1048
+ "eval_samples_per_second": 166.807,
1049
+ "eval_steps_per_second": 20.861,
1050
  "step": 1300
1051
  },
1052
  {
1053
  "epoch": 1.0076923076923077,
1054
+ "grad_norm": 27.64440155029297,
1055
+ "learning_rate": 0.00015975384615384615,
1056
+ "loss": 0.3337,
1057
  "step": 1310
1058
  },
1059
  {
1060
  "epoch": 1.0153846153846153,
1061
+ "grad_norm": 6.039366722106934,
1062
+ "learning_rate": 0.00015944615384615385,
1063
+ "loss": 0.1274,
1064
  "step": 1320
1065
  },
1066
  {
1067
  "epoch": 1.023076923076923,
1068
+ "grad_norm": 2.63989520072937,
1069
+ "learning_rate": 0.00015913846153846155,
1070
+ "loss": 0.1286,
1071
  "step": 1330
1072
  },
1073
  {
1074
  "epoch": 1.0307692307692307,
1075
+ "grad_norm": 0.025664901360869408,
1076
+ "learning_rate": 0.00015883076923076922,
1077
+ "loss": 0.0268,
1078
  "step": 1340
1079
  },
1080
  {
1081
  "epoch": 1.0384615384615385,
1082
+ "grad_norm": 0.7110622525215149,
1083
+ "learning_rate": 0.00015852307692307692,
1084
+ "loss": 0.2141,
1085
  "step": 1350
1086
  },
1087
  {
1088
  "epoch": 1.0461538461538462,
1089
+ "grad_norm": 0.16841351985931396,
1090
+ "learning_rate": 0.00015821538461538464,
1091
+ "loss": 0.1706,
1092
  "step": 1360
1093
  },
1094
  {
1095
  "epoch": 1.0538461538461539,
1096
+ "grad_norm": 2.8135828971862793,
1097
+ "learning_rate": 0.0001579076923076923,
1098
+ "loss": 0.213,
1099
  "step": 1370
1100
  },
1101
  {
1102
  "epoch": 1.0615384615384615,
1103
+ "grad_norm": 6.7190937995910645,
1104
+ "learning_rate": 0.0001576,
1105
+ "loss": 0.1559,
1106
  "step": 1380
1107
  },
1108
  {
1109
  "epoch": 1.0692307692307692,
1110
+ "grad_norm": 1.2817728519439697,
1111
+ "learning_rate": 0.0001572923076923077,
1112
+ "loss": 0.1296,
1113
  "step": 1390
1114
  },
1115
  {
1116
  "epoch": 1.0769230769230769,
1117
+ "grad_norm": 8.388606071472168,
1118
+ "learning_rate": 0.0001569846153846154,
1119
+ "loss": 0.0788,
1120
  "step": 1400
1121
  },
1122
  {
1123
  "epoch": 1.0769230769230769,
1124
+ "eval_accuracy": 0.7769852591535901,
1125
+ "eval_f1": 0.7855219452383467,
1126
+ "eval_loss": 1.0477243661880493,
1127
+ "eval_runtime": 12.6229,
1128
+ "eval_samples_per_second": 166.602,
1129
+ "eval_steps_per_second": 20.835,
1130
  "step": 1400
1131
  },
1132
  {
1133
  "epoch": 1.0846153846153845,
1134
+ "grad_norm": 9.62314510345459,
1135
+ "learning_rate": 0.00015667692307692307,
1136
+ "loss": 0.421,
1137
  "step": 1410
1138
  },
1139
  {
1140
  "epoch": 1.0923076923076924,
1141
+ "grad_norm": 6.402166366577148,
1142
+ "learning_rate": 0.00015636923076923077,
1143
+ "loss": 0.1328,
1144
  "step": 1420
1145
  },
1146
  {
1147
  "epoch": 1.1,
1148
+ "grad_norm": 3.701634168624878,
1149
+ "learning_rate": 0.00015606153846153847,
1150
+ "loss": 0.0716,
1151
  "step": 1430
1152
  },
1153
  {
1154
  "epoch": 1.1076923076923078,
1155
+ "grad_norm": 0.01674058847129345,
1156
+ "learning_rate": 0.00015575384615384617,
1157
+ "loss": 0.0287,
1158
  "step": 1440
1159
  },
1160
  {
1161
  "epoch": 1.1153846153846154,
1162
+ "grad_norm": 1.9546364545822144,
1163
+ "learning_rate": 0.00015544615384615386,
1164
+ "loss": 0.2283,
1165
  "step": 1450
1166
  },
1167
  {
1168
  "epoch": 1.123076923076923,
1169
+ "grad_norm": 14.071533203125,
1170
+ "learning_rate": 0.00015513846153846153,
1171
+ "loss": 0.3448,
1172
  "step": 1460
1173
  },
1174
  {
1175
  "epoch": 1.1307692307692307,
1176
+ "grad_norm": 4.5133585929870605,
1177
+ "learning_rate": 0.00015483076923076926,
1178
+ "loss": 0.0917,
1179
  "step": 1470
1180
  },
1181
  {
1182
  "epoch": 1.1384615384615384,
1183
+ "grad_norm": 1.5091907978057861,
1184
+ "learning_rate": 0.00015452307692307693,
1185
+ "loss": 0.2924,
1186
  "step": 1480
1187
  },
1188
  {
1189
  "epoch": 1.146153846153846,
1190
+ "grad_norm": 0.01831883378326893,
1191
+ "learning_rate": 0.00015421538461538463,
1192
+ "loss": 0.167,
1193
  "step": 1490
1194
  },
1195
  {
1196
  "epoch": 1.1538461538461537,
1197
+ "grad_norm": 0.15469826757907867,
1198
+ "learning_rate": 0.0001539076923076923,
1199
+ "loss": 0.1853,
1200
  "step": 1500
1201
  },
1202
  {
1203
  "epoch": 1.1538461538461537,
1204
+ "eval_accuracy": 0.7788873038516405,
1205
+ "eval_f1": 0.7907369196346151,
1206
+ "eval_loss": 1.0843039751052856,
1207
+ "eval_runtime": 12.9085,
1208
+ "eval_samples_per_second": 162.916,
1209
+ "eval_steps_per_second": 20.374,
1210
  "step": 1500
1211
  },
1212
  {
1213
  "epoch": 1.1615384615384616,
1214
+ "grad_norm": 0.009273377247154713,
1215
+ "learning_rate": 0.00015360000000000002,
1216
+ "loss": 0.133,
1217
  "step": 1510
1218
  },
1219
  {
1220
  "epoch": 1.1692307692307693,
1221
+ "grad_norm": 19.926687240600586,
1222
+ "learning_rate": 0.0001532923076923077,
1223
+ "loss": 0.1644,
1224
  "step": 1520
1225
  },
1226
  {
1227
  "epoch": 1.176923076923077,
1228
+ "grad_norm": 2.141300678253174,
1229
+ "learning_rate": 0.0001529846153846154,
1230
+ "loss": 0.2778,
1231
  "step": 1530
1232
  },
1233
  {
1234
  "epoch": 1.1846153846153846,
1235
+ "grad_norm": 9.439351081848145,
1236
+ "learning_rate": 0.00015267692307692309,
1237
+ "loss": 0.1928,
1238
  "step": 1540
1239
  },
1240
  {
1241
  "epoch": 1.1923076923076923,
1242
+ "grad_norm": 2.3497989177703857,
1243
+ "learning_rate": 0.00015236923076923078,
1244
+ "loss": 0.0872,
1245
  "step": 1550
1246
  },
1247
  {
1248
  "epoch": 1.2,
1249
+ "grad_norm": 0.3300742506980896,
1250
+ "learning_rate": 0.00015206153846153848,
1251
+ "loss": 0.0773,
1252
  "step": 1560
1253
  },
1254
  {
1255
  "epoch": 1.2076923076923076,
1256
+ "grad_norm": 4.574916839599609,
1257
+ "learning_rate": 0.00015175384615384615,
1258
+ "loss": 0.1768,
1259
  "step": 1570
1260
  },
1261
  {
1262
  "epoch": 1.2153846153846155,
1263
+ "grad_norm": 2.860717535018921,
1264
+ "learning_rate": 0.00015144615384615385,
1265
+ "loss": 0.1335,
1266
  "step": 1580
1267
  },
1268
  {
1269
  "epoch": 1.2230769230769232,
1270
+ "grad_norm": 0.060734041035175323,
1271
+ "learning_rate": 0.00015113846153846154,
1272
+ "loss": 0.1425,
1273
  "step": 1590
1274
  },
1275
  {
1276
  "epoch": 1.2307692307692308,
1277
+ "grad_norm": 2.519880533218384,
1278
+ "learning_rate": 0.00015083076923076924,
1279
+ "loss": 0.0463,
1280
  "step": 1600
1281
  },
1282
  {
1283
  "epoch": 1.2307692307692308,
1284
+ "eval_accuracy": 0.785544460294817,
1285
+ "eval_f1": 0.7421672482958516,
1286
+ "eval_loss": 1.1819196939468384,
1287
+ "eval_runtime": 12.7239,
1288
+ "eval_samples_per_second": 165.279,
1289
+ "eval_steps_per_second": 20.67,
1290
  "step": 1600
1291
  },
1292
  {
1293
  "epoch": 1.2384615384615385,
1294
+ "grad_norm": 0.014092416502535343,
1295
+ "learning_rate": 0.0001505230769230769,
1296
+ "loss": 0.347,
1297
  "step": 1610
1298
  },
1299
  {
1300
  "epoch": 1.2461538461538462,
1301
+ "grad_norm": 0.021726811304688454,
1302
+ "learning_rate": 0.00015021538461538464,
1303
+ "loss": 0.1134,
1304
  "step": 1620
1305
  },
1306
  {
1307
  "epoch": 1.2538461538461538,
1308
+ "grad_norm": 0.019754081964492798,
1309
+ "learning_rate": 0.0001499076923076923,
1310
+ "loss": 0.1126,
1311
  "step": 1630
1312
  },
1313
  {
1314
  "epoch": 1.2615384615384615,
1315
+ "grad_norm": 0.018249794840812683,
1316
+ "learning_rate": 0.0001496,
1317
+ "loss": 0.0706,
1318
  "step": 1640
1319
  },
1320
  {
1321
  "epoch": 1.2692307692307692,
1322
+ "grad_norm": 0.013677536509931087,
1323
+ "learning_rate": 0.0001492923076923077,
1324
+ "loss": 0.0554,
1325
  "step": 1650
1326
  },
1327
  {
1328
  "epoch": 1.2769230769230768,
1329
+ "grad_norm": 11.756999969482422,
1330
+ "learning_rate": 0.0001489846153846154,
1331
+ "loss": 0.3888,
1332
  "step": 1660
1333
  },
1334
  {
1335
  "epoch": 1.2846153846153845,
1336
+ "grad_norm": 0.5001311898231506,
1337
+ "learning_rate": 0.0001486769230769231,
1338
+ "loss": 0.3604,
1339
  "step": 1670
1340
  },
1341
  {
1342
  "epoch": 1.2923076923076924,
1343
+ "grad_norm": 4.095685005187988,
1344
+ "learning_rate": 0.00014836923076923077,
1345
+ "loss": 0.0589,
1346
  "step": 1680
1347
  },
1348
  {
1349
  "epoch": 1.3,
1350
+ "grad_norm": 0.42667385935783386,
1351
+ "learning_rate": 0.00014806153846153846,
1352
+ "loss": 0.1972,
1353
  "step": 1690
1354
  },
1355
  {
1356
  "epoch": 1.3076923076923077,
1357
+ "grad_norm": 0.4370737075805664,
1358
+ "learning_rate": 0.00014775384615384616,
1359
+ "loss": 0.1846,
1360
  "step": 1700
1361
  },
1362
  {
1363
  "epoch": 1.3076923076923077,
1364
+ "eval_accuracy": 0.7936281502615311,
1365
+ "eval_f1": 0.7834925540357461,
1366
+ "eval_loss": 0.9226651191711426,
1367
+ "eval_runtime": 13.0044,
1368
+ "eval_samples_per_second": 161.714,
1369
+ "eval_steps_per_second": 20.224,
1370
  "step": 1700
1371
  },
1372
  {
1373
  "epoch": 1.3153846153846154,
1374
+ "grad_norm": 2.9129936695098877,
1375
+ "learning_rate": 0.00014744615384615386,
1376
+ "loss": 0.0855,
1377
  "step": 1710
1378
  },
1379
  {
1380
  "epoch": 1.323076923076923,
1381
+ "grad_norm": 0.003025891724973917,
1382
+ "learning_rate": 0.00014713846153846153,
1383
+ "loss": 0.1383,
1384
  "step": 1720
1385
  },
1386
  {
1387
  "epoch": 1.3307692307692307,
1388
+ "grad_norm": 20.243345260620117,
1389
+ "learning_rate": 0.00014683076923076923,
1390
+ "loss": 0.3327,
1391
  "step": 1730
1392
  },
1393
  {
1394
  "epoch": 1.3384615384615386,
1395
+ "grad_norm": 0.08103461563587189,
1396
+ "learning_rate": 0.00014652307692307695,
1397
+ "loss": 0.0665,
1398
  "step": 1740
1399
  },
1400
  {
1401
  "epoch": 1.3461538461538463,
1402
+ "grad_norm": 7.912471771240234,
1403
+ "learning_rate": 0.00014621538461538462,
1404
+ "loss": 0.2611,
1405
  "step": 1750
1406
  },
1407
  {
1408
  "epoch": 1.353846153846154,
1409
+ "grad_norm": 3.1439859867095947,
1410
+ "learning_rate": 0.00014590769230769232,
1411
+ "loss": 0.148,
1412
  "step": 1760
1413
  },
1414
  {
1415
  "epoch": 1.3615384615384616,
1416
+ "grad_norm": 0.005958245135843754,
1417
+ "learning_rate": 0.00014560000000000002,
1418
+ "loss": 0.2973,
1419
  "step": 1770
1420
  },
1421
  {
1422
  "epoch": 1.3692307692307693,
1423
+ "grad_norm": 0.0040934206917881966,
1424
+ "learning_rate": 0.0001452923076923077,
1425
+ "loss": 0.0725,
1426
  "step": 1780
1427
  },
1428
  {
1429
  "epoch": 1.376923076923077,
1430
+ "grad_norm": 0.35973867774009705,
1431
+ "learning_rate": 0.00014498461538461538,
1432
+ "loss": 0.2344,
1433
  "step": 1790
1434
  },
1435
  {
1436
  "epoch": 1.3846153846153846,
1437
+ "grad_norm": 1.3999979496002197,
1438
+ "learning_rate": 0.00014467692307692308,
1439
+ "loss": 0.0886,
1440
  "step": 1800
1441
  },
1442
  {
1443
  "epoch": 1.3846153846153846,
1444
+ "eval_accuracy": 0.755587256300523,
1445
+ "eval_f1": 0.760332968619484,
1446
+ "eval_loss": 1.336824655532837,
1447
+ "eval_runtime": 13.0885,
1448
+ "eval_samples_per_second": 160.676,
1449
+ "eval_steps_per_second": 20.094,
1450
  "step": 1800
1451
  },
1452
  {
1453
  "epoch": 1.3923076923076922,
1454
+ "grad_norm": 12.196135520935059,
1455
+ "learning_rate": 0.00014436923076923078,
1456
+ "loss": 0.1538,
1457
  "step": 1810
1458
  },
1459
  {
1460
  "epoch": 1.4,
1461
+ "grad_norm": 0.052580125629901886,
1462
+ "learning_rate": 0.00014406153846153848,
1463
+ "loss": 0.287,
1464
  "step": 1820
1465
  },
1466
  {
1467
  "epoch": 1.4076923076923076,
1468
+ "grad_norm": 15.731462478637695,
1469
+ "learning_rate": 0.00014375384615384615,
1470
+ "loss": 0.2438,
1471
  "step": 1830
1472
  },
1473
  {
1474
  "epoch": 1.4153846153846155,
1475
+ "grad_norm": 1.4881103038787842,
1476
+ "learning_rate": 0.00014344615384615384,
1477
+ "loss": 0.0786,
1478
  "step": 1840
1479
  },
1480
  {
1481
  "epoch": 1.4230769230769231,
1482
+ "grad_norm": 0.061750538647174835,
1483
+ "learning_rate": 0.00014313846153846157,
1484
+ "loss": 0.1427,
1485
  "step": 1850
1486
  },
1487
  {
1488
  "epoch": 1.4307692307692308,
1489
+ "grad_norm": 0.21189741790294647,
1490
+ "learning_rate": 0.00014283076923076924,
1491
+ "loss": 0.1164,
1492
  "step": 1860
1493
  },
1494
  {
1495
  "epoch": 1.4384615384615385,
1496
+ "grad_norm": 0.04826455935835838,
1497
+ "learning_rate": 0.00014252307692307693,
1498
+ "loss": 0.0384,
1499
  "step": 1870
1500
  },
1501
  {
1502
  "epoch": 1.4461538461538461,
1503
+ "grad_norm": 1.3123809099197388,
1504
+ "learning_rate": 0.0001422153846153846,
1505
+ "loss": 0.4418,
1506
  "step": 1880
1507
  },
1508
  {
1509
  "epoch": 1.4538461538461538,
1510
+ "grad_norm": 14.41169548034668,
1511
+ "learning_rate": 0.00014190769230769233,
1512
+ "loss": 0.0724,
1513
  "step": 1890
1514
  },
1515
  {
1516
  "epoch": 1.4615384615384617,
1517
+ "grad_norm": 0.010243662633001804,
1518
+ "learning_rate": 0.0001416,
1519
+ "loss": 0.1971,
1520
  "step": 1900
1521
  },
1522
  {
1523
  "epoch": 1.4615384615384617,
1524
+ "eval_accuracy": 0.7527341892534475,
1525
+ "eval_f1": 0.7412732500492302,
1526
+ "eval_loss": 1.3275840282440186,
1527
+ "eval_runtime": 13.034,
1528
+ "eval_samples_per_second": 161.348,
1529
+ "eval_steps_per_second": 20.178,
1530
  "step": 1900
1531
  },
1532
  {
1533
  "epoch": 1.4692307692307693,
1534
+ "grad_norm": 0.004571467638015747,
1535
+ "learning_rate": 0.0001412923076923077,
1536
+ "loss": 0.2237,
1537
  "step": 1910
1538
  },
1539
  {
1540
  "epoch": 1.476923076923077,
1541
+ "grad_norm": 0.652137279510498,
1542
+ "learning_rate": 0.0001409846153846154,
1543
+ "loss": 0.1416,
1544
  "step": 1920
1545
  },
1546
  {
1547
  "epoch": 1.4846153846153847,
1548
+ "grad_norm": 1.6183210611343384,
1549
+ "learning_rate": 0.0001406769230769231,
1550
+ "loss": 0.1199,
1551
  "step": 1930
1552
  },
1553
  {
1554
  "epoch": 1.4923076923076923,
1555
+ "grad_norm": 6.971846103668213,
1556
+ "learning_rate": 0.0001403692307692308,
1557
+ "loss": 0.1777,
1558
  "step": 1940
1559
  },
1560
  {
1561
  "epoch": 1.5,
1562
+ "grad_norm": 1.2869207859039307,
1563
+ "learning_rate": 0.00014006153846153846,
1564
+ "loss": 0.1312,
1565
  "step": 1950
1566
  },
1567
  {
1568
  "epoch": 1.5076923076923077,
1569
+ "grad_norm": 11.167072296142578,
1570
+ "learning_rate": 0.00013975384615384616,
1571
+ "loss": 0.1798,
1572
  "step": 1960
1573
  },
1574
  {
1575
  "epoch": 1.5153846153846153,
1576
+ "grad_norm": 3.460939407348633,
1577
+ "learning_rate": 0.00013944615384615385,
1578
+ "loss": 0.0561,
1579
  "step": 1970
1580
  },
1581
  {
1582
  "epoch": 1.523076923076923,
1583
+ "grad_norm": 13.122215270996094,
1584
+ "learning_rate": 0.00013913846153846155,
1585
+ "loss": 0.3705,
1586
  "step": 1980
1587
  },
1588
  {
1589
  "epoch": 1.5307692307692307,
1590
+ "grad_norm": 0.06862486898899078,
1591
+ "learning_rate": 0.00013883076923076922,
1592
+ "loss": 0.0927,
1593
  "step": 1990
1594
  },
1595
  {
1596
  "epoch": 1.5384615384615383,
1597
+ "grad_norm": 0.11031440645456314,
1598
+ "learning_rate": 0.00013852307692307695,
1599
+ "loss": 0.2069,
1600
  "step": 2000
1601
  },
1602
  {
1603
  "epoch": 1.5384615384615383,
1604
+ "eval_accuracy": 0.7727056585829767,
1605
+ "eval_f1": 0.7804628247388471,
1606
+ "eval_loss": 1.3338302373886108,
1607
+ "eval_runtime": 12.9099,
1608
+ "eval_samples_per_second": 162.898,
1609
+ "eval_steps_per_second": 20.372,
1610
  "step": 2000
1611
  },
1612
  {
1613
  "epoch": 1.546153846153846,
1614
+ "grad_norm": 0.09383740276098251,
1615
+ "learning_rate": 0.00013821538461538462,
1616
+ "loss": 0.1164,
1617
  "step": 2010
1618
  },
1619
  {
1620
  "epoch": 1.5538461538461539,
1621
+ "grad_norm": 0.005354443099349737,
1622
+ "learning_rate": 0.0001379076923076923,
1623
+ "loss": 0.1628,
1624
  "step": 2020
1625
  },
1626
  {
1627
  "epoch": 1.5615384615384615,
1628
+ "grad_norm": 0.48392564058303833,
1629
+ "learning_rate": 0.00013759999999999998,
1630
+ "loss": 0.107,
1631
  "step": 2030
1632
  },
1633
  {
1634
  "epoch": 1.5692307692307692,
1635
+ "grad_norm": 0.013797705993056297,
1636
  "learning_rate": 0.00013732307692307694,
1637
+ "loss": 0.1443,
1638
  "step": 2040
1639
  },
1640
  {
1641
  "epoch": 1.5769230769230769,
1642
+ "grad_norm": 0.0021225737873464823,
1643
  "learning_rate": 0.00013701538461538461,
1644
+ "loss": 0.0892,
1645
  "step": 2050
1646
  },
1647
  {
1648
  "epoch": 1.5846153846153848,
1649
+ "grad_norm": 0.409858763217926,
1650
  "learning_rate": 0.0001367076923076923,
1651
+ "loss": 0.1198,
1652
  "step": 2060
1653
  },
1654
  {
1655
  "epoch": 1.5923076923076924,
1656
+ "grad_norm": 0.07346770912408829,
1657
  "learning_rate": 0.0001364,
1658
+ "loss": 0.1431,
1659
  "step": 2070
1660
  },
1661
  {
1662
  "epoch": 1.6,
1663
+ "grad_norm": 0.07542140781879425,
1664
  "learning_rate": 0.0001360923076923077,
1665
+ "loss": 0.0692,
1666
  "step": 2080
1667
  },
1668
  {
1669
  "epoch": 1.6076923076923078,
1670
+ "grad_norm": 12.474515914916992,
1671
  "learning_rate": 0.0001357846153846154,
1672
+ "loss": 0.2697,
1673
  "step": 2090
1674
  },
1675
  {
1676
  "epoch": 1.6153846153846154,
1677
+ "grad_norm": 1.3258837461471558,
1678
  "learning_rate": 0.00013547692307692307,
1679
+ "loss": 0.1479,
1680
  "step": 2100
1681
  },
1682
  {
1683
  "epoch": 1.6153846153846154,
1684
+ "eval_accuracy": 0.7717546362339515,
1685
+ "eval_f1": 0.8021104948644624,
1686
+ "eval_loss": 1.2605870962142944,
1687
+ "eval_runtime": 12.6589,
1688
+ "eval_samples_per_second": 166.128,
1689
+ "eval_steps_per_second": 20.776,
1690
  "step": 2100
1691
  },
1692
  {
1693
  "epoch": 1.623076923076923,
1694
+ "grad_norm": 0.09133293479681015,
1695
  "learning_rate": 0.00013516923076923077,
1696
+ "loss": 0.0579,
1697
  "step": 2110
1698
  },
1699
  {
1700
  "epoch": 1.6307692307692307,
1701
+ "grad_norm": 0.13408294320106506,
1702
  "learning_rate": 0.00013486153846153847,
1703
+ "loss": 0.1774,
1704
  "step": 2120
1705
  },
1706
  {
1707
  "epoch": 1.6384615384615384,
1708
+ "grad_norm": 0.02371808886528015,
1709
  "learning_rate": 0.00013455384615384617,
1710
+ "loss": 0.1071,
1711
  "step": 2130
1712
  },
1713
  {
1714
  "epoch": 1.646153846153846,
1715
+ "grad_norm": 0.03445754200220108,
1716
  "learning_rate": 0.00013424615384615384,
1717
+ "loss": 0.063,
1718
  "step": 2140
1719
  },
1720
  {
1721
  "epoch": 1.6538461538461537,
1722
+ "grad_norm": 11.491479873657227,
1723
  "learning_rate": 0.00013393846153846153,
1724
+ "loss": 0.1706,
1725
  "step": 2150
1726
  },
1727
  {
1728
  "epoch": 1.6615384615384614,
1729
+ "grad_norm": 5.018512725830078,
1730
  "learning_rate": 0.00013363076923076926,
1731
+ "loss": 0.126,
1732
  "step": 2160
1733
  },
1734
  {
1735
  "epoch": 1.669230769230769,
1736
+ "grad_norm": 0.008263733237981796,
1737
  "learning_rate": 0.00013332307692307693,
1738
+ "loss": 0.1425,
1739
  "step": 2170
1740
  },
1741
  {
1742
  "epoch": 1.676923076923077,
1743
+ "grad_norm": 1.61963951587677,
1744
  "learning_rate": 0.00013301538461538463,
1745
+ "loss": 0.2236,
1746
  "step": 2180
1747
  },
1748
  {
1749
  "epoch": 1.6846153846153846,
1750
+ "grad_norm": 0.2399456948041916,
1751
  "learning_rate": 0.00013270769230769232,
1752
+ "loss": 0.2712,
1753
  "step": 2190
1754
  },
1755
  {
1756
  "epoch": 1.6923076923076923,
1757
+ "grad_norm": 16.2120418548584,
1758
  "learning_rate": 0.00013240000000000002,
1759
+ "loss": 0.1076,
1760
  "step": 2200
1761
  },
1762
  {
1763
  "epoch": 1.6923076923076923,
1764
+ "eval_accuracy": 0.8007608178792202,
1765
+ "eval_f1": 0.8183142194677944,
1766
+ "eval_loss": 0.9954975247383118,
1767
+ "eval_runtime": 12.719,
1768
+ "eval_samples_per_second": 165.344,
1769
+ "eval_steps_per_second": 20.678,
1770
  "step": 2200
1771
  },
1772
  {
1773
+ "epoch": 1.6923076923076923,
1774
+ "step": 2200,
1775
+ "total_flos": 1.3637892549580186e+18,
1776
+ "train_loss": 0.3320726641470736,
1777
+ "train_runtime": 627.5337,
1778
+ "train_samples_per_second": 82.824,
1779
+ "train_steps_per_second": 10.358
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1780
  }
1781
  ],
1782
  "logging_steps": 10,
 
1788
  "EarlyStoppingCallback": {
1789
  "args": {
1790
  "early_stopping_patience": 10,
1791
+ "early_stopping_threshold": 0.0001
1792
  },
1793
  "attributes": {
1794
  "early_stopping_patience_counter": 10
 
1805
  "attributes": {}
1806
  }
1807
  },
1808
+ "total_flos": 1.3637892549580186e+18,
1809
  "train_batch_size": 8,
1810
  "trial_name": null,
1811
  "trial_params": null