desarrolloasesoreslocales commited on
Commit
73037a6
·
verified ·
1 Parent(s): ce20f68

Training in progress, epoch 1

Browse files
all_results.json CHANGED
@@ -1,8 +1,13 @@
1
  {
2
- "epoch": 20.0,
3
- "total_flos": 1.2269789449924608e+18,
4
- "train_loss": 8.366611099243164,
5
- "train_runtime": 1658.1045,
6
- "train_samples_per_second": 41.759,
7
- "train_steps_per_second": 0.084
 
 
 
 
 
8
  }
 
1
  {
2
+ "epoch": 100.0,
3
+ "eval_accuracy": 0.7954939341421143,
4
+ "eval_loss": 0.8635059595108032,
5
+ "eval_runtime": 20.1007,
6
+ "eval_samples_per_second": 57.411,
7
+ "eval_steps_per_second": 0.497,
8
+ "total_flos": 6.134894724962304e+18,
9
+ "train_loss": 3.4289448138645717,
10
+ "train_runtime": 8283.6476,
11
+ "train_samples_per_second": 41.793,
12
+ "train_steps_per_second": 0.085
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 27.0,
3
- "eval_accuracy": 0.42857142857142855,
4
- "eval_loss": 2.1981732845306396,
5
- "eval_runtime": 1.4229,
6
- "eval_samples_per_second": 44.275,
7
- "eval_steps_per_second": 11.244
8
  }
 
1
  {
2
+ "epoch": 100.0,
3
+ "eval_accuracy": 0.7954939341421143,
4
+ "eval_loss": 0.8635059595108032,
5
+ "eval_runtime": 20.1007,
6
+ "eval_samples_per_second": 57.411,
7
+ "eval_steps_per_second": 0.497
8
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c717f0038a6816912731c1b8fd868289ea37b3f4b3420b7567c0c3655ddcba7
3
  size 78658852
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55a03acd8c0bad8dacd709687a5e47875bceea009c5507ad4867468667b387b4
3
  size 78658852
runs/Jan10_17-02-19_d30be481abf6/events.out.tfevents.1736542147.d30be481abf6.3489.5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a94820334bf537e2fa83f350b280dbd175864c36047bb05bf4df7a7d4621379
3
+ size 411
runs/Jan10_20-53-50_d30be481abf6/events.out.tfevents.1736542449.d30be481abf6.3489.6 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69e07875b82ac7207520ff12ce58f96c6d6f0cdcbb3366e79868a41686eb07ef
3
+ size 8807
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 20.0,
3
- "total_flos": 1.2269789449924608e+18,
4
- "train_loss": 8.366611099243164,
5
- "train_runtime": 1658.1045,
6
- "train_samples_per_second": 41.759,
7
- "train_steps_per_second": 0.084
8
  }
 
1
  {
2
+ "epoch": 100.0,
3
+ "total_flos": 6.134894724962304e+18,
4
+ "train_loss": 3.4289448138645717,
5
+ "train_runtime": 8283.6476,
6
+ "train_samples_per_second": 41.793,
7
+ "train_steps_per_second": 0.085
8
  }
trainer_state.json CHANGED
@@ -1,985 +1,1411 @@
1
  {
2
- "best_metric": 0.7140381282495667,
3
- "best_model_checkpoint": "cvt-13-normal/checkpoint-483",
4
- "epoch": 70.42857142857143,
5
  "eval_steps": 500,
6
- "global_step": 493,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "eval_accuracy": 0.5441941074523396,
14
- "eval_loss": 1.7207281589508057,
15
- "eval_runtime": 17.8598,
16
- "eval_samples_per_second": 64.614,
17
- "eval_steps_per_second": 0.56,
18
  "step": 7
19
  },
20
  {
21
  "epoch": 1.4285714285714286,
22
- "grad_norm": 5.889595031738281,
23
- "learning_rate": 4.2857142857142855e-06,
24
- "loss": 7.8206,
25
  "step": 10
26
  },
27
  {
28
  "epoch": 2.0,
29
- "eval_accuracy": 0.5476603119584056,
30
- "eval_loss": 1.7008943557739258,
31
- "eval_runtime": 17.8351,
32
- "eval_samples_per_second": 64.704,
33
- "eval_steps_per_second": 0.561,
34
  "step": 14
35
  },
36
  {
37
  "epoch": 2.857142857142857,
38
- "grad_norm": 5.1420416831970215,
39
- "learning_rate": 8.571428571428571e-06,
40
- "loss": 7.8605,
41
  "step": 20
42
  },
43
  {
44
  "epoch": 3.0,
45
- "eval_accuracy": 0.5476603119584056,
46
- "eval_loss": 1.6983410120010376,
47
- "eval_runtime": 18.0689,
48
- "eval_samples_per_second": 63.867,
49
- "eval_steps_per_second": 0.553,
50
  "step": 21
51
  },
52
  {
53
  "epoch": 4.0,
54
- "eval_accuracy": 0.5493934142114385,
55
- "eval_loss": 1.6871569156646729,
56
- "eval_runtime": 17.8993,
57
- "eval_samples_per_second": 64.472,
58
- "eval_steps_per_second": 0.559,
59
  "step": 28
60
  },
61
  {
62
  "epoch": 4.285714285714286,
63
- "grad_norm": 7.019086837768555,
64
- "learning_rate": 1.2857142857142857e-05,
65
- "loss": 7.6321,
66
  "step": 30
67
  },
68
  {
69
  "epoch": 5.0,
70
- "eval_accuracy": 0.5528596187175043,
71
- "eval_loss": 1.6777015924453735,
72
- "eval_runtime": 17.7744,
73
- "eval_samples_per_second": 64.925,
74
- "eval_steps_per_second": 0.563,
75
  "step": 35
76
  },
77
  {
78
  "epoch": 5.714285714285714,
79
- "grad_norm": 7.464715003967285,
80
- "learning_rate": 1.7142857142857142e-05,
81
- "loss": 7.6979,
82
  "step": 40
83
  },
84
  {
85
  "epoch": 6.0,
86
- "eval_accuracy": 0.5537261698440208,
87
- "eval_loss": 1.6649365425109863,
88
- "eval_runtime": 18.063,
89
- "eval_samples_per_second": 63.888,
90
- "eval_steps_per_second": 0.554,
91
  "step": 42
92
  },
93
  {
94
  "epoch": 7.0,
95
- "eval_accuracy": 0.5545927209705372,
96
- "eval_loss": 1.645348072052002,
97
- "eval_runtime": 17.915,
98
- "eval_samples_per_second": 64.415,
99
- "eval_steps_per_second": 0.558,
100
  "step": 49
101
  },
102
  {
103
  "epoch": 7.142857142857143,
104
- "grad_norm": 5.575019359588623,
105
- "learning_rate": 2.1428571428571428e-05,
106
- "loss": 7.6676,
107
  "step": 50
108
  },
109
  {
110
  "epoch": 8.0,
111
- "eval_accuracy": 0.5589254766031195,
112
- "eval_loss": 1.62311589717865,
113
- "eval_runtime": 18.2755,
114
- "eval_samples_per_second": 63.145,
115
- "eval_steps_per_second": 0.547,
116
  "step": 56
117
  },
118
  {
119
  "epoch": 8.571428571428571,
120
- "grad_norm": 5.814152717590332,
121
- "learning_rate": 2.5714285714285714e-05,
122
- "loss": 7.5464,
123
  "step": 60
124
  },
125
  {
126
  "epoch": 9.0,
127
- "eval_accuracy": 0.5597920277296361,
128
- "eval_loss": 1.6006678342819214,
129
- "eval_runtime": 17.9099,
130
- "eval_samples_per_second": 64.434,
131
- "eval_steps_per_second": 0.558,
132
  "step": 63
133
  },
134
  {
135
  "epoch": 10.0,
136
- "grad_norm": 13.754400253295898,
137
- "learning_rate": 3e-05,
138
- "loss": 7.5915,
139
  "step": 70
140
  },
141
  {
142
  "epoch": 10.0,
143
- "eval_accuracy": 0.5675909878682842,
144
- "eval_loss": 1.5768613815307617,
145
- "eval_runtime": 17.9864,
146
- "eval_samples_per_second": 64.16,
147
- "eval_steps_per_second": 0.556,
148
  "step": 70
149
  },
150
  {
151
  "epoch": 11.0,
152
- "eval_accuracy": 0.5701906412478336,
153
- "eval_loss": 1.5586786270141602,
154
- "eval_runtime": 17.9226,
155
- "eval_samples_per_second": 64.388,
156
- "eval_steps_per_second": 0.558,
157
  "step": 77
158
  },
159
  {
160
  "epoch": 11.428571428571429,
161
- "grad_norm": 5.879215717315674,
162
- "learning_rate": 2.9523809523809523e-05,
163
- "loss": 7.402,
164
  "step": 80
165
  },
166
  {
167
  "epoch": 12.0,
168
- "eval_accuracy": 0.5788561525129983,
169
- "eval_loss": 1.536378264427185,
170
- "eval_runtime": 17.7909,
171
- "eval_samples_per_second": 64.865,
172
- "eval_steps_per_second": 0.562,
173
  "step": 84
174
  },
175
  {
176
  "epoch": 12.857142857142858,
177
- "grad_norm": 6.11987829208374,
178
- "learning_rate": 2.904761904761905e-05,
179
- "loss": 7.3927,
180
  "step": 90
181
  },
182
  {
183
  "epoch": 13.0,
184
- "eval_accuracy": 0.5901213171577123,
185
- "eval_loss": 1.5154387950897217,
186
- "eval_runtime": 18.0226,
187
- "eval_samples_per_second": 64.031,
188
- "eval_steps_per_second": 0.555,
189
  "step": 91
190
  },
191
  {
192
  "epoch": 14.0,
193
- "eval_accuracy": 0.5909878682842288,
194
- "eval_loss": 1.4974240064620972,
195
- "eval_runtime": 18.0511,
196
- "eval_samples_per_second": 63.93,
197
- "eval_steps_per_second": 0.554,
198
  "step": 98
199
  },
200
  {
201
  "epoch": 14.285714285714286,
202
- "grad_norm": 5.535419940948486,
203
- "learning_rate": 2.857142857142857e-05,
204
- "loss": 7.3485,
205
  "step": 100
206
  },
207
  {
208
  "epoch": 15.0,
209
- "eval_accuracy": 0.598786828422877,
210
- "eval_loss": 1.4781272411346436,
211
- "eval_runtime": 17.949,
212
- "eval_samples_per_second": 64.293,
213
- "eval_steps_per_second": 0.557,
214
  "step": 105
215
  },
216
  {
217
  "epoch": 15.714285714285714,
218
- "grad_norm": 5.653200626373291,
219
- "learning_rate": 2.8095238095238096e-05,
220
- "loss": 7.2043,
221
  "step": 110
222
  },
223
  {
224
  "epoch": 16.0,
225
- "eval_accuracy": 0.6005199306759099,
226
- "eval_loss": 1.45522141456604,
227
- "eval_runtime": 17.8923,
228
- "eval_samples_per_second": 64.497,
229
- "eval_steps_per_second": 0.559,
230
  "step": 112
231
  },
232
  {
233
  "epoch": 17.0,
234
- "eval_accuracy": 0.608318890814558,
235
- "eval_loss": 1.4350148439407349,
236
- "eval_runtime": 17.9807,
237
- "eval_samples_per_second": 64.18,
238
- "eval_steps_per_second": 0.556,
239
  "step": 119
240
  },
241
  {
242
  "epoch": 17.142857142857142,
243
- "grad_norm": 7.7084059715271,
244
- "learning_rate": 2.761904761904762e-05,
245
- "loss": 7.1885,
246
  "step": 120
247
  },
248
  {
249
  "epoch": 18.0,
250
- "eval_accuracy": 0.6143847487001733,
251
- "eval_loss": 1.4141334295272827,
252
- "eval_runtime": 18.3637,
253
- "eval_samples_per_second": 62.841,
254
- "eval_steps_per_second": 0.545,
255
  "step": 126
256
  },
257
  {
258
  "epoch": 18.571428571428573,
259
- "grad_norm": 6.948986530303955,
260
- "learning_rate": 2.7142857142857144e-05,
261
- "loss": 6.9858,
262
  "step": 130
263
  },
264
  {
265
  "epoch": 19.0,
266
- "eval_accuracy": 0.6135181975736569,
267
- "eval_loss": 1.401322364807129,
268
- "eval_runtime": 17.9636,
269
- "eval_samples_per_second": 64.241,
270
- "eval_steps_per_second": 0.557,
271
  "step": 133
272
  },
273
  {
274
  "epoch": 20.0,
275
- "grad_norm": 15.973128318786621,
276
- "learning_rate": 2.6666666666666667e-05,
277
- "loss": 7.0214,
278
  "step": 140
279
  },
280
  {
281
  "epoch": 20.0,
282
- "eval_accuracy": 0.6187175043327556,
283
- "eval_loss": 1.3919869661331177,
284
- "eval_runtime": 18.3811,
285
- "eval_samples_per_second": 62.782,
286
- "eval_steps_per_second": 0.544,
287
  "step": 140
288
  },
289
  {
290
  "epoch": 21.0,
291
- "eval_accuracy": 0.6247833622183708,
292
- "eval_loss": 1.3702974319458008,
293
- "eval_runtime": 18.2422,
294
- "eval_samples_per_second": 63.26,
295
- "eval_steps_per_second": 0.548,
296
  "step": 147
297
  },
298
  {
299
  "epoch": 21.428571428571427,
300
- "grad_norm": 7.672400951385498,
301
- "learning_rate": 2.6190476190476192e-05,
302
- "loss": 6.8722,
303
  "step": 150
304
  },
305
  {
306
  "epoch": 22.0,
307
- "eval_accuracy": 0.6273830155979203,
308
- "eval_loss": 1.3540722131729126,
309
- "eval_runtime": 18.1835,
310
- "eval_samples_per_second": 63.464,
311
- "eval_steps_per_second": 0.55,
312
  "step": 154
313
  },
314
  {
315
  "epoch": 22.857142857142858,
316
- "grad_norm": 6.436771392822266,
317
- "learning_rate": 2.5714285714285714e-05,
318
- "loss": 6.8852,
319
  "step": 160
320
  },
321
  {
322
  "epoch": 23.0,
323
- "eval_accuracy": 0.6325823223570191,
324
- "eval_loss": 1.3347454071044922,
325
- "eval_runtime": 18.055,
326
- "eval_samples_per_second": 63.916,
327
- "eval_steps_per_second": 0.554,
328
  "step": 161
329
  },
330
  {
331
  "epoch": 24.0,
332
- "eval_accuracy": 0.634315424610052,
333
- "eval_loss": 1.3246690034866333,
334
- "eval_runtime": 18.2576,
335
- "eval_samples_per_second": 63.207,
336
- "eval_steps_per_second": 0.548,
337
  "step": 168
338
  },
339
  {
340
  "epoch": 24.285714285714285,
341
- "grad_norm": 6.365950584411621,
342
- "learning_rate": 2.523809523809524e-05,
343
- "loss": 6.7711,
344
  "step": 170
345
  },
346
  {
347
  "epoch": 25.0,
348
- "eval_accuracy": 0.6403812824956673,
349
- "eval_loss": 1.3150994777679443,
350
- "eval_runtime": 18.317,
351
- "eval_samples_per_second": 63.002,
352
- "eval_steps_per_second": 0.546,
353
  "step": 175
354
  },
355
  {
356
  "epoch": 25.714285714285715,
357
- "grad_norm": 6.6363525390625,
358
- "learning_rate": 2.4761904761904762e-05,
359
- "loss": 6.7634,
360
  "step": 180
361
  },
362
  {
363
  "epoch": 26.0,
364
- "eval_accuracy": 0.6429809358752167,
365
- "eval_loss": 1.293558955192566,
366
- "eval_runtime": 17.8053,
367
- "eval_samples_per_second": 64.812,
368
- "eval_steps_per_second": 0.562,
369
  "step": 182
370
  },
371
  {
372
  "epoch": 27.0,
373
- "eval_accuracy": 0.6403812824956673,
374
- "eval_loss": 1.2807742357254028,
375
- "eval_runtime": 17.8786,
376
- "eval_samples_per_second": 64.547,
377
- "eval_steps_per_second": 0.559,
378
  "step": 189
379
  },
380
  {
381
  "epoch": 27.142857142857142,
382
- "grad_norm": 6.076549530029297,
383
- "learning_rate": 2.4285714285714288e-05,
384
- "loss": 6.7208,
385
  "step": 190
386
  },
387
  {
388
  "epoch": 28.0,
389
- "eval_accuracy": 0.6421143847487002,
390
- "eval_loss": 1.2911421060562134,
391
- "eval_runtime": 18.0286,
392
- "eval_samples_per_second": 64.009,
393
- "eval_steps_per_second": 0.555,
394
  "step": 196
395
  },
396
  {
397
  "epoch": 28.571428571428573,
398
- "grad_norm": 7.341259479522705,
399
- "learning_rate": 2.380952380952381e-05,
400
- "loss": 6.7227,
401
  "step": 200
402
  },
403
  {
404
  "epoch": 29.0,
405
- "eval_accuracy": 0.6412478336221837,
406
- "eval_loss": 1.2630300521850586,
407
- "eval_runtime": 18.2787,
408
- "eval_samples_per_second": 63.134,
409
- "eval_steps_per_second": 0.547,
410
  "step": 203
411
  },
412
  {
413
  "epoch": 30.0,
414
- "grad_norm": 20.281497955322266,
415
- "learning_rate": 2.3333333333333336e-05,
416
- "loss": 6.5104,
417
  "step": 210
418
  },
419
  {
420
  "epoch": 30.0,
421
- "eval_accuracy": 0.6490467937608319,
422
- "eval_loss": 1.2508955001831055,
423
- "eval_runtime": 17.7789,
424
- "eval_samples_per_second": 64.908,
425
- "eval_steps_per_second": 0.562,
426
  "step": 210
427
  },
428
  {
429
  "epoch": 31.0,
430
- "eval_accuracy": 0.6507798960138648,
431
- "eval_loss": 1.2419097423553467,
432
- "eval_runtime": 17.7997,
433
- "eval_samples_per_second": 64.832,
434
- "eval_steps_per_second": 0.562,
435
  "step": 217
436
  },
437
  {
438
  "epoch": 31.428571428571427,
439
- "grad_norm": 5.900386333465576,
440
- "learning_rate": 2.2857142857142858e-05,
441
- "loss": 6.5766,
442
  "step": 220
443
  },
444
  {
445
  "epoch": 32.0,
446
- "eval_accuracy": 0.6516464471403813,
447
- "eval_loss": 1.2375727891921997,
448
- "eval_runtime": 17.8795,
449
- "eval_samples_per_second": 64.543,
450
- "eval_steps_per_second": 0.559,
451
  "step": 224
452
  },
453
  {
454
  "epoch": 32.857142857142854,
455
- "grad_norm": 5.9472551345825195,
456
- "learning_rate": 2.238095238095238e-05,
457
- "loss": 6.4044,
458
  "step": 230
459
  },
460
  {
461
  "epoch": 33.0,
462
- "eval_accuracy": 0.6499133448873483,
463
- "eval_loss": 1.2272541522979736,
464
- "eval_runtime": 17.9919,
465
- "eval_samples_per_second": 64.14,
466
- "eval_steps_per_second": 0.556,
467
  "step": 231
468
  },
469
  {
470
  "epoch": 34.0,
471
- "eval_accuracy": 0.6559792027729636,
472
- "eval_loss": 1.2196742296218872,
473
- "eval_runtime": 17.8267,
474
- "eval_samples_per_second": 64.734,
475
- "eval_steps_per_second": 0.561,
476
  "step": 238
477
  },
478
  {
479
  "epoch": 34.285714285714285,
480
- "grad_norm": 6.831246376037598,
481
- "learning_rate": 2.1904761904761903e-05,
482
- "loss": 6.5076,
483
  "step": 240
484
  },
485
  {
486
  "epoch": 35.0,
487
- "eval_accuracy": 0.6603119584055459,
488
- "eval_loss": 1.2083369493484497,
489
- "eval_runtime": 17.9273,
490
- "eval_samples_per_second": 64.371,
491
- "eval_steps_per_second": 0.558,
492
  "step": 245
493
  },
494
  {
495
  "epoch": 35.714285714285715,
496
- "grad_norm": 7.716729164123535,
497
- "learning_rate": 2.1428571428571428e-05,
498
- "loss": 6.3117,
499
  "step": 250
500
  },
501
  {
502
  "epoch": 36.0,
503
- "eval_accuracy": 0.6689774696707106,
504
- "eval_loss": 1.1937824487686157,
505
- "eval_runtime": 18.0584,
506
- "eval_samples_per_second": 63.904,
507
- "eval_steps_per_second": 0.554,
508
  "step": 252
509
  },
510
  {
511
  "epoch": 37.0,
512
- "eval_accuracy": 0.6759098786828422,
513
- "eval_loss": 1.186074137687683,
514
- "eval_runtime": 17.9647,
515
- "eval_samples_per_second": 64.237,
516
- "eval_steps_per_second": 0.557,
517
  "step": 259
518
  },
519
  {
520
  "epoch": 37.142857142857146,
521
- "grad_norm": 6.573192596435547,
522
- "learning_rate": 2.095238095238095e-05,
523
- "loss": 6.3993,
524
  "step": 260
525
  },
526
  {
527
  "epoch": 38.0,
528
- "eval_accuracy": 0.6767764298093587,
529
- "eval_loss": 1.1791296005249023,
530
- "eval_runtime": 17.8936,
531
- "eval_samples_per_second": 64.492,
532
  "eval_steps_per_second": 0.559,
533
  "step": 266
534
  },
535
  {
536
  "epoch": 38.57142857142857,
537
- "grad_norm": 6.08190393447876,
538
- "learning_rate": 2.0476190476190476e-05,
539
- "loss": 6.3086,
540
  "step": 270
541
  },
542
  {
543
  "epoch": 39.0,
544
- "eval_accuracy": 0.6663778162911612,
545
- "eval_loss": 1.182248592376709,
546
- "eval_runtime": 18.2175,
547
- "eval_samples_per_second": 63.346,
548
- "eval_steps_per_second": 0.549,
549
  "step": 273
550
  },
551
  {
552
  "epoch": 40.0,
553
- "grad_norm": 22.630414962768555,
554
- "learning_rate": 1.9999999999999998e-05,
555
- "loss": 6.1463,
556
  "step": 280
557
  },
558
  {
559
  "epoch": 40.0,
560
- "eval_accuracy": 0.6733102253032929,
561
- "eval_loss": 1.171223759651184,
562
- "eval_runtime": 17.9372,
563
- "eval_samples_per_second": 64.335,
564
- "eval_steps_per_second": 0.557,
565
  "step": 280
566
  },
567
  {
568
  "epoch": 41.0,
569
- "eval_accuracy": 0.6741767764298093,
570
- "eval_loss": 1.1596518754959106,
571
- "eval_runtime": 17.9876,
572
- "eval_samples_per_second": 64.155,
573
- "eval_steps_per_second": 0.556,
574
  "step": 287
575
  },
576
  {
577
  "epoch": 41.42857142857143,
578
- "grad_norm": 6.7577290534973145,
579
- "learning_rate": 1.9523809523809524e-05,
580
- "loss": 6.167,
581
  "step": 290
582
  },
583
  {
584
  "epoch": 42.0,
585
- "eval_accuracy": 0.682842287694974,
586
- "eval_loss": 1.1458425521850586,
587
- "eval_runtime": 18.1717,
588
- "eval_samples_per_second": 63.505,
589
- "eval_steps_per_second": 0.55,
590
  "step": 294
591
  },
592
  {
593
  "epoch": 42.857142857142854,
594
- "grad_norm": 6.671822547912598,
595
- "learning_rate": 1.9047619047619046e-05,
596
- "loss": 6.1862,
597
  "step": 300
598
  },
599
  {
600
  "epoch": 43.0,
601
- "eval_accuracy": 0.6845753899480069,
602
- "eval_loss": 1.1359864473342896,
603
- "eval_runtime": 17.8165,
604
- "eval_samples_per_second": 64.771,
605
- "eval_steps_per_second": 0.561,
606
  "step": 301
607
  },
608
  {
609
  "epoch": 44.0,
610
- "eval_accuracy": 0.6811091854419411,
611
- "eval_loss": 1.1295443773269653,
612
- "eval_runtime": 18.1519,
613
- "eval_samples_per_second": 63.575,
614
- "eval_steps_per_second": 0.551,
615
  "step": 308
616
  },
617
  {
618
  "epoch": 44.285714285714285,
619
- "grad_norm": 6.430041790008545,
620
- "learning_rate": 1.8571428571428572e-05,
621
- "loss": 6.2483,
622
  "step": 310
623
  },
624
  {
625
  "epoch": 45.0,
626
- "eval_accuracy": 0.6776429809358753,
627
- "eval_loss": 1.1391350030899048,
628
- "eval_runtime": 17.7669,
629
- "eval_samples_per_second": 64.952,
630
- "eval_steps_per_second": 0.563,
631
  "step": 315
632
  },
633
  {
634
  "epoch": 45.714285714285715,
635
- "grad_norm": 6.411828994750977,
636
- "learning_rate": 1.8095238095238094e-05,
637
- "loss": 6.1589,
638
  "step": 320
639
  },
640
  {
641
  "epoch": 46.0,
642
- "eval_accuracy": 0.6837088388214905,
643
- "eval_loss": 1.122534990310669,
644
- "eval_runtime": 17.9067,
645
- "eval_samples_per_second": 64.445,
646
- "eval_steps_per_second": 0.558,
647
  "step": 322
648
  },
649
  {
650
  "epoch": 47.0,
651
- "eval_accuracy": 0.6932409012131716,
652
- "eval_loss": 1.112987995147705,
653
- "eval_runtime": 18.1695,
654
- "eval_samples_per_second": 63.513,
655
- "eval_steps_per_second": 0.55,
656
  "step": 329
657
  },
658
  {
659
  "epoch": 47.142857142857146,
660
- "grad_norm": 9.403596878051758,
661
- "learning_rate": 1.761904761904762e-05,
662
- "loss": 6.0494,
663
  "step": 330
664
  },
665
  {
666
  "epoch": 48.0,
667
- "eval_accuracy": 0.6854419410745234,
668
- "eval_loss": 1.1091946363449097,
669
- "eval_runtime": 18.1367,
670
- "eval_samples_per_second": 63.628,
671
- "eval_steps_per_second": 0.551,
672
  "step": 336
673
  },
674
  {
675
  "epoch": 48.57142857142857,
676
- "grad_norm": 6.834399700164795,
677
- "learning_rate": 1.7142857142857142e-05,
678
- "loss": 6.0085,
679
  "step": 340
680
  },
681
  {
682
  "epoch": 49.0,
683
- "eval_accuracy": 0.6949740034662045,
684
- "eval_loss": 1.103259801864624,
685
- "eval_runtime": 18.1084,
686
- "eval_samples_per_second": 63.728,
687
- "eval_steps_per_second": 0.552,
688
  "step": 343
689
  },
690
  {
691
  "epoch": 50.0,
692
- "grad_norm": 15.006871223449707,
693
- "learning_rate": 1.6666666666666667e-05,
694
- "loss": 5.9398,
695
  "step": 350
696
  },
697
  {
698
  "epoch": 50.0,
699
- "eval_accuracy": 0.6915077989601387,
700
- "eval_loss": 1.1018480062484741,
701
- "eval_runtime": 18.1373,
702
- "eval_samples_per_second": 63.626,
703
- "eval_steps_per_second": 0.551,
704
  "step": 350
705
  },
706
  {
707
  "epoch": 51.0,
708
- "eval_accuracy": 0.6889081455805892,
709
- "eval_loss": 1.1009267568588257,
710
- "eval_runtime": 17.9979,
711
- "eval_samples_per_second": 64.118,
712
- "eval_steps_per_second": 0.556,
713
  "step": 357
714
  },
715
  {
716
  "epoch": 51.42857142857143,
717
- "grad_norm": 8.054534912109375,
718
- "learning_rate": 1.619047619047619e-05,
719
- "loss": 5.9514,
720
  "step": 360
721
  },
722
  {
723
  "epoch": 52.0,
724
- "eval_accuracy": 0.6941074523396881,
725
- "eval_loss": 1.090258002281189,
726
- "eval_runtime": 17.9851,
727
- "eval_samples_per_second": 64.164,
728
- "eval_steps_per_second": 0.556,
729
  "step": 364
730
  },
731
  {
732
  "epoch": 52.857142857142854,
733
- "grad_norm": 6.628889560699463,
734
- "learning_rate": 1.5714285714285715e-05,
735
- "loss": 5.8988,
736
  "step": 370
737
  },
738
  {
739
  "epoch": 53.0,
740
- "eval_accuracy": 0.6949740034662045,
741
- "eval_loss": 1.0866734981536865,
742
- "eval_runtime": 18.0959,
743
- "eval_samples_per_second": 63.771,
744
- "eval_steps_per_second": 0.553,
745
  "step": 371
746
  },
747
  {
748
  "epoch": 54.0,
749
- "eval_accuracy": 0.6967071057192374,
750
- "eval_loss": 1.0808851718902588,
751
- "eval_runtime": 18.2093,
752
- "eval_samples_per_second": 63.374,
753
- "eval_steps_per_second": 0.549,
754
  "step": 378
755
  },
756
  {
757
  "epoch": 54.285714285714285,
758
- "grad_norm": 8.836517333984375,
759
- "learning_rate": 1.5238095238095238e-05,
760
- "loss": 5.8841,
761
  "step": 380
762
  },
763
  {
764
  "epoch": 55.0,
765
- "eval_accuracy": 0.6915077989601387,
766
- "eval_loss": 1.0933481454849243,
767
- "eval_runtime": 17.7944,
768
- "eval_samples_per_second": 64.852,
769
- "eval_steps_per_second": 0.562,
770
  "step": 385
771
  },
772
  {
773
  "epoch": 55.714285714285715,
774
- "grad_norm": 7.778214931488037,
775
- "learning_rate": 1.4761904761904761e-05,
776
- "loss": 5.8698,
777
  "step": 390
778
  },
779
  {
780
  "epoch": 56.0,
781
- "eval_accuracy": 0.6975736568457539,
782
- "eval_loss": 1.08028244972229,
783
- "eval_runtime": 17.9491,
784
- "eval_samples_per_second": 64.293,
785
- "eval_steps_per_second": 0.557,
786
  "step": 392
787
  },
788
  {
789
  "epoch": 57.0,
790
- "eval_accuracy": 0.6993067590987868,
791
- "eval_loss": 1.0766432285308838,
792
- "eval_runtime": 17.9076,
793
- "eval_samples_per_second": 64.442,
794
- "eval_steps_per_second": 0.558,
795
  "step": 399
796
  },
797
  {
798
  "epoch": 57.142857142857146,
799
- "grad_norm": 6.896730899810791,
800
- "learning_rate": 1.4285714285714285e-05,
801
- "loss": 5.8995,
802
  "step": 400
803
  },
804
  {
805
  "epoch": 58.0,
806
- "eval_accuracy": 0.6949740034662045,
807
- "eval_loss": 1.0741863250732422,
808
- "eval_runtime": 17.9818,
809
- "eval_samples_per_second": 64.176,
810
- "eval_steps_per_second": 0.556,
811
  "step": 406
812
  },
813
  {
814
  "epoch": 58.57142857142857,
815
- "grad_norm": 9.20103931427002,
816
- "learning_rate": 1.380952380952381e-05,
817
- "loss": 5.7637,
818
  "step": 410
819
  },
820
  {
821
  "epoch": 59.0,
822
- "eval_accuracy": 0.7010398613518197,
823
- "eval_loss": 1.0637978315353394,
824
- "eval_runtime": 18.161,
825
- "eval_samples_per_second": 63.543,
826
- "eval_steps_per_second": 0.551,
827
  "step": 413
828
  },
829
  {
830
  "epoch": 60.0,
831
- "grad_norm": 30.250612258911133,
832
- "learning_rate": 1.3333333333333333e-05,
833
- "loss": 5.8425,
834
  "step": 420
835
  },
836
  {
837
  "epoch": 60.0,
838
- "eval_accuracy": 0.7036395147313691,
839
- "eval_loss": 1.0613973140716553,
840
- "eval_runtime": 18.3413,
841
- "eval_samples_per_second": 62.918,
842
- "eval_steps_per_second": 0.545,
843
  "step": 420
844
  },
845
  {
846
  "epoch": 61.0,
847
- "eval_accuracy": 0.7027729636048526,
848
- "eval_loss": 1.052578330039978,
849
- "eval_runtime": 18.0527,
850
- "eval_samples_per_second": 63.924,
851
- "eval_steps_per_second": 0.554,
852
  "step": 427
853
  },
854
  {
855
  "epoch": 61.42857142857143,
856
- "grad_norm": 6.542630672454834,
857
- "learning_rate": 1.2857142857142857e-05,
858
- "loss": 5.7953,
859
  "step": 430
860
  },
861
  {
862
  "epoch": 62.0,
863
- "eval_accuracy": 0.7097053726169844,
864
- "eval_loss": 1.0465185642242432,
865
- "eval_runtime": 17.8725,
866
- "eval_samples_per_second": 64.569,
867
- "eval_steps_per_second": 0.56,
868
  "step": 434
869
  },
870
  {
871
  "epoch": 62.857142857142854,
872
- "grad_norm": 7.69216775894165,
873
- "learning_rate": 1.2380952380952381e-05,
874
- "loss": 5.7288,
875
  "step": 440
876
  },
877
  {
878
  "epoch": 63.0,
879
- "eval_accuracy": 0.707105719237435,
880
- "eval_loss": 1.0427676439285278,
881
- "eval_runtime": 17.9866,
882
- "eval_samples_per_second": 64.159,
883
- "eval_steps_per_second": 0.556,
884
  "step": 441
885
  },
886
  {
887
  "epoch": 64.0,
888
- "eval_accuracy": 0.7079722703639515,
889
- "eval_loss": 1.0372076034545898,
890
- "eval_runtime": 18.052,
891
- "eval_samples_per_second": 63.926,
892
- "eval_steps_per_second": 0.554,
893
  "step": 448
894
  },
895
  {
896
  "epoch": 64.28571428571429,
897
- "grad_norm": 7.137964248657227,
898
- "learning_rate": 1.1904761904761905e-05,
899
- "loss": 5.6821,
900
  "step": 450
901
  },
902
  {
903
  "epoch": 65.0,
904
- "eval_accuracy": 0.7027729636048526,
905
- "eval_loss": 1.03859543800354,
906
- "eval_runtime": 18.0567,
907
- "eval_samples_per_second": 63.91,
908
- "eval_steps_per_second": 0.554,
909
  "step": 455
910
  },
911
  {
912
  "epoch": 65.71428571428571,
913
- "grad_norm": 7.532965660095215,
914
- "learning_rate": 1.1428571428571429e-05,
915
- "loss": 5.7068,
916
  "step": 460
917
  },
918
  {
919
  "epoch": 66.0,
920
- "eval_accuracy": 0.707105719237435,
921
- "eval_loss": 1.0439196825027466,
922
- "eval_runtime": 18.1635,
923
- "eval_samples_per_second": 63.534,
924
  "eval_steps_per_second": 0.551,
925
  "step": 462
926
  },
927
  {
928
  "epoch": 67.0,
929
- "eval_accuracy": 0.7062391681109186,
930
- "eval_loss": 1.0294309854507446,
931
- "eval_runtime": 17.9083,
932
- "eval_samples_per_second": 64.44,
933
- "eval_steps_per_second": 0.558,
934
  "step": 469
935
  },
936
  {
937
  "epoch": 67.14285714285714,
938
- "grad_norm": 6.9406609535217285,
939
- "learning_rate": 1.0952380952380951e-05,
940
- "loss": 5.716,
941
  "step": 470
942
  },
943
  {
944
  "epoch": 68.0,
945
- "eval_accuracy": 0.708838821490468,
946
- "eval_loss": 1.0337029695510864,
947
- "eval_runtime": 17.7972,
948
- "eval_samples_per_second": 64.842,
949
- "eval_steps_per_second": 0.562,
950
  "step": 476
951
  },
952
  {
953
  "epoch": 68.57142857142857,
954
- "grad_norm": 7.802277088165283,
955
- "learning_rate": 1.0476190476190475e-05,
956
- "loss": 5.7033,
957
  "step": 480
958
  },
959
  {
960
  "epoch": 69.0,
961
- "eval_accuracy": 0.7140381282495667,
962
- "eval_loss": 1.0229322910308838,
963
- "eval_runtime": 18.0063,
964
- "eval_samples_per_second": 64.089,
965
- "eval_steps_per_second": 0.555,
966
  "step": 483
967
  },
968
  {
969
  "epoch": 70.0,
970
- "grad_norm": 19.770496368408203,
971
- "learning_rate": 9.999999999999999e-06,
972
- "loss": 5.6381,
973
  "step": 490
974
  },
975
  {
976
  "epoch": 70.0,
977
- "eval_accuracy": 0.7140381282495667,
978
- "eval_loss": 1.022310733795166,
979
- "eval_runtime": 17.7667,
980
- "eval_samples_per_second": 64.953,
981
- "eval_steps_per_second": 0.563,
982
  "step": 490
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
983
  }
984
  ],
985
  "logging_steps": 10,
@@ -994,12 +1420,12 @@
994
  "should_evaluate": false,
995
  "should_log": false,
996
  "should_save": true,
997
- "should_training_stop": false
998
  },
999
  "attributes": {}
1000
  }
1001
  },
1002
- "total_flos": 4.294426307473613e+18,
1003
  "train_batch_size": 128,
1004
  "trial_name": null,
1005
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7954939341421143,
3
+ "best_model_checkpoint": "cvt-13-normal/checkpoint-700",
4
+ "epoch": 100.0,
5
  "eval_steps": 500,
6
+ "global_step": 700,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "eval_accuracy": 0.7105719237435009,
14
+ "eval_loss": 1.0209927558898926,
15
+ "eval_runtime": 17.9138,
16
+ "eval_samples_per_second": 64.419,
17
+ "eval_steps_per_second": 0.558,
18
  "step": 7
19
  },
20
  {
21
  "epoch": 1.4285714285714286,
22
+ "grad_norm": 7.967917442321777,
23
+ "learning_rate": 4.285714285714285e-05,
24
+ "loss": 5.5642,
25
  "step": 10
26
  },
27
  {
28
  "epoch": 2.0,
29
+ "eval_accuracy": 0.7097053726169844,
30
+ "eval_loss": 1.0071666240692139,
31
+ "eval_runtime": 17.7355,
32
+ "eval_samples_per_second": 65.067,
33
+ "eval_steps_per_second": 0.564,
34
  "step": 14
35
  },
36
  {
37
  "epoch": 2.857142857142857,
38
+ "grad_norm": 8.133280754089355,
39
+ "learning_rate": 8.57142857142857e-05,
40
+ "loss": 5.662,
41
  "step": 20
42
  },
43
  {
44
  "epoch": 3.0,
45
+ "eval_accuracy": 0.708838821490468,
46
+ "eval_loss": 1.0150678157806396,
47
+ "eval_runtime": 17.8577,
48
+ "eval_samples_per_second": 64.622,
49
+ "eval_steps_per_second": 0.56,
50
  "step": 21
51
  },
52
  {
53
  "epoch": 4.0,
54
+ "eval_accuracy": 0.7140381282495667,
55
+ "eval_loss": 1.0016363859176636,
56
+ "eval_runtime": 17.837,
57
+ "eval_samples_per_second": 64.697,
58
+ "eval_steps_per_second": 0.561,
59
  "step": 28
60
  },
61
  {
62
  "epoch": 4.285714285714286,
63
+ "grad_norm": 8.433135986328125,
64
+ "learning_rate": 0.00012857142857142855,
65
+ "loss": 5.381,
66
  "step": 30
67
  },
68
  {
69
  "epoch": 5.0,
70
+ "eval_accuracy": 0.7123050259965338,
71
+ "eval_loss": 1.0119163990020752,
72
+ "eval_runtime": 17.7345,
73
+ "eval_samples_per_second": 65.071,
74
+ "eval_steps_per_second": 0.564,
75
  "step": 35
76
  },
77
  {
78
  "epoch": 5.714285714285714,
79
+ "grad_norm": 9.856744766235352,
80
+ "learning_rate": 0.0001714285714285714,
81
+ "loss": 5.3348,
82
  "step": 40
83
  },
84
  {
85
  "epoch": 6.0,
86
+ "eval_accuracy": 0.720103986135182,
87
+ "eval_loss": 0.9661750793457031,
88
+ "eval_runtime": 17.9039,
89
+ "eval_samples_per_second": 64.455,
90
+ "eval_steps_per_second": 0.559,
91
  "step": 42
92
  },
93
  {
94
  "epoch": 7.0,
95
+ "eval_accuracy": 0.7261698440207972,
96
+ "eval_loss": 0.9513705372810364,
97
+ "eval_runtime": 17.8649,
98
+ "eval_samples_per_second": 64.596,
99
+ "eval_steps_per_second": 0.56,
100
  "step": 49
101
  },
102
  {
103
  "epoch": 7.142857142857143,
104
+ "grad_norm": 10.7362699508667,
105
+ "learning_rate": 0.00021428571428571427,
106
+ "loss": 5.2423,
107
  "step": 50
108
  },
109
  {
110
  "epoch": 8.0,
111
+ "eval_accuracy": 0.7105719237435009,
112
+ "eval_loss": 0.9588707685470581,
113
+ "eval_runtime": 17.8964,
114
+ "eval_samples_per_second": 64.482,
115
+ "eval_steps_per_second": 0.559,
116
  "step": 56
117
  },
118
  {
119
  "epoch": 8.571428571428571,
120
+ "grad_norm": 11.099422454833984,
121
+ "learning_rate": 0.0002571428571428571,
122
+ "loss": 5.0251,
123
  "step": 60
124
  },
125
  {
126
  "epoch": 9.0,
127
+ "eval_accuracy": 0.7279029462738301,
128
+ "eval_loss": 0.908963680267334,
129
+ "eval_runtime": 17.9404,
130
+ "eval_samples_per_second": 64.324,
131
+ "eval_steps_per_second": 0.557,
132
  "step": 63
133
  },
134
  {
135
  "epoch": 10.0,
136
+ "grad_norm": 16.643394470214844,
137
+ "learning_rate": 0.0003,
138
+ "loss": 5.0547,
139
  "step": 70
140
  },
141
  {
142
  "epoch": 10.0,
143
+ "eval_accuracy": 0.7123050259965338,
144
+ "eval_loss": 0.9352001547813416,
145
+ "eval_runtime": 18.2788,
146
+ "eval_samples_per_second": 63.133,
147
+ "eval_steps_per_second": 0.547,
148
  "step": 70
149
  },
150
  {
151
  "epoch": 11.0,
152
+ "eval_accuracy": 0.6993067590987868,
153
+ "eval_loss": 1.0062916278839111,
154
+ "eval_runtime": 17.9901,
155
+ "eval_samples_per_second": 64.146,
156
+ "eval_steps_per_second": 0.556,
157
  "step": 77
158
  },
159
  {
160
  "epoch": 11.428571428571429,
161
+ "grad_norm": 9.376890182495117,
162
+ "learning_rate": 0.0002952380952380952,
163
+ "loss": 4.8246,
164
  "step": 80
165
  },
166
  {
167
  "epoch": 12.0,
168
+ "eval_accuracy": 0.7105719237435009,
169
+ "eval_loss": 0.9190986752510071,
170
+ "eval_runtime": 18.1793,
171
+ "eval_samples_per_second": 63.479,
172
+ "eval_steps_per_second": 0.55,
173
  "step": 84
174
  },
175
  {
176
  "epoch": 12.857142857142858,
177
+ "grad_norm": 7.629549026489258,
178
+ "learning_rate": 0.00029047619047619045,
179
+ "loss": 4.7811,
180
  "step": 90
181
  },
182
  {
183
  "epoch": 13.0,
184
+ "eval_accuracy": 0.7123050259965338,
185
+ "eval_loss": 0.9947251677513123,
186
+ "eval_runtime": 17.9036,
187
+ "eval_samples_per_second": 64.456,
188
+ "eval_steps_per_second": 0.559,
189
  "step": 91
190
  },
191
  {
192
  "epoch": 14.0,
193
+ "eval_accuracy": 0.7175043327556326,
194
+ "eval_loss": 0.9671235084533691,
195
+ "eval_runtime": 18.1306,
196
+ "eval_samples_per_second": 63.649,
197
+ "eval_steps_per_second": 0.552,
198
  "step": 98
199
  },
200
  {
201
  "epoch": 14.285714285714286,
202
+ "grad_norm": 13.771581649780273,
203
+ "learning_rate": 0.0002857142857142857,
204
+ "loss": 4.8234,
205
  "step": 100
206
  },
207
  {
208
  "epoch": 15.0,
209
+ "eval_accuracy": 0.7235701906412478,
210
+ "eval_loss": 0.9055125117301941,
211
+ "eval_runtime": 18.3144,
212
+ "eval_samples_per_second": 63.01,
213
+ "eval_steps_per_second": 0.546,
214
  "step": 105
215
  },
216
  {
217
  "epoch": 15.714285714285714,
218
+ "grad_norm": 9.288651466369629,
219
+ "learning_rate": 0.0002809523809523809,
220
+ "loss": 4.4787,
221
  "step": 110
222
  },
223
  {
224
  "epoch": 16.0,
225
+ "eval_accuracy": 0.744367417677643,
226
+ "eval_loss": 0.8837802410125732,
227
+ "eval_runtime": 18.2071,
228
+ "eval_samples_per_second": 63.382,
229
+ "eval_steps_per_second": 0.549,
230
  "step": 112
231
  },
232
  {
233
  "epoch": 17.0,
234
+ "eval_accuracy": 0.729636048526863,
235
+ "eval_loss": 0.9059325456619263,
236
+ "eval_runtime": 18.0331,
237
+ "eval_samples_per_second": 63.994,
238
+ "eval_steps_per_second": 0.555,
239
  "step": 119
240
  },
241
  {
242
  "epoch": 17.142857142857142,
243
+ "grad_norm": 8.790782928466797,
244
+ "learning_rate": 0.00027619047619047615,
245
+ "loss": 4.39,
246
  "step": 120
247
  },
248
  {
249
  "epoch": 18.0,
250
+ "eval_accuracy": 0.7461005199306759,
251
+ "eval_loss": 0.8639523983001709,
252
+ "eval_runtime": 18.0609,
253
+ "eval_samples_per_second": 63.895,
254
+ "eval_steps_per_second": 0.554,
255
  "step": 126
256
  },
257
  {
258
  "epoch": 18.571428571428573,
259
+ "grad_norm": 7.883941650390625,
260
+ "learning_rate": 0.0002714285714285714,
261
+ "loss": 4.1424,
262
  "step": 130
263
  },
264
  {
265
  "epoch": 19.0,
266
+ "eval_accuracy": 0.7487001733102253,
267
+ "eval_loss": 0.8660562634468079,
268
+ "eval_runtime": 17.7478,
269
+ "eval_samples_per_second": 65.022,
270
+ "eval_steps_per_second": 0.563,
271
  "step": 133
272
  },
273
  {
274
  "epoch": 20.0,
275
+ "grad_norm": 21.828325271606445,
276
+ "learning_rate": 0.0002666666666666666,
277
+ "loss": 4.1065,
278
  "step": 140
279
  },
280
  {
281
  "epoch": 20.0,
282
+ "eval_accuracy": 0.7305025996533796,
283
+ "eval_loss": 0.9056758284568787,
284
+ "eval_runtime": 17.8484,
285
+ "eval_samples_per_second": 64.656,
286
+ "eval_steps_per_second": 0.56,
287
  "step": 140
288
  },
289
  {
290
  "epoch": 21.0,
291
+ "eval_accuracy": 0.7348353552859619,
292
+ "eval_loss": 0.8865219354629517,
293
+ "eval_runtime": 18.0329,
294
+ "eval_samples_per_second": 63.994,
295
+ "eval_steps_per_second": 0.555,
296
  "step": 147
297
  },
298
  {
299
  "epoch": 21.428571428571427,
300
+ "grad_norm": 7.540792465209961,
301
+ "learning_rate": 0.00026190476190476186,
302
+ "loss": 4.0844,
303
  "step": 150
304
  },
305
  {
306
  "epoch": 22.0,
307
+ "eval_accuracy": 0.7391681109185442,
308
+ "eval_loss": 0.8928019404411316,
309
+ "eval_runtime": 17.9197,
310
+ "eval_samples_per_second": 64.398,
311
+ "eval_steps_per_second": 0.558,
312
  "step": 154
313
  },
314
  {
315
  "epoch": 22.857142857142858,
316
+ "grad_norm": 14.240620613098145,
317
+ "learning_rate": 0.0002571428571428571,
318
+ "loss": 3.9835,
319
  "step": 160
320
  },
321
  {
322
  "epoch": 23.0,
323
+ "eval_accuracy": 0.7538994800693241,
324
+ "eval_loss": 0.8675404787063599,
325
+ "eval_runtime": 18.0176,
326
+ "eval_samples_per_second": 64.048,
327
+ "eval_steps_per_second": 0.555,
328
  "step": 161
329
  },
330
  {
331
  "epoch": 24.0,
332
+ "eval_accuracy": 0.755632582322357,
333
+ "eval_loss": 0.8828888535499573,
334
+ "eval_runtime": 17.7466,
335
+ "eval_samples_per_second": 65.027,
336
+ "eval_steps_per_second": 0.563,
337
  "step": 168
338
  },
339
  {
340
  "epoch": 24.285714285714285,
341
+ "grad_norm": 8.749543190002441,
342
+ "learning_rate": 0.0002523809523809524,
343
+ "loss": 3.8199,
344
  "step": 170
345
  },
346
  {
347
  "epoch": 25.0,
348
+ "eval_accuracy": 0.7616984402079723,
349
+ "eval_loss": 0.8176947832107544,
350
+ "eval_runtime": 17.983,
351
+ "eval_samples_per_second": 64.172,
352
+ "eval_steps_per_second": 0.556,
353
  "step": 175
354
  },
355
  {
356
  "epoch": 25.714285714285715,
357
+ "grad_norm": 9.475801467895508,
358
+ "learning_rate": 0.00024761904761904757,
359
+ "loss": 3.7898,
360
  "step": 180
361
  },
362
  {
363
  "epoch": 26.0,
364
+ "eval_accuracy": 0.7461005199306759,
365
+ "eval_loss": 0.8885547518730164,
366
+ "eval_runtime": 18.0273,
367
+ "eval_samples_per_second": 64.014,
368
+ "eval_steps_per_second": 0.555,
369
  "step": 182
370
  },
371
  {
372
  "epoch": 27.0,
373
+ "eval_accuracy": 0.7461005199306759,
374
+ "eval_loss": 0.9394861459732056,
375
+ "eval_runtime": 18.1419,
376
+ "eval_samples_per_second": 63.61,
377
+ "eval_steps_per_second": 0.551,
378
  "step": 189
379
  },
380
  {
381
  "epoch": 27.142857142857142,
382
+ "grad_norm": 7.944543361663818,
383
+ "learning_rate": 0.00024285714285714283,
384
+ "loss": 3.7734,
385
  "step": 190
386
  },
387
  {
388
  "epoch": 28.0,
389
+ "eval_accuracy": 0.7608318890814558,
390
+ "eval_loss": 0.8348239064216614,
391
+ "eval_runtime": 17.9109,
392
+ "eval_samples_per_second": 64.43,
393
+ "eval_steps_per_second": 0.558,
394
  "step": 196
395
  },
396
  {
397
  "epoch": 28.571428571428573,
398
+ "grad_norm": 9.20173168182373,
399
+ "learning_rate": 0.00023809523809523807,
400
+ "loss": 3.7835,
401
  "step": 200
402
  },
403
  {
404
  "epoch": 29.0,
405
+ "eval_accuracy": 0.75736568457539,
406
+ "eval_loss": 0.836903989315033,
407
+ "eval_runtime": 18.1677,
408
+ "eval_samples_per_second": 63.519,
409
+ "eval_steps_per_second": 0.55,
410
  "step": 203
411
  },
412
  {
413
  "epoch": 30.0,
414
+ "grad_norm": 17.463150024414062,
415
+ "learning_rate": 0.0002333333333333333,
416
+ "loss": 3.6414,
417
  "step": 210
418
  },
419
  {
420
  "epoch": 30.0,
421
+ "eval_accuracy": 0.7660311958405546,
422
+ "eval_loss": 0.8668186664581299,
423
+ "eval_runtime": 17.8247,
424
+ "eval_samples_per_second": 64.742,
425
+ "eval_steps_per_second": 0.561,
426
  "step": 210
427
  },
428
  {
429
  "epoch": 31.0,
430
+ "eval_accuracy": 0.7599653379549394,
431
+ "eval_loss": 0.8909233808517456,
432
+ "eval_runtime": 18.1581,
433
+ "eval_samples_per_second": 63.553,
434
+ "eval_steps_per_second": 0.551,
435
  "step": 217
436
  },
437
  {
438
  "epoch": 31.428571428571427,
439
+ "grad_norm": 13.756216049194336,
440
+ "learning_rate": 0.00022857142857142854,
441
+ "loss": 3.5076,
442
  "step": 220
443
  },
444
  {
445
  "epoch": 32.0,
446
+ "eval_accuracy": 0.7495667244367418,
447
+ "eval_loss": 0.8795309066772461,
448
+ "eval_runtime": 17.8514,
449
+ "eval_samples_per_second": 64.645,
450
+ "eval_steps_per_second": 0.56,
451
  "step": 224
452
  },
453
  {
454
  "epoch": 32.857142857142854,
455
+ "grad_norm": 9.03218936920166,
456
+ "learning_rate": 0.0002238095238095238,
457
+ "loss": 3.5447,
458
  "step": 230
459
  },
460
  {
461
  "epoch": 33.0,
462
+ "eval_accuracy": 0.7538994800693241,
463
+ "eval_loss": 0.9227800369262695,
464
+ "eval_runtime": 17.9657,
465
+ "eval_samples_per_second": 64.233,
466
+ "eval_steps_per_second": 0.557,
467
  "step": 231
468
  },
469
  {
470
  "epoch": 34.0,
471
+ "eval_accuracy": 0.7521663778162911,
472
+ "eval_loss": 0.8850377798080444,
473
+ "eval_runtime": 17.9906,
474
+ "eval_samples_per_second": 64.144,
475
+ "eval_steps_per_second": 0.556,
476
  "step": 238
477
  },
478
  {
479
  "epoch": 34.285714285714285,
480
+ "grad_norm": 7.675583839416504,
481
+ "learning_rate": 0.000219047619047619,
482
+ "loss": 3.5344,
483
  "step": 240
484
  },
485
  {
486
  "epoch": 35.0,
487
+ "eval_accuracy": 0.7651646447140381,
488
+ "eval_loss": 0.8584573864936829,
489
+ "eval_runtime": 18.1255,
490
+ "eval_samples_per_second": 63.667,
491
+ "eval_steps_per_second": 0.552,
492
  "step": 245
493
  },
494
  {
495
  "epoch": 35.714285714285715,
496
+ "grad_norm": 7.848378658294678,
497
+ "learning_rate": 0.00021428571428571427,
498
+ "loss": 3.3678,
499
  "step": 250
500
  },
501
  {
502
  "epoch": 36.0,
503
+ "eval_accuracy": 0.75736568457539,
504
+ "eval_loss": 0.8631114959716797,
505
+ "eval_runtime": 18.0275,
506
+ "eval_samples_per_second": 64.013,
507
+ "eval_steps_per_second": 0.555,
508
  "step": 252
509
  },
510
  {
511
  "epoch": 37.0,
512
+ "eval_accuracy": 0.770363951473137,
513
+ "eval_loss": 0.8675860166549683,
514
+ "eval_runtime": 18.0196,
515
+ "eval_samples_per_second": 64.042,
516
+ "eval_steps_per_second": 0.555,
517
  "step": 259
518
  },
519
  {
520
  "epoch": 37.142857142857146,
521
+ "grad_norm": 9.06800651550293,
522
+ "learning_rate": 0.00020952380952380948,
523
+ "loss": 3.4061,
524
  "step": 260
525
  },
526
  {
527
  "epoch": 38.0,
528
+ "eval_accuracy": 0.7616984402079723,
529
+ "eval_loss": 0.9131080508232117,
530
+ "eval_runtime": 17.9025,
531
+ "eval_samples_per_second": 64.46,
532
  "eval_steps_per_second": 0.559,
533
  "step": 266
534
  },
535
  {
536
  "epoch": 38.57142857142857,
537
+ "grad_norm": 11.665525436401367,
538
+ "learning_rate": 0.00020476190476190475,
539
+ "loss": 3.3177,
540
  "step": 270
541
  },
542
  {
543
  "epoch": 39.0,
544
+ "eval_accuracy": 0.7677642980935875,
545
+ "eval_loss": 0.8631002902984619,
546
+ "eval_runtime": 17.9771,
547
+ "eval_samples_per_second": 64.193,
548
+ "eval_steps_per_second": 0.556,
549
  "step": 273
550
  },
551
  {
552
  "epoch": 40.0,
553
+ "grad_norm": 15.023707389831543,
554
+ "learning_rate": 0.00019999999999999998,
555
+ "loss": 3.2767,
556
  "step": 280
557
  },
558
  {
559
  "epoch": 40.0,
560
+ "eval_accuracy": 0.7642980935875217,
561
+ "eval_loss": 0.8802210092544556,
562
+ "eval_runtime": 17.9247,
563
+ "eval_samples_per_second": 64.381,
564
+ "eval_steps_per_second": 0.558,
565
  "step": 280
566
  },
567
  {
568
  "epoch": 41.0,
569
+ "eval_accuracy": 0.7677642980935875,
570
+ "eval_loss": 0.8518037796020508,
571
+ "eval_runtime": 18.183,
572
+ "eval_samples_per_second": 63.466,
573
+ "eval_steps_per_second": 0.55,
574
  "step": 287
575
  },
576
  {
577
  "epoch": 41.42857142857143,
578
+ "grad_norm": 8.431020736694336,
579
+ "learning_rate": 0.00019523809523809522,
580
+ "loss": 3.1992,
581
  "step": 290
582
  },
583
  {
584
  "epoch": 42.0,
585
+ "eval_accuracy": 0.75736568457539,
586
+ "eval_loss": 0.923156201839447,
587
+ "eval_runtime": 18.0318,
588
+ "eval_samples_per_second": 63.998,
589
+ "eval_steps_per_second": 0.555,
590
  "step": 294
591
  },
592
  {
593
  "epoch": 42.857142857142854,
594
+ "grad_norm": 8.130815505981445,
595
+ "learning_rate": 0.00019047619047619045,
596
+ "loss": 3.2743,
597
  "step": 300
598
  },
599
  {
600
  "epoch": 43.0,
601
+ "eval_accuracy": 0.7521663778162911,
602
+ "eval_loss": 0.9305623173713684,
603
+ "eval_runtime": 17.9901,
604
+ "eval_samples_per_second": 64.146,
605
+ "eval_steps_per_second": 0.556,
606
  "step": 301
607
  },
608
  {
609
  "epoch": 44.0,
610
+ "eval_accuracy": 0.7755632582322357,
611
+ "eval_loss": 0.8419708013534546,
612
+ "eval_runtime": 17.9031,
613
+ "eval_samples_per_second": 64.458,
614
+ "eval_steps_per_second": 0.559,
615
  "step": 308
616
  },
617
  {
618
  "epoch": 44.285714285714285,
619
+ "grad_norm": 9.007019996643066,
620
+ "learning_rate": 0.00018571428571428572,
621
+ "loss": 3.1704,
622
  "step": 310
623
  },
624
  {
625
  "epoch": 45.0,
626
+ "eval_accuracy": 0.7564991334488734,
627
+ "eval_loss": 0.8801714777946472,
628
+ "eval_runtime": 17.8984,
629
+ "eval_samples_per_second": 64.475,
630
+ "eval_steps_per_second": 0.559,
631
  "step": 315
632
  },
633
  {
634
  "epoch": 45.714285714285715,
635
+ "grad_norm": 8.079572677612305,
636
+ "learning_rate": 0.00018095238095238093,
637
+ "loss": 3.2466,
638
  "step": 320
639
  },
640
  {
641
  "epoch": 46.0,
642
+ "eval_accuracy": 0.7677642980935875,
643
+ "eval_loss": 0.878183901309967,
644
+ "eval_runtime": 18.135,
645
+ "eval_samples_per_second": 63.634,
646
+ "eval_steps_per_second": 0.551,
647
  "step": 322
648
  },
649
  {
650
  "epoch": 47.0,
651
+ "eval_accuracy": 0.7746967071057193,
652
+ "eval_loss": 0.844364583492279,
653
+ "eval_runtime": 18.003,
654
+ "eval_samples_per_second": 64.1,
655
+ "eval_steps_per_second": 0.555,
656
  "step": 329
657
  },
658
  {
659
  "epoch": 47.142857142857146,
660
+ "grad_norm": 6.920067310333252,
661
+ "learning_rate": 0.0001761904761904762,
662
+ "loss": 3.0879,
663
  "step": 330
664
  },
665
  {
666
  "epoch": 48.0,
667
+ "eval_accuracy": 0.7694974003466204,
668
+ "eval_loss": 0.8579216003417969,
669
+ "eval_runtime": 17.8532,
670
+ "eval_samples_per_second": 64.638,
671
+ "eval_steps_per_second": 0.56,
672
  "step": 336
673
  },
674
  {
675
  "epoch": 48.57142857142857,
676
+ "grad_norm": 6.670530796051025,
677
+ "learning_rate": 0.0001714285714285714,
678
+ "loss": 3.1677,
679
  "step": 340
680
  },
681
  {
682
  "epoch": 49.0,
683
+ "eval_accuracy": 0.7712305025996534,
684
+ "eval_loss": 0.858402669429779,
685
+ "eval_runtime": 17.75,
686
+ "eval_samples_per_second": 65.014,
687
+ "eval_steps_per_second": 0.563,
688
  "step": 343
689
  },
690
  {
691
  "epoch": 50.0,
692
+ "grad_norm": 13.106241226196289,
693
+ "learning_rate": 0.00016666666666666666,
694
+ "loss": 3.0965,
695
  "step": 350
696
  },
697
  {
698
  "epoch": 50.0,
699
+ "eval_accuracy": 0.7755632582322357,
700
+ "eval_loss": 0.8400810956954956,
701
+ "eval_runtime": 18.0075,
702
+ "eval_samples_per_second": 64.084,
703
+ "eval_steps_per_second": 0.555,
704
  "step": 350
705
  },
706
  {
707
  "epoch": 51.0,
708
+ "eval_accuracy": 0.7651646447140381,
709
+ "eval_loss": 0.8724238872528076,
710
+ "eval_runtime": 18.0097,
711
+ "eval_samples_per_second": 64.077,
712
+ "eval_steps_per_second": 0.555,
713
  "step": 357
714
  },
715
  {
716
  "epoch": 51.42857142857143,
717
+ "grad_norm": 8.85236930847168,
718
+ "learning_rate": 0.00016190476190476187,
719
+ "loss": 3.0611,
720
  "step": 360
721
  },
722
  {
723
  "epoch": 52.0,
724
+ "eval_accuracy": 0.7807625649913345,
725
+ "eval_loss": 0.8638470768928528,
726
+ "eval_runtime": 18.0439,
727
+ "eval_samples_per_second": 63.955,
728
+ "eval_steps_per_second": 0.554,
729
  "step": 364
730
  },
731
  {
732
  "epoch": 52.857142857142854,
733
+ "grad_norm": 7.648194789886475,
734
+ "learning_rate": 0.00015714285714285713,
735
+ "loss": 3.0204,
736
  "step": 370
737
  },
738
  {
739
  "epoch": 53.0,
740
+ "eval_accuracy": 0.7660311958405546,
741
+ "eval_loss": 0.9167099595069885,
742
+ "eval_runtime": 17.9056,
743
+ "eval_samples_per_second": 64.449,
744
+ "eval_steps_per_second": 0.558,
745
  "step": 371
746
  },
747
  {
748
  "epoch": 54.0,
749
+ "eval_accuracy": 0.7738301559792028,
750
+ "eval_loss": 0.8322371244430542,
751
+ "eval_runtime": 17.9741,
752
+ "eval_samples_per_second": 64.204,
753
+ "eval_steps_per_second": 0.556,
754
  "step": 378
755
  },
756
  {
757
  "epoch": 54.285714285714285,
758
+ "grad_norm": 6.742936611175537,
759
+ "learning_rate": 0.00015238095238095237,
760
+ "loss": 2.9704,
761
  "step": 380
762
  },
763
  {
764
  "epoch": 55.0,
765
+ "eval_accuracy": 0.7642980935875217,
766
+ "eval_loss": 0.8577215671539307,
767
+ "eval_runtime": 18.0258,
768
+ "eval_samples_per_second": 64.019,
769
+ "eval_steps_per_second": 0.555,
770
  "step": 385
771
  },
772
  {
773
  "epoch": 55.714285714285715,
774
+ "grad_norm": 6.2735395431518555,
775
+ "learning_rate": 0.0001476190476190476,
776
+ "loss": 2.939,
777
  "step": 390
778
  },
779
  {
780
  "epoch": 56.0,
781
+ "eval_accuracy": 0.7859618717504333,
782
+ "eval_loss": 0.8296905755996704,
783
+ "eval_runtime": 18.0649,
784
+ "eval_samples_per_second": 63.881,
785
+ "eval_steps_per_second": 0.554,
786
  "step": 392
787
  },
788
  {
789
  "epoch": 57.0,
790
+ "eval_accuracy": 0.7686308492201039,
791
+ "eval_loss": 0.874596893787384,
792
+ "eval_runtime": 17.9658,
793
+ "eval_samples_per_second": 64.233,
794
+ "eval_steps_per_second": 0.557,
795
  "step": 399
796
  },
797
  {
798
  "epoch": 57.142857142857146,
799
+ "grad_norm": 6.44887113571167,
800
+ "learning_rate": 0.00014285714285714284,
801
+ "loss": 3.0341,
802
  "step": 400
803
  },
804
  {
805
  "epoch": 58.0,
806
+ "eval_accuracy": 0.7824956672443674,
807
+ "eval_loss": 0.8620171546936035,
808
+ "eval_runtime": 17.939,
809
+ "eval_samples_per_second": 64.329,
810
+ "eval_steps_per_second": 0.557,
811
  "step": 406
812
  },
813
  {
814
  "epoch": 58.57142857142857,
815
+ "grad_norm": 6.199102401733398,
816
+ "learning_rate": 0.00013809523809523808,
817
+ "loss": 2.8997,
818
  "step": 410
819
  },
820
  {
821
  "epoch": 59.0,
822
+ "eval_accuracy": 0.75736568457539,
823
+ "eval_loss": 0.8835130333900452,
824
+ "eval_runtime": 18.2434,
825
+ "eval_samples_per_second": 63.256,
826
+ "eval_steps_per_second": 0.548,
827
  "step": 413
828
  },
829
  {
830
  "epoch": 60.0,
831
+ "grad_norm": 27.795392990112305,
832
+ "learning_rate": 0.0001333333333333333,
833
+ "loss": 3.0187,
834
  "step": 420
835
  },
836
  {
837
  "epoch": 60.0,
838
+ "eval_accuracy": 0.7694974003466204,
839
+ "eval_loss": 0.9018464684486389,
840
+ "eval_runtime": 18.2513,
841
+ "eval_samples_per_second": 63.228,
842
+ "eval_steps_per_second": 0.548,
843
  "step": 420
844
  },
845
  {
846
  "epoch": 61.0,
847
+ "eval_accuracy": 0.7772963604852686,
848
+ "eval_loss": 0.8939943909645081,
849
+ "eval_runtime": 18.1365,
850
+ "eval_samples_per_second": 63.629,
851
+ "eval_steps_per_second": 0.551,
852
  "step": 427
853
  },
854
  {
855
  "epoch": 61.42857142857143,
856
+ "grad_norm": 10.215301513671875,
857
+ "learning_rate": 0.00012857142857142855,
858
+ "loss": 2.9316,
859
  "step": 430
860
  },
861
  {
862
  "epoch": 62.0,
863
+ "eval_accuracy": 0.7712305025996534,
864
+ "eval_loss": 0.8858510851860046,
865
+ "eval_runtime": 18.1655,
866
+ "eval_samples_per_second": 63.527,
867
+ "eval_steps_per_second": 0.55,
868
  "step": 434
869
  },
870
  {
871
  "epoch": 62.857142857142854,
872
+ "grad_norm": 5.105686187744141,
873
+ "learning_rate": 0.00012380952380952378,
874
+ "loss": 2.8746,
875
  "step": 440
876
  },
877
  {
878
  "epoch": 63.0,
879
+ "eval_accuracy": 0.7764298093587522,
880
+ "eval_loss": 0.8661392331123352,
881
+ "eval_runtime": 17.9626,
882
+ "eval_samples_per_second": 64.245,
883
+ "eval_steps_per_second": 0.557,
884
  "step": 441
885
  },
886
  {
887
  "epoch": 64.0,
888
+ "eval_accuracy": 0.7712305025996534,
889
+ "eval_loss": 0.8916440010070801,
890
+ "eval_runtime": 17.94,
891
+ "eval_samples_per_second": 64.326,
892
+ "eval_steps_per_second": 0.557,
893
  "step": 448
894
  },
895
  {
896
  "epoch": 64.28571428571429,
897
+ "grad_norm": 9.268267631530762,
898
+ "learning_rate": 0.00011904761904761903,
899
+ "loss": 2.817,
900
  "step": 450
901
  },
902
  {
903
  "epoch": 65.0,
904
+ "eval_accuracy": 0.7781629116117851,
905
+ "eval_loss": 0.8645418286323547,
906
+ "eval_runtime": 18.2441,
907
+ "eval_samples_per_second": 63.253,
908
+ "eval_steps_per_second": 0.548,
909
  "step": 455
910
  },
911
  {
912
  "epoch": 65.71428571428571,
913
+ "grad_norm": 6.703152179718018,
914
+ "learning_rate": 0.00011428571428571427,
915
+ "loss": 2.7593,
916
  "step": 460
917
  },
918
  {
919
  "epoch": 66.0,
920
+ "eval_accuracy": 0.7686308492201039,
921
+ "eval_loss": 0.8828719854354858,
922
+ "eval_runtime": 18.1608,
923
+ "eval_samples_per_second": 63.543,
924
  "eval_steps_per_second": 0.551,
925
  "step": 462
926
  },
927
  {
928
  "epoch": 67.0,
929
+ "eval_accuracy": 0.7790294627383015,
930
+ "eval_loss": 0.8883015513420105,
931
+ "eval_runtime": 18.1166,
932
+ "eval_samples_per_second": 63.698,
933
+ "eval_steps_per_second": 0.552,
934
  "step": 469
935
  },
936
  {
937
  "epoch": 67.14285714285714,
938
+ "grad_norm": 5.34393310546875,
939
+ "learning_rate": 0.0001095238095238095,
940
+ "loss": 2.9212,
941
  "step": 470
942
  },
943
  {
944
  "epoch": 68.0,
945
+ "eval_accuracy": 0.7824956672443674,
946
+ "eval_loss": 0.8507192134857178,
947
+ "eval_runtime": 18.0504,
948
+ "eval_samples_per_second": 63.932,
949
+ "eval_steps_per_second": 0.554,
950
  "step": 476
951
  },
952
  {
953
  "epoch": 68.57142857142857,
954
+ "grad_norm": 6.5966668128967285,
955
+ "learning_rate": 0.00010476190476190474,
956
+ "loss": 2.8659,
957
  "step": 480
958
  },
959
  {
960
  "epoch": 69.0,
961
+ "eval_accuracy": 0.7876949740034662,
962
+ "eval_loss": 0.8553578853607178,
963
+ "eval_runtime": 18.0681,
964
+ "eval_samples_per_second": 63.869,
965
+ "eval_steps_per_second": 0.553,
966
  "step": 483
967
  },
968
  {
969
  "epoch": 70.0,
970
+ "grad_norm": 22.730220794677734,
971
+ "learning_rate": 9.999999999999999e-05,
972
+ "loss": 2.9068,
973
  "step": 490
974
  },
975
  {
976
  "epoch": 70.0,
977
+ "eval_accuracy": 0.7764298093587522,
978
+ "eval_loss": 0.8812502026557922,
979
+ "eval_runtime": 17.9671,
980
+ "eval_samples_per_second": 64.229,
981
+ "eval_steps_per_second": 0.557,
982
  "step": 490
983
+ },
984
+ {
985
+ "epoch": 71.0,
986
+ "eval_accuracy": 0.7859618717504333,
987
+ "eval_loss": 0.8555229902267456,
988
+ "eval_runtime": 18.0711,
989
+ "eval_samples_per_second": 63.859,
990
+ "eval_steps_per_second": 0.553,
991
+ "step": 497
992
+ },
993
+ {
994
+ "epoch": 71.42857142857143,
995
+ "grad_norm": 5.773199558258057,
996
+ "learning_rate": 9.523809523809523e-05,
997
+ "loss": 2.8334,
998
+ "step": 500
999
+ },
1000
+ {
1001
+ "epoch": 72.0,
1002
+ "eval_accuracy": 0.7790294627383015,
1003
+ "eval_loss": 0.8665823340415955,
1004
+ "eval_runtime": 18.4819,
1005
+ "eval_samples_per_second": 62.439,
1006
+ "eval_steps_per_second": 0.541,
1007
+ "step": 504
1008
+ },
1009
+ {
1010
+ "epoch": 72.85714285714286,
1011
+ "grad_norm": 6.063803672790527,
1012
+ "learning_rate": 9.047619047619046e-05,
1013
+ "loss": 2.7322,
1014
+ "step": 510
1015
+ },
1016
+ {
1017
+ "epoch": 73.0,
1018
+ "eval_accuracy": 0.7824956672443674,
1019
+ "eval_loss": 0.8682228922843933,
1020
+ "eval_runtime": 18.1239,
1021
+ "eval_samples_per_second": 63.673,
1022
+ "eval_steps_per_second": 0.552,
1023
+ "step": 511
1024
+ },
1025
+ {
1026
+ "epoch": 74.0,
1027
+ "eval_accuracy": 0.7885615251299827,
1028
+ "eval_loss": 0.881618320941925,
1029
+ "eval_runtime": 17.8842,
1030
+ "eval_samples_per_second": 64.526,
1031
+ "eval_steps_per_second": 0.559,
1032
+ "step": 518
1033
+ },
1034
+ {
1035
+ "epoch": 74.28571428571429,
1036
+ "grad_norm": 5.207172870635986,
1037
+ "learning_rate": 8.57142857142857e-05,
1038
+ "loss": 2.8548,
1039
+ "step": 520
1040
+ },
1041
+ {
1042
+ "epoch": 75.0,
1043
+ "eval_accuracy": 0.7902946273830156,
1044
+ "eval_loss": 0.8523378968238831,
1045
+ "eval_runtime": 18.1134,
1046
+ "eval_samples_per_second": 63.71,
1047
+ "eval_steps_per_second": 0.552,
1048
+ "step": 525
1049
+ },
1050
+ {
1051
+ "epoch": 75.71428571428571,
1052
+ "grad_norm": 6.294586658477783,
1053
+ "learning_rate": 8.095238095238093e-05,
1054
+ "loss": 2.8696,
1055
+ "step": 530
1056
+ },
1057
+ {
1058
+ "epoch": 76.0,
1059
+ "eval_accuracy": 0.7894280762564991,
1060
+ "eval_loss": 0.8509147763252258,
1061
+ "eval_runtime": 18.182,
1062
+ "eval_samples_per_second": 63.469,
1063
+ "eval_steps_per_second": 0.55,
1064
+ "step": 532
1065
+ },
1066
+ {
1067
+ "epoch": 77.0,
1068
+ "eval_accuracy": 0.7807625649913345,
1069
+ "eval_loss": 0.8682960867881775,
1070
+ "eval_runtime": 18.3628,
1071
+ "eval_samples_per_second": 62.845,
1072
+ "eval_steps_per_second": 0.545,
1073
+ "step": 539
1074
+ },
1075
+ {
1076
+ "epoch": 77.14285714285714,
1077
+ "grad_norm": 5.558056831359863,
1078
+ "learning_rate": 7.619047619047618e-05,
1079
+ "loss": 2.6439,
1080
+ "step": 540
1081
+ },
1082
+ {
1083
+ "epoch": 78.0,
1084
+ "eval_accuracy": 0.7876949740034662,
1085
+ "eval_loss": 0.860653281211853,
1086
+ "eval_runtime": 18.2632,
1087
+ "eval_samples_per_second": 63.187,
1088
+ "eval_steps_per_second": 0.548,
1089
+ "step": 546
1090
+ },
1091
+ {
1092
+ "epoch": 78.57142857142857,
1093
+ "grad_norm": 5.7894415855407715,
1094
+ "learning_rate": 7.142857142857142e-05,
1095
+ "loss": 2.9039,
1096
+ "step": 550
1097
+ },
1098
+ {
1099
+ "epoch": 79.0,
1100
+ "eval_accuracy": 0.7842287694974004,
1101
+ "eval_loss": 0.8698387742042542,
1102
+ "eval_runtime": 18.1385,
1103
+ "eval_samples_per_second": 63.622,
1104
+ "eval_steps_per_second": 0.551,
1105
+ "step": 553
1106
+ },
1107
+ {
1108
+ "epoch": 80.0,
1109
+ "grad_norm": 28.787755966186523,
1110
+ "learning_rate": 6.666666666666666e-05,
1111
+ "loss": 2.6338,
1112
+ "step": 560
1113
+ },
1114
+ {
1115
+ "epoch": 80.0,
1116
+ "eval_accuracy": 0.7876949740034662,
1117
+ "eval_loss": 0.8718376755714417,
1118
+ "eval_runtime": 18.0357,
1119
+ "eval_samples_per_second": 63.984,
1120
+ "eval_steps_per_second": 0.554,
1121
+ "step": 560
1122
+ },
1123
+ {
1124
+ "epoch": 81.0,
1125
+ "eval_accuracy": 0.7902946273830156,
1126
+ "eval_loss": 0.8370843529701233,
1127
+ "eval_runtime": 18.1407,
1128
+ "eval_samples_per_second": 63.614,
1129
+ "eval_steps_per_second": 0.551,
1130
+ "step": 567
1131
+ },
1132
+ {
1133
+ "epoch": 81.42857142857143,
1134
+ "grad_norm": 6.290432929992676,
1135
+ "learning_rate": 6.190476190476189e-05,
1136
+ "loss": 2.7271,
1137
+ "step": 570
1138
+ },
1139
+ {
1140
+ "epoch": 82.0,
1141
+ "eval_accuracy": 0.792894280762565,
1142
+ "eval_loss": 0.8426641821861267,
1143
+ "eval_runtime": 17.8494,
1144
+ "eval_samples_per_second": 64.652,
1145
+ "eval_steps_per_second": 0.56,
1146
+ "step": 574
1147
+ },
1148
+ {
1149
+ "epoch": 82.85714285714286,
1150
+ "grad_norm": 4.4193525314331055,
1151
+ "learning_rate": 5.7142857142857135e-05,
1152
+ "loss": 2.7555,
1153
+ "step": 580
1154
+ },
1155
+ {
1156
+ "epoch": 83.0,
1157
+ "eval_accuracy": 0.7937608318890814,
1158
+ "eval_loss": 0.8621939420700073,
1159
+ "eval_runtime": 17.8242,
1160
+ "eval_samples_per_second": 64.743,
1161
+ "eval_steps_per_second": 0.561,
1162
+ "step": 581
1163
+ },
1164
+ {
1165
+ "epoch": 84.0,
1166
+ "eval_accuracy": 0.7859618717504333,
1167
+ "eval_loss": 0.8768612146377563,
1168
+ "eval_runtime": 17.9828,
1169
+ "eval_samples_per_second": 64.172,
1170
+ "eval_steps_per_second": 0.556,
1171
+ "step": 588
1172
+ },
1173
+ {
1174
+ "epoch": 84.28571428571429,
1175
+ "grad_norm": 5.777393341064453,
1176
+ "learning_rate": 5.238095238095237e-05,
1177
+ "loss": 2.7702,
1178
+ "step": 590
1179
+ },
1180
+ {
1181
+ "epoch": 85.0,
1182
+ "eval_accuracy": 0.7859618717504333,
1183
+ "eval_loss": 0.88438481092453,
1184
+ "eval_runtime": 17.8963,
1185
+ "eval_samples_per_second": 64.483,
1186
+ "eval_steps_per_second": 0.559,
1187
+ "step": 595
1188
+ },
1189
+ {
1190
+ "epoch": 85.71428571428571,
1191
+ "grad_norm": 5.748138904571533,
1192
+ "learning_rate": 4.7619047619047614e-05,
1193
+ "loss": 2.8678,
1194
+ "step": 600
1195
+ },
1196
+ {
1197
+ "epoch": 86.0,
1198
+ "eval_accuracy": 0.7824956672443674,
1199
+ "eval_loss": 0.8882182836532593,
1200
+ "eval_runtime": 17.8524,
1201
+ "eval_samples_per_second": 64.641,
1202
+ "eval_steps_per_second": 0.56,
1203
+ "step": 602
1204
+ },
1205
+ {
1206
+ "epoch": 87.0,
1207
+ "eval_accuracy": 0.7824956672443674,
1208
+ "eval_loss": 0.8715818524360657,
1209
+ "eval_runtime": 17.8328,
1210
+ "eval_samples_per_second": 64.712,
1211
+ "eval_steps_per_second": 0.561,
1212
+ "step": 609
1213
+ },
1214
+ {
1215
+ "epoch": 87.14285714285714,
1216
+ "grad_norm": 4.612086772918701,
1217
+ "learning_rate": 4.285714285714285e-05,
1218
+ "loss": 2.6334,
1219
+ "step": 610
1220
+ },
1221
+ {
1222
+ "epoch": 88.0,
1223
+ "eval_accuracy": 0.7781629116117851,
1224
+ "eval_loss": 0.8782148361206055,
1225
+ "eval_runtime": 17.9213,
1226
+ "eval_samples_per_second": 64.393,
1227
+ "eval_steps_per_second": 0.558,
1228
+ "step": 616
1229
+ },
1230
+ {
1231
+ "epoch": 88.57142857142857,
1232
+ "grad_norm": 6.36035680770874,
1233
+ "learning_rate": 3.809523809523809e-05,
1234
+ "loss": 2.7782,
1235
+ "step": 620
1236
+ },
1237
+ {
1238
+ "epoch": 89.0,
1239
+ "eval_accuracy": 0.7807625649913345,
1240
+ "eval_loss": 0.8752433657646179,
1241
+ "eval_runtime": 18.042,
1242
+ "eval_samples_per_second": 63.962,
1243
+ "eval_steps_per_second": 0.554,
1244
+ "step": 623
1245
+ },
1246
+ {
1247
+ "epoch": 90.0,
1248
+ "grad_norm": 6.581643581390381,
1249
+ "learning_rate": 3.333333333333333e-05,
1250
+ "loss": 2.5527,
1251
+ "step": 630
1252
+ },
1253
+ {
1254
+ "epoch": 90.0,
1255
+ "eval_accuracy": 0.7807625649913345,
1256
+ "eval_loss": 0.8674911856651306,
1257
+ "eval_runtime": 17.811,
1258
+ "eval_samples_per_second": 64.791,
1259
+ "eval_steps_per_second": 0.561,
1260
+ "step": 630
1261
+ },
1262
+ {
1263
+ "epoch": 91.0,
1264
+ "eval_accuracy": 0.7842287694974004,
1265
+ "eval_loss": 0.8734576106071472,
1266
+ "eval_runtime": 17.906,
1267
+ "eval_samples_per_second": 64.448,
1268
+ "eval_steps_per_second": 0.558,
1269
+ "step": 637
1270
+ },
1271
+ {
1272
+ "epoch": 91.42857142857143,
1273
+ "grad_norm": 6.266481399536133,
1274
+ "learning_rate": 2.8571428571428567e-05,
1275
+ "loss": 2.6812,
1276
+ "step": 640
1277
+ },
1278
+ {
1279
+ "epoch": 92.0,
1280
+ "eval_accuracy": 0.7885615251299827,
1281
+ "eval_loss": 0.8649889826774597,
1282
+ "eval_runtime": 18.1196,
1283
+ "eval_samples_per_second": 63.688,
1284
+ "eval_steps_per_second": 0.552,
1285
+ "step": 644
1286
+ },
1287
+ {
1288
+ "epoch": 92.85714285714286,
1289
+ "grad_norm": 5.178635597229004,
1290
+ "learning_rate": 2.3809523809523807e-05,
1291
+ "loss": 2.6167,
1292
+ "step": 650
1293
+ },
1294
+ {
1295
+ "epoch": 93.0,
1296
+ "eval_accuracy": 0.7946273830155979,
1297
+ "eval_loss": 0.8530935049057007,
1298
+ "eval_runtime": 17.8992,
1299
+ "eval_samples_per_second": 64.472,
1300
+ "eval_steps_per_second": 0.559,
1301
+ "step": 651
1302
+ },
1303
+ {
1304
+ "epoch": 94.0,
1305
+ "eval_accuracy": 0.7868284228769498,
1306
+ "eval_loss": 0.8698766827583313,
1307
+ "eval_runtime": 17.9684,
1308
+ "eval_samples_per_second": 64.224,
1309
+ "eval_steps_per_second": 0.557,
1310
+ "step": 658
1311
+ },
1312
+ {
1313
+ "epoch": 94.28571428571429,
1314
+ "grad_norm": 4.488171100616455,
1315
+ "learning_rate": 1.9047619047619046e-05,
1316
+ "loss": 2.6553,
1317
+ "step": 660
1318
+ },
1319
+ {
1320
+ "epoch": 95.0,
1321
+ "eval_accuracy": 0.7894280762564991,
1322
+ "eval_loss": 0.8666642308235168,
1323
+ "eval_runtime": 17.9669,
1324
+ "eval_samples_per_second": 64.229,
1325
+ "eval_steps_per_second": 0.557,
1326
+ "step": 665
1327
+ },
1328
+ {
1329
+ "epoch": 95.71428571428571,
1330
+ "grad_norm": 6.009092330932617,
1331
+ "learning_rate": 1.4285714285714284e-05,
1332
+ "loss": 2.7758,
1333
+ "step": 670
1334
+ },
1335
+ {
1336
+ "epoch": 96.0,
1337
+ "eval_accuracy": 0.7920277296360485,
1338
+ "eval_loss": 0.8650416731834412,
1339
+ "eval_runtime": 18.0841,
1340
+ "eval_samples_per_second": 63.813,
1341
+ "eval_steps_per_second": 0.553,
1342
+ "step": 672
1343
+ },
1344
+ {
1345
+ "epoch": 97.0,
1346
+ "eval_accuracy": 0.7902946273830156,
1347
+ "eval_loss": 0.8684815764427185,
1348
+ "eval_runtime": 17.8482,
1349
+ "eval_samples_per_second": 64.656,
1350
+ "eval_steps_per_second": 0.56,
1351
+ "step": 679
1352
+ },
1353
+ {
1354
+ "epoch": 97.14285714285714,
1355
+ "grad_norm": 5.19600772857666,
1356
+ "learning_rate": 9.523809523809523e-06,
1357
+ "loss": 2.6592,
1358
+ "step": 680
1359
+ },
1360
+ {
1361
+ "epoch": 98.0,
1362
+ "eval_accuracy": 0.7885615251299827,
1363
+ "eval_loss": 0.8592236042022705,
1364
+ "eval_runtime": 17.9065,
1365
+ "eval_samples_per_second": 64.446,
1366
+ "eval_steps_per_second": 0.558,
1367
+ "step": 686
1368
+ },
1369
+ {
1370
+ "epoch": 98.57142857142857,
1371
+ "grad_norm": 5.676305770874023,
1372
+ "learning_rate": 4.7619047619047615e-06,
1373
+ "loss": 2.5202,
1374
+ "step": 690
1375
+ },
1376
+ {
1377
+ "epoch": 99.0,
1378
+ "eval_accuracy": 0.7894280762564991,
1379
+ "eval_loss": 0.8744557499885559,
1380
+ "eval_runtime": 17.8619,
1381
+ "eval_samples_per_second": 64.607,
1382
+ "eval_steps_per_second": 0.56,
1383
+ "step": 693
1384
+ },
1385
+ {
1386
+ "epoch": 100.0,
1387
+ "grad_norm": 48.86530685424805,
1388
+ "learning_rate": 0.0,
1389
+ "loss": 2.6577,
1390
+ "step": 700
1391
+ },
1392
+ {
1393
+ "epoch": 100.0,
1394
+ "eval_accuracy": 0.7954939341421143,
1395
+ "eval_loss": 0.8635059595108032,
1396
+ "eval_runtime": 18.1084,
1397
+ "eval_samples_per_second": 63.727,
1398
+ "eval_steps_per_second": 0.552,
1399
+ "step": 700
1400
+ },
1401
+ {
1402
+ "epoch": 100.0,
1403
+ "step": 700,
1404
+ "total_flos": 6.134894724962304e+18,
1405
+ "train_loss": 3.4289448138645717,
1406
+ "train_runtime": 8283.6476,
1407
+ "train_samples_per_second": 41.793,
1408
+ "train_steps_per_second": 0.085
1409
  }
1410
  ],
1411
  "logging_steps": 10,
 
1420
  "should_evaluate": false,
1421
  "should_log": false,
1422
  "should_save": true,
1423
+ "should_training_stop": true
1424
  },
1425
  "attributes": {}
1426
  }
1427
  },
1428
+ "total_flos": 6.134894724962304e+18,
1429
  "train_batch_size": 128,
1430
  "trial_name": null,
1431
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73530e41251800b972013692d5a7eb3224f0171bc366ff6fe2f9cb2946d17136
3
  size 5368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b45f2a43a993f893b4fbd6c9537f41343150eb53527e66d53527833b633d402
3
  size 5368